In [5]:
from bs4 import BeautifulSoup
import requests
session = requests.session()
import json

In [6]:
# Upper bound on the scrape attempts made by MinistryCorporateAffairs.get_output,
# which iterates over range(RETRY_COUNT).
RETRY_COUNT = 1

In [7]:
class MinistryCorporateAffairs:
    """Scrape the LLP master-data record of one company from the MCA portal.

    The flow implemented by ``get_output`` is:

    1. load the master-data page,
    2. download the captcha image that page references,
    3. OCR the captcha with pytesseract,
    4. POST the company id together with the OCR text,
    5. turn the ``resultTab3`` table of the reply into a JSON string.

    All HTTP traffic of one attempt goes through ``self.session`` so that the
    cookies set by the first page are sent with the captcha and data requests.
    """

    BASE_URL = "https://www.mca.gov.in"
    HOME_URL = "https://www.mca.gov.in/mcafoportal/viewCompanyMasterData.do"
    DATA_URL = "https://www.mca.gov.in/mcafoportal/companyLLPMasterData.do"
    USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"

    # Default location of the downloaded captcha image; pass ``captcha_path``
    # to the constructor to store it somewhere else.
    CAPTCHA_PATH = '/home/nagaraj/Desktop/Nag/NK/Web-Crawl/Captchas/mcaCaptcha.jpeg'

    # Seconds to wait for the server before a request is abandoned.
    TIMEOUT = 30

    # (label shown in the result table, key used in the returned JSON)
    FIELD_MAP = (
        ('LLPIN', 'LLPIN'),
        ('LLP Name', 'LLP_Name'),
        ('Number of Partners', 'Number_of_Partners'),
        ('Number of Designated Partners', 'Number_of_Designated_Partners'),
        ('ROC Code', 'ROC_Code'),
        ('Date of Incorporation', 'Date_of_Incorporation'),
        ('Registered Address', 'Registered_Address'),
        ('Email Id', 'Email_Id'),
        ('Previous firm/ company details, if applicable', 'Previous_firm'),
        ('Total Obligation of Contribution', 'Total_Obligation_of_Contribution'),
        ('Main division of business activity to be carried out in India', 'Main_division'),
        ('Description of main division', 'Description_of_main_division'),
        ('Date of last financial year end date for which Statement of Accounts and Solvency filed', 'Solvency_filed'),
        ('Date of last financial year end date for which Annual Return filed', 'Annual_Return_filed'),
        ('LLP Status', 'LLP_Status'),
    )

    def __init__(self, company_id, captcha_path=None):
        """Remember the company to look up.

        Args:
            company_id: identifier posted as ``companyID`` (e.g. an LLPIN).
            captcha_path: optional file path for the downloaded captcha image;
                defaults to ``CAPTCHA_PATH``.
        """
        self.company_id = company_id
        self.user_Agent = None
        self.session = None
        self.captcha_path = captcha_path if captcha_path is not None else self.CAPTCHA_PATH

    def _new_session(self):
        """Create a fresh HTTP session (its own cookie jar)."""
        return requests.session()

    def _active_session(self):
        """Return the current session, creating one on first use."""
        if self.session is None:
            self.session = self._new_session()
        return self.session

    def home_page(self):
        """GET the master-data page.

        Returns:
            The response object when the status code is 200, otherwise the
            string ``"Response Error"``.
        """
        headers = {"user-agent": self.USER_AGENT}
        response = self._active_session().get(
            self.HOME_URL, headers=headers, timeout=self.TIMEOUT
        )
        if response.status_code == 200:
            return response
        return "Response Error"

    def parse_with_lxml(self, home_page_response):
        """Parse the ``.text`` of a response with BeautifulSoup's lxml parser."""
        return BeautifulSoup(home_page_response.text, 'lxml')

    def get_captcha_image(self, soup):
        """Download the captcha image referenced by the parsed home page.

        Args:
            soup: parsed master-data page.

        Returns:
            The path of the file the image bytes were written to.

        Raises:
            ValueError: the page has no ``<img id="captcha">`` with a ``src``.
            RuntimeError: the image request did not return status 200.
        """
        img_tag = soup.find('img', attrs={"id": "captcha"})
        link = img_tag.get('src') if img_tag is not None else None
        if not link:
            raise ValueError("captcha image not found on the master-data page")

        # the src attribute is site-relative, so prefix the host
        url_link = self.BASE_URL + link
        headers = {
            "user-agent": self.USER_AGENT,
            "referer": self.HOME_URL,
        }
        captcha_image = self._active_session().get(
            url_link, headers=headers, timeout=self.TIMEOUT
        )
        if captcha_image.status_code != 200:
            raise RuntimeError(
                "captcha download failed with status %s" % captcha_image.status_code
            )

        with open(self.captcha_path, 'wb') as f:
            f.write(captcha_image.content)
        return self.captcha_path

    def get_text_from_image(self, PATH):
        """OCR the captcha image stored at ``PATH``.

        The image is converted to grayscale and dilated once with a 2x2
        rectangular kernel before being handed to pytesseract.

        Returns:
            All recognised words concatenated without separators.

        Raises:
            ValueError: the image file could not be read.
        """
        # imported here so the rest of the class works without the OCR stack
        import cv2
        from pytesseract import pytesseract
        from pytesseract import Output

        img = cv2.imread(PATH)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError("could not read captcha image at %r" % (PATH,))

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        thresh = cv2.dilate(gray, rect_kernel, iterations=1)
        image_data = pytesseract.image_to_data(thresh, output_type=Output.DICT)
        return "".join(word for word in image_data['text'] if word != '')

    def fetch_data(self, company_id, captcha_text):
        """POST the company id and captcha text; return the raw response."""
        headers = {
            "user-agent": self.USER_AGENT,
            "referer": self.HOME_URL,
            "origin": self.BASE_URL,
            "content-type": "application/x-www-form-urlencoded",
        }
        payload = {
            "companyName": "",
            "companyID": company_id,
            "displayCaptcha": "true",
            "userEnteredCaptcha": captcha_text,
        }
        return self._active_session().post(
            self.DATA_URL, headers=headers, data=payload, timeout=self.TIMEOUT
        )

    def extract_data(self, soup):
        """Convert the ``resultTab3`` table into a JSON string.

        Each row is read as ``<td>label</td><td>value</td>``; rows with fewer
        than two cells are ignored. Labels are renamed according to
        ``FIELD_MAP`` and labels missing from the page map to ``""``.

        Raises:
            ValueError: the page contains no ``resultTab3`` table.
        """
        table = soup.find('table', attrs={'id': 'resultTab3'})
        if table is None:
            raise ValueError(
                "result table 'resultTab3' not found in the response "
                "(the captcha text may have been rejected)"
            )

        data = {}
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue  # not a label/value row
            data[tds[0].get_text().strip()] = tds[1].get_text().strip()

        result = {new_key: data.get(old_key, "") for old_key, new_key in self.FIELD_MAP}
        return json.dumps(result, indent=4)

    def get_output(self, retry_count=None):
        """Run the whole scrape, retrying failed attempts.

        Every attempt starts with a fresh session. An attempt fails when the
        home page is not served with status 200, when the captcha cannot be
        fetched or read, when the result table is missing, or when an
        ``OSError`` is raised (the base class of the ``requests`` exceptions).

        Args:
            retry_count: maximum number of attempts; defaults to the
                module-level ``RETRY_COUNT``.

        Returns:
            The JSON string built by ``extract_data``, or ``None`` when the
            number of attempts is zero.

        Raises:
            The error of the last attempt when every attempt failed.
        """
        attempts = RETRY_COUNT if retry_count is None else retry_count
        last_error = None

        for retry_num in range(attempts):
            # new cookie jar for every attempt
            self.session = self._new_session()
            try:
                home_page_response = self.home_page()
                if isinstance(home_page_response, str):
                    # home_page reports a non-200 reply with a sentinel string
                    raise RuntimeError("home page request failed: " + home_page_response)

                parsed_data = self.parse_with_lxml(home_page_response)
                captcha_response_path = self.get_captcha_image(parsed_data)
                captcha_text = self.get_text_from_image(captcha_response_path)
                second_response = self.fetch_data(self.company_id, captcha_text)
                second_soup = self.parse_with_lxml(second_response)
                return self.extract_data(second_soup)
            except (RuntimeError, ValueError, OSError) as exc:
                last_error = exc

        if last_error is not None:
            raise last_error
        return None
            

In [8]:
if __name__ == '__main__':
    # Look up one sample company id and print the JSON text that comes back.
    scraper = MinistryCorporateAffairs("AAA-1234")
    print(scraper.get_output())

{
    "LLPIN": "AAA-1234",
    "LLP_Name": "STRIDE INDUSTRIES LIMITED LIABILITY PART NERSHIP",
    "Number_of_Partners": "1",
    "Number_of_Designated_Partners": "2",
    "ROC_Code": "RoC-Mumbai",
    "Date_of_Incorporation": "28/04/2010",
    "Registered_Address": "GALA NO. 11, AGARWAL INDUSTRIAL ESTATE, WALIV NA VASAI Thane Maharashtra 401208",
    "Email_Id": "strideindustries.exports@gmail.com",
    "Previous_firm": "",
    "Total_Obligation_of_Contribution": "500000",
    "Main_division": "36",
    "Description_of_main_division": "",
    "Solvency_filed": "31/03/2022",
    "Annual_Return_filed": "31/03/2022",
    "LLP_Status": "Active"
}
