Setting up display

In [1]:
from IPython.display import HTML, display
import time

In [15]:
def progress(n):
    """Return an IPython ``HTML`` object reporting the collected company count.

    ``n`` is interpolated into the message with ``str.format``.
    """
    message_template = """ Collected company count: {num}"""
    return HTML(message_template.format(num=n))

Get List of Companies

In [18]:
def getLinks(url):
    """Download one listing page and return the companies linked from its table.

    The page is expected to contain a ``<table class="tb10Table">`` whose cells
    hold anchors pointing at ``.../stocks/<slug>``.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.

    Returns
    -------
    tuple[list, list]
        ``(name, links)``: the anchor texts and, for each anchor, the part of
        its ``href`` that follows ``/stocks/``. Both lists are always the same
        length. They are empty when the page could not be retrieved or holds
        no matching table, so callers can keep accumulating results.
    """
    import requests
    from bs4 import BeautifulSoup

    name = []
    links = []

    # A timeout keeps one stalled request from hanging a multi-page crawl.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to retrieve the web page. Status code:", response.status_code)
        return name, links

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tb10Table')
    if table is None:
        print("No 'tb10Table' table found on page:", url)
        return name, links

    for row in table.find_all('tr'):
        for col in row.find_all('td'):
            link = col.find('a')
            if link is None:
                continue
            href = link.get('href') or ''
            # Skip anchors that do not point at a stock page so that the two
            # output lists stay aligned.
            if '/stocks/' not in href:
                continue
            name.append(link.text)
            links.append(href.split('/stocks/')[1])

    return name, links

In [20]:
# Live counter that is updated in place while the listing pages are crawled.
out = display(progress(0), display_id=True)

names, links = [], []
for page in range(1, 289):
    # The first page is served from the bare filter URL; every later page
    # needs the full query string with its page number.
    url = (
        "https://groww.in/stocks/filter"
        if page == 1
        else 'https://groww.in/stocks/filter?closePriceHigh=100000&closePriceLow=0&marketCapHigh=2000000&marketCapLow=0&page='+str(page)+'&size=15&sortType=ASC'
    )

    name, link = getLinks(url)
    names.extend(name)
    links.extend(link)

    out.update(progress(len(names)))

print("Collected a total of", len(links), "company names and corresponding page links.")

Collected a total of 4314 company names and corresponding page links.


Example of collected company names and links

In [21]:
# Show the first five collected (name, link slug) pairs under a header line.
print('Company Name', '\t\t\t', 'Link')
for row_index in range(5):
    company, slug = names[row_index], links[row_index]
    print(company, '\t\t\t', slug)

Company Name 			 Link
Reliance Industries 			 reliance-industries-ltd
TCS 			 tata-consultancy-services-ltd
HDFC Bank 			 hdfc-bank-ltd
ICICI Bank 			 icici-bank-ltd
Infosys 			 infosys-ltd


Storing collected information

In [23]:
import csv

# Persist the collected names and link slugs. The context manager closes the
# file even if a write fails part-way through.
with open('company_names_&_page_links.csv', 'w', newline='', encoding='utf-8') as file:
    wr = csv.writer(file)
    wr.writerow(['Company Names', 'Page Link Abbr'])
    # names and links are filled in lock-step, so pairing them yields one row
    # per company.
    wr.writerows(zip(names, links))

Import from CSV

In [2]:
import csv

# Reload the company list saved earlier. The file was written as utf-8 with
# newline='', so it is read back the same way; relying on the platform default
# encoding can mis-decode non-ASCII company names.
names = []
links = []

with open('company_names_&_page_links.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row
    for record in reader:
        names.append(record[0])
        links.append(record[1])

print(len(names), len(links))

4314 4314


SCRAPING TABLES

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from IPython.display import HTML, display
import time

In [4]:
def scrape_table(table_html):
    """Convert the HTML of a table into a list of rows of stripped cell text.

    The first entry of the result is the list of ``thead th`` texts (an empty
    list when there are none); each following entry holds the ``td`` texts of
    one ``tbody tr`` row, in document order.
    """
    parsed = BeautifulSoup(table_html, 'html.parser')

    header_cells = [th.text.strip() for th in parsed.select('thead th')]
    body_rows = [
        [td.text.strip() for td in tr.select('td')]
        for tr in parsed.select('tbody tr')
    ]

    return [header_cells, *body_rows]

In [5]:
def _scrape_statement_tab(driver, tab_label):
    """Click the tab whose <div> text equals tab_label and scrape the table shown afterwards."""
    tab_locator = (By.XPATH, "//div[text()='" + tab_label + "']")
    tab_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tab_locator))
    tab_button.click()
    # Fixed pause kept from the original flow: an element with the table class
    # is already present before the click, so the presence wait below cannot by
    # itself tell whether the new statement has replaced the old one.
    time.sleep(1)
    table_locator = (By.CLASS_NAME, 'tb10Table')
    table = WebDriverWait(driver, 10).until(EC.presence_of_element_located(table_locator))
    return scrape_table(table.get_attribute('outerHTML'))


def getTable(link):
    """Scrape the three financial-statement tables of one company.

    Opens the company's cash-flow page in a fresh Chrome session, reads the
    table shown there, then switches to the 'Balance Sheet' and
    'Income Statement' tabs and reads the table displayed after each click.

    Parameters
    ----------
    link : str
        Company slug placed after ``https://groww.in/stocks/`` in the URL.

    Returns
    -------
    tuple
        ``(Is, Bs, Cf)``: income statement, balance sheet and cash flow, each
        in the list-of-rows form produced by ``scrape_table``.

    Raises
    ------
    RuntimeError
        If the page holds no ``<table class="tb10Table">``.
    Selenium wait timeouts propagate when an expected element never appears.
    """
    url = "https://groww.in/stocks/" + link +"/company-financial/cash-flow"
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Wait for the table to be rendered before taking the page source;
        # otherwise the lookup below can run before the table exists.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'tb10Table'))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        first_table = soup.find('table', class_='tb10Table')
        if first_table is None:
            raise RuntimeError("No 'tb10Table' table found at " + url)
        Cf = scrape_table(first_table.prettify())

        Bs = _scrape_statement_tab(driver, 'Balance Sheet')
        Is = _scrape_statement_tab(driver, 'Income Statement')
    finally:
        # Always shut the browser down, even when a step above fails, so a
        # long crawl does not accumulate orphaned Chrome processes.
        driver.quit()

    return Is, Bs, Cf

In [6]:
def progress2(n1, n2):
    """Return an IPython ``HTML`` object reporting the company and table counters.

    ``n1`` and ``n2`` are interpolated into the message with ``str.format``.
    """
    message_template = """Processed company number: {num1} and table number: {num2}"""
    return HTML(message_template.format(num1=n1, num2=n2))

In [14]:
# Index into names/links at which the table-scraping loop begins; the loop
# below iterates over range(start, len(names)).
start = 503

In [15]:
types = ['income-statement', 'balance-sheet', 'cash-flow']

import xml.etree.ElementTree as ET


def _build_table_element(company_name, link, typename, table):
    """Return a <table> element holding metadata, header and rows for one statement."""
    tb = ET.Element('table')
    md = ET.SubElement(tb, 'metadata')
    ET.SubElement(md, 'CompanyName').text = company_name
    ET.SubElement(md, 'Source').text = "https://groww.in/stocks/"+link+"/company-financial/"+typename
    ET.SubElement(md, 'Type').text = typename

    # Cells are serialised as 'a | b | ': every cell is followed by ' | '.
    ET.SubElement(tb, 'header').text = ''.join(cell + ' | ' for cell in table[0])
    for cells in table[1:]:
        ET.SubElement(tb, 'row').text = ''.join(cell + ' | ' for cell in cells)
    return tb


doc = ET.Element('document')

out = display(progress2(0, 0), display_id=True)

# First company index that has NOT been fully added to doc yet.
resume_index = start

try:
    for i in range(start, len(names)):
        resume_index = i
        name = names[i]
        link = links[i]
        Is, Bs, Cf = getTable(link)

        # Build all three tables before attaching any of them, so a company is
        # either saved completely or not at all.
        company_tables = [
            _build_table_element(name, link, typename, table)
            for typename, table in zip(types, (Is, Bs, Cf))
        ]
        for j, tb in enumerate(company_tables):
            doc.append(tb)
            out.update(progress2(i+1, 3*i + j+1))

        resume_index = i + 1
except (Exception, KeyboardInterrupt) as exc:
    # Best effort: keep what was scraped so far, but report what went wrong
    # and where to resume instead of swallowing the error silently.
    print("Scraping stopped:", repr(exc))
    print("Set start =", resume_index, "to resume.")
finally:
    # Write exactly once, on success and on failure alike.
    # NOTE(review): the file is opened in append mode and every run adds
    # another <document> root, so tables.xml holds several concatenated
    # documents after more than one run - confirm downstream readers expect that.
    if len(doc):
        with open("tables.xml", "ab") as file:
            file.write(ET.tostring(doc))
