<a href="https://colab.research.google.com/github/mratanusarkar/Web-Scraping-tickertapeIN/blob/main/Notebooks/scraping_tickertapeIN_stockNames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping all stock names from tickertape.in

**Input**: None <br>
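**Output**: 2 JSON files (`top-company-list.json` and `full-company-list.json`), each containing a list of records (`name`, `type`, `subdirectory`) for the "Top" stock names and "All" stock names on "https://www.tickertape.in/stocks"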

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import json
import time
from datetime import timedelta

## Request and Fetch the Webpage (for one sample page)

In [2]:
# hit "https://www.tickertape.in/stocks?filter=<filter-value>"
requests.get("https://www.tickertape.in/stocks?filter=a")

<Response [200]>

In [3]:
# wow! no restriction for bots! no need of any headers!
response = requests.get("https://www.tickertape.in/stocks?filter=a")
response.text[0:500]

'<!DOCTYPE html><html lang="en-US"><head><meta http-equiv="X-UA-Compatible" content="IE=edge"/><link rel="shortcut icon" href="/favicon/favicon.png"/><link rel="apple-touch-icon" href="/favicon/favicon-192x192.png"/><link rel="manifest" href="/manifest/manifest.json"/><style type="text/css">:root {--white: #ffffff; --font_primary: #535B62; --font_dark: #2f363f; --font_light: #81878c; --font_blue: #0088ea; --font_lighter: #a2a8ae; --brand_primary: #151e28; --brand_success: #28c39a; --brand_danger:'

In [4]:
# not required for this webpage, use if bot restrictions are added in future.

# google chrome browser's request header (to make it look like, we are making this request from a browser)
header = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

# hit using the header
response = requests.get("https://www.tickertape.in/stocks?filter=a", headers=header)
response.text[0:500]

'<!DOCTYPE html><html lang="en-US"><head><meta http-equiv="X-UA-Compatible" content="IE=edge"/><link rel="shortcut icon" href="/favicon/favicon.png"/><link rel="apple-touch-icon" href="/favicon/favicon-192x192.png"/><link rel="manifest" href="/manifest/manifest.json"/><style type="text/css">:root {--white: #ffffff; --font_primary: #535B62; --font_dark: #2f363f; --font_light: #81878c; --font_blue: #0088ea; --font_lighter: #a2a8ae; --brand_primary: #151e28; --brand_success: #28c39a; --brand_danger:'

## Pass the fetched webpage response to Beautiful Soup

In [5]:
# give the webpage to Beautiful Soup using parsers: "html.parser" or "lxml"
soup = BeautifulSoup(response.text, 'lxml')

## Let us try and extract data (from one sample page)

- extract one company name from the page
- extract all the company names from the page

### Extracting one company name

In [6]:
# company name
htmlBlock = soup.find("li")
print(htmlBlock)

<li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-and-m-febcon-AMF">A &amp; M Febcon Ltd</a></li>


In [7]:
htmlBlock.a['href'].split('/')[2]

'a-and-m-febcon-AMF'

In [8]:
htmlBlock.a.text

'A & M Febcon Ltd'

### Extracting all the company names

In [9]:
# find all li
htmlBlock = soup.find_all("li")
# print(htmlBlock)

In [10]:
# let's see one of the li
htmlBlock[0]

<li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-and-m-febcon-AMF">A &amp; M Febcon Ltd</a></li>

In [11]:
# let's extract the company name
htmlBlock[0].a.text

'A & M Febcon Ltd'

In [12]:
# let's extract the link or path to the url subdirectory
htmlBlock[0].a['href'].split('/')[2]

'a-and-m-febcon-AMF'

In [13]:
# collect the href of the first link inside every li
fullList = [li.a['href'] for li in htmlBlock]
fullList[:9]

['/stocks/a-and-m-febcon-AMF',
 '/stocks/ab-cotspin-india-ABCO',
 '/stocks/a-b-infrabuild-ABIN',
 '/stocks/a-f-enterprises-AFE',
 '/stocks/a-infrastructure-AIN',
 '/stocks/akcapital-services-AKC',
 '/stocks/a-and-m-jumbo-bags-AMJU',
 '/stocks/a-1-acid-AAL',
 '/stocks/a2z-infra-engineering-A2ZI']

In [14]:
# the list holds other links too, so keep only the stock links
# and reduce each one to its last path segment (the slug)
stocksList = [href.split('/')[2] for href in fullList if "stocks" in href]
print(len(stocksList))
stocksList[:10]

421


['a-and-m-febcon-AMF',
 'ab-cotspin-india-ABCO',
 'a-b-infrabuild-ABIN',
 'a-f-enterprises-AFE',
 'a-infrastructure-AIN',
 'akcapital-services-AKC',
 'a-and-m-jumbo-bags-AMJU',
 'a-1-acid-AAL',
 'a2z-infra-engineering-A2ZI',
 'aa-plus-tradelink-AAP']

In [15]:
# similarly, keep only the etf links and reduce each one to its slug
etfsList = [href.split('/')[2] for href in fullList if "etfs" in href]
print(len(etfsList))
etfsList[:10]

15


['aditya-bsl-gold-etf-AITY',
 'aditya-bsl-nifty-50-etf-ADIY',
 'aditya-bsl-sensex-30-etf-BSL',
 'aditya-birla-sun-life-nifty-healthcare-etf-HEALT',
 'aditya-birla-sun-life-nifty-bank-etf-ADIL',
 'aditya-birla-sun-life-nifty-it-etf-TECT',
 'aditya-birla-sun-life-nifty-next-50-etf-ADIB',
 'aditya-birla-sun-life-silver-etf-SILVR',
 'axis-aaa-bond-plus-sdl-etf-2026-matur-reg-growth-AXISB',
 'axis-banking-etf-AXIS']

In [16]:
# both conditions combined: every link that points to a stock or an etf
[href for href in fullList if "etfs" in href or "stocks" in href][:10]

['/stocks/a-and-m-febcon-AMF',
 '/stocks/ab-cotspin-india-ABCO',
 '/stocks/a-b-infrabuild-ABIN',
 '/stocks/a-f-enterprises-AFE',
 '/stocks/a-infrastructure-AIN',
 '/stocks/akcapital-services-AKC',
 '/stocks/a-and-m-jumbo-bags-AMJU',
 '/stocks/a-1-acid-AAL',
 '/stocks/a2z-infra-engineering-A2ZI',
 '/stocks/aa-plus-tradelink-AAP']

In [17]:
# filter the li elements themselves (not just their hrefs), so that a single list of
# json/dictionary records with a type can be built instead of two separate lists of stocks & etfs
filteredHtmlBlock = [
    li for li in htmlBlock
    if "etfs" in li.a['href'] or "stocks" in li.a['href']
]
# filteredHtmlBlock[0:10]

In [18]:
# and... done! one record per li: link text, plus the two path segments of its href
data = [
    {
        "name": li.a.text,
        "type": li.a['href'].split('/')[1],
        "subdirectory": li.a['href'].split('/')[2],
    }
    for li in filteredHtmlBlock
]
print(len(data))
data[:5]

436


[{'name': 'A & M Febcon Ltd',
  'subdirectory': 'a-and-m-febcon-AMF',
  'type': 'stocks'},
 {'name': 'A B Cotspin India Ltd',
  'subdirectory': 'ab-cotspin-india-ABCO',
  'type': 'stocks'},
 {'name': 'A B Infrabuild Ltd',
  'subdirectory': 'a-b-infrabuild-ABIN',
  'type': 'stocks'},
 {'name': 'A F Enterprises Ltd',
  'subdirectory': 'a-f-enterprises-AFE',
  'type': 'stocks'},
 {'name': 'A Infrastructure Ltd',
  'subdirectory': 'a-infrastructure-AIN',
  'type': 'stocks'}]

### avoid bugs with non-functional approach

In [19]:
# since the map & filter approach is throwing errors and bugs on function call,
# let's move away from this functional programming style and write our own custom functions!

In [20]:
htmlBlock[0:5]

[<li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-and-m-febcon-AMF">A &amp; M Febcon Ltd</a></li>,
 <li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/ab-cotspin-india-ABCO">A B Cotspin India Ltd</a></li>,
 <li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-b-infrabuild-ABIN">A B Infrabuild Ltd</a></li>,
 <li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-f-enterprises-AFE">A F Enterprises Ltd</a></li>,
 <li class="jsx-1528870203"><a class="jsx-1528870203" href="/stocks/a-infrastructure-AIN">A Infrastructure Ltd</a></li>]

In [21]:
# custom filter function
def filter_data_list_fn(listBlock):
    """Tell whether an ``<li>`` element links to a stock or ETF page.

    Args:
        listBlock: a parsed ``<li>`` element whose first ``<a>`` child is
            reachable as ``listBlock.a`` (``None`` when the item has no link).

    Returns:
        bool: True when the link's ``href`` contains "etfs" or "stocks" and
        has at least three ``/``-separated parts; False otherwise, including
        when the element has no link or the link has no ``href``.
    """
    anchor = listBlock.a
    # an <li> without a link, or a link without an href, cannot be a listing entry
    href = anchor.get('href') if anchor is not None else None
    if not href:
        return False
    # the mapping step indexes parts [1] and [2] of the split href, so reject
    # hrefs that are too short (e.g. "/stocks?filter=a") instead of crashing later
    return ("etfs" in href or "stocks" in href) and len(href.split('/')) > 2

In [22]:
# function to apply filter to htmlBlock and return the filtered htmlBlock 
def get_filtered_html_blocks_list(htmlBlock):
    """Return a new list with the elements of ``htmlBlock`` accepted by ``filter_data_list_fn``.

    Args:
        htmlBlock: iterable of parsed ``<li>`` elements.

    Returns:
        list: the accepted elements, in their original order.
    """
    return [candidate for candidate in htmlBlock if filter_data_list_fn(candidate)]

In [23]:
# function to map filtered html block to desired data json/dictionary
def map_html_block_list_to_data_list(filteredHtmlBlock):
    """Convert already-filtered ``<li>`` elements into plain dictionaries.

    Args:
        filteredHtmlBlock: iterable of elements whose ``.a`` link has a text
            and an ``href`` with at least three ``/``-separated parts.

    Returns:
        list[dict]: one dict per element with the keys "name" (link text),
        "type" (part 1 of the split href) and "subdirectory" (part 2).
    """
    records = []
    for item in filteredHtmlBlock:
        link = item.a
        # read the text before the href, then split the href only once
        label = link.text
        segments = link['href'].split('/')
        records.append({
            "name": label,
            "type": segments[1],
            "subdirectory": segments[2],
        })
    return records

In [24]:
# same result as the map & filter approach, now produced by the two custom functions:
# first keep only the li elements that link to a stock or an etf,
# then turn each of them into a {"name", "type", "subdirectory"} dictionary
filteredHtmlBlock = get_filtered_html_blocks_list(htmlBlock)
data = map_html_block_list_to_data_list(filteredHtmlBlock)
print(len(data))
data[0:5]

436


[{'name': 'A & M Febcon Ltd',
  'subdirectory': 'a-and-m-febcon-AMF',
  'type': 'stocks'},
 {'name': 'A B Cotspin India Ltd',
  'subdirectory': 'ab-cotspin-india-ABCO',
  'type': 'stocks'},
 {'name': 'A B Infrabuild Ltd',
  'subdirectory': 'a-b-infrabuild-ABIN',
  'type': 'stocks'},
 {'name': 'A F Enterprises Ltd',
  'subdirectory': 'a-f-enterprises-AFE',
  'type': 'stocks'},
 {'name': 'A Infrastructure Ltd',
  'subdirectory': 'a-infrastructure-AIN',
  'type': 'stocks'}]

## Let us try and extract data (from all the pages)

Hit "https://www.tickertape.in/stocks?filter={filter}" with filter = "Top", "A"-"Z" and "Others"

In [None]:
"https://www.tickertape.in/stocks?filter="+"a"

'https://www.tickertape.in/stocks?filter=a'

In [None]:
def getNames(url_filter, timeout=30):
    """Scrape the stock/ETF links listed on one tickertape.in filter page.

    Args:
        url_filter: value appended to
            "https://www.tickertape.in/stocks?filter=" (e.g. "top", "a", "others").
        timeout: seconds to wait for the server before the request is aborted.

    Returns:
        list[dict]: one dict per listed link with the keys "name" (link text),
        "type" (part 1 of the href split on "/") and "subdirectory" (part 2).
        An empty list is returned when the request or the parsing fails; the
        error is printed instead of being raised.
    """

    ########## define custom functions: ##########

    # custom filter function
    def __filter_data_list_fn(listBlock):
        anchor = listBlock.a
        # an <li> without a link, or a link without an href, cannot be a listing entry
        href = anchor.get('href') if anchor is not None else None
        if not href:
            return False
        # the mapping step indexes parts [1] and [2] of the split href, so reject
        # hrefs that are too short instead of losing the whole page to an IndexError
        return ("etfs" in href or "stocks" in href) and len(href.split('/')) > 2

    # function to apply filter to htmlBlock and return the filtered htmlBlock
    def __get_filtered_html_blocks_list(htmlBlock):
        # call the nested filter, so this function does not depend on a
        # notebook-level helper having been defined by an earlier cell
        return [block for block in htmlBlock if __filter_data_list_fn(block)]

    # function to map filtered html block to desired data json/dictionary
    def __map_html_block_list_to_data_list(filteredHtmlBlock):
        data_list = []
        for block in filteredHtmlBlock:
            href_parts = block.a['href'].split('/')
            data_list.append({
                "name": block.a.text,
                "type": href_parts[1],
                "subdirectory": href_parts[2]
            })
        return data_list

    ########## get html data from webpage and transform to req data ##########
    try:
        # hit the page and get html
        _header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        # without a timeout a stalled connection would block the notebook indefinitely
        _response = requests.get(
            "https://www.tickertape.in/stocks?filter="+url_filter,
            headers=_header,
            timeout=timeout
        )
        # turn 4xx/5xx answers into an exception instead of parsing an error page
        _response.raise_for_status()

        # give the webpage to Beautiful Soup using parsers: "html.parser" or "lxml"
        _soup = BeautifulSoup(_response.text, 'lxml')

        # find all li
        _htmlBlock = _soup.find_all("li")

        # filter out lis that doesn't contain our data
        _filteredHtmlBlock = __get_filtered_html_blocks_list(_htmlBlock)

        # get the data
        _data = __map_html_block_list_to_data_list(_filteredHtmlBlock)

        return _data
    except Exception as _e:
        # best effort by design: report the problem and hand back an empty page
        print(_e)
        return []

In [None]:
getNames('a')[0]

{'name': 'A & M Febcon Ltd',
 'subdirectory': 'a-and-m-febcon-AMF',
 'type': 'stocks'}

In [None]:
# filter values used in the page urls: "top" for the top stocks page,
# one value per letter plus "others" for the complete listing
tickertape_stocks_top = ["top"]
tickertape_stocks_all = [*"abcdefghijklmnopqrstuvwxyz", "others"]
print(tickertape_stocks_top, tickertape_stocks_all, sep="\n")

['top']
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'others']


In [None]:
# let's scrape all the pages!
fulldata = []

# the loop variable is deliberately not called `filter`: a top-level name `filter`
# would replace the builtin filter() for every cell of the notebook
for url_filter in tickertape_stocks_all:
    print("https://www.tickertape.in/stocks?filter="+url_filter)
    try:
        # get data from each page
        page_data = getNames(url_filter)
    except Exception as _e:
        # some issue occured, catch exception
        print("failed!")
        print(_e)
        continue

    if page_data:
        # append the page to the data list in place (no new list per iteration)
        fulldata.extend(page_data)
        print("successful!")
    else:
        # an empty result is not reported as a success: either the page lists
        # nothing or getNames() hit an error and printed it
        print("no entries returned!")

https://www.tickertape.in/stocks?filter=a
successful!
https://www.tickertape.in/stocks?filter=b
successful!
https://www.tickertape.in/stocks?filter=c
successful!
https://www.tickertape.in/stocks?filter=d
successful!
https://www.tickertape.in/stocks?filter=e
successful!
https://www.tickertape.in/stocks?filter=f
successful!
https://www.tickertape.in/stocks?filter=g
successful!
https://www.tickertape.in/stocks?filter=h
successful!
https://www.tickertape.in/stocks?filter=i
successful!
https://www.tickertape.in/stocks?filter=j
successful!
https://www.tickertape.in/stocks?filter=k
successful!
https://www.tickertape.in/stocks?filter=l
successful!
https://www.tickertape.in/stocks?filter=m
successful!
https://www.tickertape.in/stocks?filter=n
successful!
https://www.tickertape.in/stocks?filter=o
successful!
https://www.tickertape.in/stocks?filter=p
successful!
https://www.tickertape.in/stocks?filter=q
successful!
https://www.tickertape.in/stocks?filter=r
successful!
https://www.tickertape.in/st

In [None]:
print(len(fulldata))
fulldata[-1]

4626


{'name': '7Seas Entertainment Ltd',
 'subdirectory': '7seas-entertainment-SEAS',
 'type': 'stocks'}

In [None]:
print(type(fulldata))
print(type(fulldata[0]))

<class 'list'>
<class 'dict'>


In [None]:
topdata = getNames(tickertape_stocks_top[0])

In [None]:
print(len(topdata))
topdata[0]

100


{'name': 'Adani Enterprises Ltd',
 'subdirectory': 'adani-enterprises-ADEL',
 'type': 'stocks'}

In [None]:
print(type(topdata))
print(type(topdata[0]))

<class 'list'>
<class 'dict'>


## Exporting the data

In [None]:
# # Serializing json
# json_object = json.dumps(fulldata) 
# print(json_object)

In [None]:
# serialize the complete listing first, then write the JSON text to disk in one go
with open("full-company-list.json", "w") as outfile:
    outfile.write(json.dumps(fulldata))

In [None]:
# serialize the "top" listing first, then write the JSON text to disk in one go
with open("top-company-list.json", "w") as outfile:
    outfile.write(json.dumps(topdata))