<a href="https://colab.research.google.com/github/mratanusarkar/Web-Scraping-tickertapeIN/blob/feature%2Ffilter-and-sort/Notebooks/filter_and_sort_tickertapeIN_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Filtering and Sorting data scraped from tickertape.in

Input: 
* scraped JSON data (ETFs or Stocks)
* list of keys and values and comparison type for filtering
* list of keys and the order of the values for sorting

Output: JSON data with the filters & sortings applied on the input data

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import json
import time
import datetime as dt

import matplotlib.pyplot as plt
import plotly.express as px

## Input JSON Data from Scraping Scripts/Notebooks

In [2]:
with open('all-stocks-tickertape-data.json', 'r') as fp:
    stocksData = json.load(fp)

stocksData[0]

{'forecasts': {'buyRecommendation': None, 'forecast': None},
 'investmentChecklist': {'dividendReturns': 'negative',
  'entryPoint': 'positive',
  'intrinsicValue': 'neutral',
  'noRedFlags': 'positive',
  'roeVsFdRates': 'negative'},
 'keyMetrics': {'DividendYield': None,
  'PBRatio': '0.13',
  'PERatio': '-4.05',
  'SectorDivYld': '0.65%',
  'SectorPB': '5.21',
  'SectorPE': '68.27'},
 'marketcap': 'Smallcap',
 'name': 'A & M Febcon Ltd',
 'overview': {'absoluteChange': '0.74%',
  'capDesc': 'With a market cap of ₹1 cr, stock is ranked 4,213',
  'capType': 'Smallcap',
  'currentPrice': '1.35',
  'percentageChange': '-0.01',
  'riskDesc': 'Stock is 2.78x as volatile as Nifty',
  'riskType': 'Moderate Risk',
  'sectorDesc': 'Industrial Machinery',
  'sectorType': 'Industrials'},
 'price': '1.35',
 'profile': 'Company Profile: A & M Febcon is engaged in the services of engineering which is applied to the planning, designing and control of industrial operations and in the business of Ind

In [3]:
with open('all-etfs-tickertape-data.json', 'r') as fp:
    etfsData = json.load(fp)

etfsData[0]

{'investmentChecklist': {'entryPoint': 'negative',
  'expenseRatio': 'negative',
  'nav': 'neutral',
  'returnVsFdRates': 'positive',
  'trackingError': 'neutral'},
 'keyMetrics': {'AUM': '₹ 347.68cr',
  'ExpenseRatio': '0.58%',
  'RealtimeNAV': None,
  'SectorExpenseRatio': '0.61%',
  'SectorTrackingError': None,
  'TrackingError': None},
 'marketcap': 'Gold',
 'name': 'Aditya BSL Gold ETF',
 'overview': {'absoluteChange': '0.41%',
  'capDesc': 'ETF tracks gold prices',
  'capType': 'Gold',
  'currentPrice': '48.58',
  'percentageChange': '+0.20',
  'riskDesc': 'Average daily traded value of the ETF  is medium',
  'riskType': 'Medium Liquidity',
  'sectorDesc': 'Gold',
  'sectorType': 'ETF'},
 'price': '48.58',
 'profile': "AMC profile: Aditya Birla Sun Life AMC Limited, is a joint venture between the Aditya Birla Group and the Sun Life Financial Inc. of Canada. The joint venture brings together Aditya Birla Group's experience in the Indian market and Sun Life's global experience.",
 

## Input Filter/Sort Parameters

In [4]:
# TODO

## Processing the Data

#### 1. Stock Data

In [5]:
# handpicking the important filter-sort data fields

index = 0
print('name:', stocksData[index]['name'])
print('marcap:', stocksData[index]['marketcap'])
print('sector:', stocksData[index]['sector'])
print('risk:', stocksData[index]['risk'])
print('price:', stocksData[index]['price'])
print('intr_val:', stocksData[index]['investmentChecklist']['intrinsicValue'])
print('roe_fd:', stocksData[index]['investmentChecklist']['roeVsFdRates'])
print('div_ret:', stocksData[index]['investmentChecklist']['dividendReturns'])
print('entry_pt:', stocksData[index]['investmentChecklist']['entryPoint'])
print('red_flag:', stocksData[index]['investmentChecklist']['noRedFlags'])
print('recommend:', stocksData[index]['forecasts']['buyRecommendation'])

name: A & M Febcon Ltd
marcap: Smallcap
sector: Industrials
risk: Moderate Risk
price: 1.35
intr_val: neutral
roe_fd: negative
div_ret: negative
entry_pt: positive
red_flag: positive
recommend: None


In [6]:
# filter out the required parameters from the full data

selectedDataList = []
for i, data in enumerate(stocksData):
    if len(data) == 0:
        continue
    selectedData = {}
    selectedData['name'] = data['name']
    selectedData['marcap'] = data['marketcap']
    selectedData['sector'] = data['sector']
    selectedData['risk'] = data['risk']
    selectedData['price'] = data['price']
    selectedData['intr_val'] = data['investmentChecklist']['intrinsicValue']
    selectedData['roe_fd'] = data['investmentChecklist']['roeVsFdRates']
    selectedData['div_ret'] = data['investmentChecklist']['dividendReturns']
    selectedData['entry_pt'] = data['investmentChecklist']['entryPoint']
    selectedData['red_flag'] = data['investmentChecklist']['noRedFlags'] if 'noRedFlags' in data['investmentChecklist'] else 'negative'
    selectedData['recommend'] = data['forecasts']['buyRecommendation']

    selectedDataList.append(selectedData)

In [7]:
print(len(stocksData))
print(len(selectedDataList))

4487
4404


In [8]:
# convert into pandas dataframe for easier processing

stockDf = pd.json_normalize(selectedDataList)
stockDf

Unnamed: 0,name,marcap,sector,risk,price,intr_val,roe_fd,div_ret,entry_pt,red_flag,recommend
0,A & M Febcon Ltd,Smallcap,Industrials,Moderate Risk,1.35,neutral,negative,negative,positive,positive,
1,A B Cotspin India Ltd,Smallcap,Consumer Discretionary,High Risk,44.90,positive,neutral,neutral,positive,positive,
2,A B Infrabuild Ltd,Smallcap,Industrials,High Risk,11.10,neutral,positive,neutral,negative,positive,
3,A F Enterprises Ltd,Smallcap,Financials,Moderate Risk,135.45,neutral,positive,negative,negative,negative,
4,A Infrastructure Ltd,Smallcap,Industrials,High Risk,64.15,neutral,negative,negative,positive,negative,
...,...,...,...,...,...,...,...,...,...,...,...
4399,52 Weeks Entertainment Ltd,Smallcap,Communication Services,High Risk,2.77,neutral,negative,negative,positive,negative,
4400,5Paisa Capital Ltd,Smallcap,Financials,High Risk,364.80,negative,negative,negative,positive,positive,100%
4401,63 Moons Technologies Ltd,Smallcap,Information Technology,High Risk,294.35,neutral,negative,positive,positive,negative,
4402,7NR Retail Ltd,Smallcap,,High Risk,2.25,neutral,negative,negative,positive,negative,


In [9]:
stockDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4404 entries, 0 to 4403
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       4404 non-null   object
 1   marcap     4404 non-null   object
 2   sector     4404 non-null   object
 3   risk       4404 non-null   object
 4   price      4404 non-null   object
 5   intr_val   4404 non-null   object
 6   roe_fd     4404 non-null   object
 7   div_ret    4404 non-null   object
 8   entry_pt   4404 non-null   object
 9   red_flag   4404 non-null   object
 10  recommend  696 non-null    object
dtypes: object(11)
memory usage: 378.6+ KB


In [10]:
# categorical columns

cat_cols = ['marcap', 'sector', 'risk', 'intr_val', 'roe_fd', 'div_ret', 'entry_pt', 'red_flag', 'recommend']
for col in cat_cols:
    print(col + ': ', stockDf[col].unique())

marcap:  ['Smallcap' 'Unknown' 'Midcap' 'Largecap']
sector:  ['Industrials' 'Consumer Discretionary' 'Financials' 'Materials' ''
 'Information Technology' 'Consumer Staples' 'Real Estate' 'Health Care'
 'Energy' 'Communication Services' 'Utilities' 'Unknown']
risk:  ['Moderate Risk' 'High Risk' 'Low Risk']
intr_val:  ['neutral' 'positive' 'negative']
roe_fd:  ['negative' 'neutral' 'positive']
div_ret:  ['negative' 'neutral' 'positive']
entry_pt:  ['positive' 'negative' 'neutral']
red_flag:  ['positive' 'negative']
recommend:  [None '42%' '85%' '62%' '92%' '100%' '60%' '40%' '71%' '0%' '95%' '33%'
 '83%' '89%' '50%' '70%' '44%' '53%' '63%' '80%' '79%' '94%' '46%' '22%'
 '21%' '88%' '67%' '76%' '75%' '73%' '61%' '65%' '82%' '25%' '16%' '81%'
 '97%' '93%' '55%' '90%' '48%' '77%' '74%' '43%' '78%' '87%' '72%' '58%'
 '37%' '91%' '86%' '35%' '17%' '64%' '84%' '11%' '54%' '59%' '49%' '52%'
 '20%' '98%' '68%' '13%' '69%' '45%' '10%' '36%' '31%' '96%' '39%' '30%'
 '5%' '38%' '57%' '14%' '28%' '

In [11]:
stockDf["marcap"] = stockDf["marcap"].replace('Unknown', '0').replace('Smallcap', '1').replace('Midcap', '2').replace('Largecap', '3').astype(int)
stockDf["risk"] = stockDf["risk"].replace('Unknown', '0').replace('Low Risk', '1').replace('Moderate Risk', '2').replace('High Risk', '3').astype(int)
stockDf["intr_val"] = stockDf["intr_val"].replace('negative', '-1').replace('neutral', '0').replace('positive', '1').astype(int)
stockDf["roe_fd"] = stockDf["roe_fd"].replace('negative', '-1').replace('neutral', '0').replace('positive', '1').astype(int)
stockDf["div_ret"] = stockDf["div_ret"].replace('negative', '-1').replace('neutral', '0').replace('positive', '1').astype(int)
stockDf["entry_pt"] = stockDf["entry_pt"].replace('negative', '-1').replace('neutral', '0').replace('positive', '1').astype(int)
stockDf["red_flag"] = stockDf["intr_val"].replace('negative', '-1').replace('neutral', '0').replace('positive', '1').astype(int)

In [12]:
# decimal column

stockDf["recommend"].fillna(value=np.nan).replace('[\%]', '', regex=True).astype(float)

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
4399      NaN
4400    100.0
4401      NaN
4402      NaN
4403      NaN
Name: recommend, Length: 4404, dtype: float64

In [13]:
stockDf["recommend"] = stockDf["recommend"].fillna(value=np.nan).replace('[\%]', '', regex=True).astype(float)

In [14]:
# pricing column

stockDf["price"].replace('[\—,]', '', regex=True).replace('', 'nan').astype(float)

0         1.35
1        44.90
2        11.10
3       135.45
4        64.15
         ...  
4399      2.77
4400    364.80
4401    294.35
4402      2.25
4403     22.40
Name: price, Length: 4404, dtype: float64

In [15]:
stockDf["price"] = stockDf["price"].replace('[\—,]', '', regex=True).replace('', 'nan').astype(float)

In [16]:
# final dataframe
stockDf

Unnamed: 0,name,marcap,sector,risk,price,intr_val,roe_fd,div_ret,entry_pt,red_flag,recommend
0,A & M Febcon Ltd,1,Industrials,2,1.35,0,-1,-1,1,0,
1,A B Cotspin India Ltd,1,Consumer Discretionary,3,44.90,1,0,0,1,1,
2,A B Infrabuild Ltd,1,Industrials,3,11.10,0,1,0,-1,0,
3,A F Enterprises Ltd,1,Financials,2,135.45,0,1,-1,-1,0,
4,A Infrastructure Ltd,1,Industrials,3,64.15,0,-1,-1,1,0,
...,...,...,...,...,...,...,...,...,...,...,...
4399,52 Weeks Entertainment Ltd,1,Communication Services,3,2.77,0,-1,-1,1,0,
4400,5Paisa Capital Ltd,1,Financials,3,364.80,-1,-1,-1,1,-1,100.0
4401,63 Moons Technologies Ltd,1,Information Technology,3,294.35,0,-1,1,1,0,
4402,7NR Retail Ltd,1,,3,2.25,0,-1,-1,1,0,


In [17]:
stockDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4404 entries, 0 to 4403
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       4404 non-null   object 
 1   marcap     4404 non-null   int64  
 2   sector     4404 non-null   object 
 3   risk       4404 non-null   int64  
 4   price      4398 non-null   float64
 5   intr_val   4404 non-null   int64  
 6   roe_fd     4404 non-null   int64  
 7   div_ret    4404 non-null   int64  
 8   entry_pt   4404 non-null   int64  
 9   red_flag   4404 non-null   int64  
 10  recommend  696 non-null    float64
dtypes: float64(2), int64(7), object(2)
memory usage: 378.6+ KB


#### 2. ETF Data

In [18]:
# TODO

## Data Analysis and EDA

In [19]:
# TODO

## Filter and Sort Data based on Input Parameters

In [20]:
# TODO