In [1]:
## Get the library Setup
from equity_downloader import *
from datetime import date, timedelta, datetime
import random
import time
import random
import requests
import pandas as pd
import numpy as np
import xgboost as xgb
from itertools import chain
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
import os
import re
from sklearn.model_selection import train_test_split


In [2]:
# Work out the first date for which each archive still needs a download.
daily_folder = "E:/Equity Research/daily"
index_folder = "E:/Equity Research/index"
daily_format = "https://archives.nseindia.com/products/content/sec_bhavdata_full_"
index_format = "https://archives.nseindia.com/content/indices/ind_close_all_"


def _next_download_date(folder, prefix):
    """Return the day after the newest ``<prefix>DDMMYYYY.csv`` file in ``folder``.

    Parameters
    ----------
    folder : str
        Directory whose file names are scanned.
    prefix : str
        Literal file-name prefix that precedes the 8-digit date stamp.

    Returns
    -------
    datetime
        Latest date found in the file names, plus one day.

    Raises
    ------
    ValueError
        If no file in ``folder`` matches the expected naming pattern, or a
        matching name carries digits that are not a valid ``%d%m%Y`` date.
    """
    # Escape the prefix and the dot so only exact "<prefix>DDMMYYYY.csv" names
    # are parsed; unrelated files in the folder are skipped instead of
    # crashing strptime.
    name_pattern = re.compile(re.escape(prefix) + r"(\d{8})\.csv")
    file_dates = []
    for file_name in os.listdir(folder):
        matched = name_pattern.fullmatch(file_name)
        if matched:
            file_dates.append(datetime.strptime(matched.group(1), "%d%m%Y"))
    if not file_dates:
        raise ValueError(f"No '{prefix}DDMMYYYY.csv' files found in '{folder}'")
    return max(file_dates) + timedelta(1)


# NOTE: despite the "max" in their names, these hold the day AFTER the newest
# archived file, i.e. the first date that still has to be downloaded.
daily_max_date = _next_download_date(daily_folder, "sec_bhavdata_full_")
index_max_date = _next_download_date(index_folder, "ind_close_all_")

# Read today's date once so every value below refers to the same day.
today = date.today()
daily_lst = [daily_max_date.year, daily_max_date.month, daily_max_date.day]
today_lst = [today.year, today.month, today.day]

print(f"Downloading data from {daily_max_date} to {today}")

Downloading data from 2023-06-02 00:00:00 to 2023-06-03


In [3]:
# Download the files for daily and index
def _download_missing_files(first_missing, url_prefix, target_folder):
    """Download one CSV per generated date, starting at ``first_missing``.

    Parameters
    ----------
    first_missing : datetime
        First date that still needs to be fetched. Nothing is downloaded
        unless this is strictly before today.
    url_prefix : str
        URL up to (but excluding) the date stamp and the ``.csv`` suffix.
    target_folder : str
        Folder handed to ``download_file`` as the destination.
    """
    if first_missing.date() >= date.today():
        return

    start_lst = [first_missing.year, first_missing.month, first_missing.day]
    for x in generate_date_list(start=start_lst, end=today_lst):
        url = f"{url_prefix}{x}.csv"
        # Third argument is passed through unchanged from the original call;
        # its meaning is defined by equity_downloader.download_file.
        download_file(url, target_folder, 2)
        print(url)

        # Pause execution for a random delay between requests
        time.sleep(random.uniform(3, 5))


_download_missing_files(daily_max_date, daily_format, 'E:/Equity Research/temp_daily')

# Download the index file.
# The start date comes from index_max_date (not the daily start date), so the
# index archive is filled from its own last file even when the two archives
# are out of sync.
_download_missing_files(index_max_date, index_format, 'E:/Equity Research/temp_index')
    

File downloaded successfully to 'E:/Equity Research/temp_daily\sec_bhavdata_full_02062023.csv'
https://archives.nseindia.com/products/content/sec_bhavdata_full_02062023.csv
File downloaded successfully to 'E:/Equity Research/temp_index\ind_close_all_02062023.csv'
https://archives.nseindia.com/content/indices/ind_close_all_02062023.csv


In [4]:
print("Arrange and clean up the folders")

# Read the base file and the temp files
daily_base = pd.read_csv("Misc Files/bhavcopy_base.csv", low_memory=False)
daily_base.columns = [x.strip() for x in daily_base.columns]
index_base = pd.read_csv("Misc Files/index_base.csv")

# Each dataset is guarded by its OWN temp folder: index files are ingested even
# when no daily file arrived, and the index steps are skipped when only daily
# files were downloaded.
if len(os.listdir("E:/Equity Research/temp_index")) > 0:
    # Archive the new files, then merge them into one CSV and load it
    copy_files("E:/Equity Research/temp_index", "E:/Equity Research/index")
    combine_files(folder_path="E:/Equity Research/temp_index", output_file="E:/Equity Research/Misc Files/temp_index.csv")
    temp_index = pd.read_csv("E:/Equity Research/Misc Files/temp_index.csv")

    # Append the new rows to get the current dataset
    index_base = pd.concat([index_base, temp_index], ignore_index=True)

    # Clean the temp folder so a re-run does not append the same rows again
    clean_folder(folder_path="E:/Equity Research/temp_index")

if len(os.listdir("E:/Equity Research/temp_daily")) > 0:
    # Archive the new files, then merge them into one CSV and load it
    copy_files("E:/Equity Research/temp_daily", "E:/Equity Research/daily")
    combine_files(folder_path="E:/Equity Research/temp_daily", output_file="E:/Equity Research/Misc Files/temp_daily.csv")
    temp_daily = pd.read_csv("E:/Equity Research/Misc Files/temp_daily.csv")
    temp_daily.columns = [x.strip() for x in temp_daily.columns]
    # Keep only rows whose SERIES value is ' EQ' (the leading space is part of
    # the value being compared; only the column names were stripped above)
    temp_daily = temp_daily[temp_daily['SERIES']==' EQ']

    # Append the new rows to get the current dataset
    daily_base = pd.concat([daily_base, temp_daily], ignore_index=True)

    # Clean the temp folder so a re-run does not append the same rows again
    clean_folder(folder_path="E:/Equity Research/temp_daily")

# Now save this dataset and replace the old dataset
# NOTE(review): these paths are relative, unlike the absolute temp paths above;
# presumably the notebook runs with "E:/Equity Research" as working directory — confirm.
daily_base.to_csv("Misc Files/bhavcopy_base.csv", index=False)
index_base.to_csv("Misc Files/index_base.csv", index=False)

Arrange and clean up the folders
ind_close_all_02062023.csv
Combined data saved to 'E:/Equity Research/Misc Files/temp_index.csv'
sec_bhavdata_full_02062023.csv
Combined data saved to 'E:/Equity Research/Misc Files/temp_daily.csv'
All files have been removed from the folder.
All files have been removed from the folder.


In [5]:
# Attach market-cap / industry attributes to every daily row
sec_mcap = pd.read_csv("E:/Equity Research/Misc Files/Eq_Mcap_Industry_data.csv")
eq_df_industry = daily_base.merge(sec_mcap, on='SYMBOL')

# Convert the raw DATE1 values with convert_date, then parse them as datetimes
converted_dates = [convert_date(raw_value) for raw_value in eq_df_industry['DATE1'].tolist()]
eq_df_industry = eq_df_industry.assign(DATE=converted_dates)
eq_df_industry = eq_df_industry.assign(
    DATE=pd.to_datetime(eq_df_industry['DATE'], format='%Y-%m-%d')
)
eq_df_industry = eq_df_industry.sort_values(['SYMBOL', 'DATE'])

# The delivery columns use ' -' as a placeholder; swap it for 0 and cast to float
for deliv_col in ('DELIV_QTY', 'DELIV_PER'):
    eq_df_industry[deliv_col] = eq_df_industry[deliv_col].replace(' -', 0).astype(float)

# Drop the columns that are no longer needed, then fix the column order
eq_df_industry = eq_df_industry.drop(columns=['DATE1', 'SERIES', 'LAST_PRICE', 'AVG_PRICE'])
ordered_columns = [
    'SYMBOL', 'DATE', 'COMPANY NAME', 'INDUSTRY', 'MCAP_INLAKHS',
    'PREV_CLOSE', 'OPEN_PRICE', 'HIGH_PRICE', 'LOW_PRICE', 'CLOSE_PRICE',
    'TTL_TRD_QNTY', 'TURNOVER_LACS', 'NO_OF_TRADES', 'DELIV_QTY', 'DELIV_PER',
]
eq_df_industry = eq_df_industry[ordered_columns]

# Per-symbol statistics, broadcast back onto every row of that symbol
# (a rolling 240-row mean turnover was tried here and is currently disabled:
#  eq_df_industry.groupby('SYMBOL')['TURNOVER_LACS'].rolling(window=240).mean().reset_index(0,drop=True))
by_symbol = eq_df_industry.groupby('SYMBOL')
symbol_stats = {
    'AVG_TURNOVER': by_symbol['TURNOVER_LACS'].transform('mean'),
    'MIN_TURNOVER': by_symbol['TURNOVER_LACS'].transform('min'),
    'TOTAL_TRADE_DAYS': by_symbol['SYMBOL'].transform('count'),
    'MAX_DATE': by_symbol['DATE'].transform('max'),
}
eq_df_industry = eq_df_industry.assign(**symbol_stats)

# Keep rows of symbols with more than 600 rows and mean turnover above 250
keep_rows = (eq_df_industry['TOTAL_TRADE_DAYS'] > 600) & (eq_df_industry['AVG_TURNOVER'] > 250)
eq_df_industry_filtered = eq_df_industry[keep_rows].sort_values(['SYMBOL', 'DATE'])

eq_df_industry_filtered.to_csv("Misc Files/bhavcopy_modeldf.csv", index=False)

In [6]:
# Display the (rows, columns) shape of the filtered frame as the cell output
eq_df_industry_filtered.shape

(511372, 19)