# 01 - Data Consolidation

In [1]:
import sys
# Show the path of the Python interpreter backing this kernel, to confirm
# the notebook runs in the intended environment.
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
from pathlib import Path

import pandas as pd

## Constants

In [3]:
# Root folder of the raw NSE downloads, relative to the notebook's working
# directory; each stock symbol has its own sub-directory underneath.
STOCK_DATA_DIR = Path("..") / "data" / "NSE"

# Sanity check shown as cell output: the folder should be present.
STOCK_DATA_DIR.exists()

True

## Consolidating historical stock data

### Merging historical data from multiple years

In [4]:
def get_consolidate_stock_data(
    stock_symbol: str,
    data_dir: "Path | None" = None,
) -> "pd.DataFrame | None":
    """Combine every historical CSV of one stock into a single date-sorted frame.

    Each ``*{stock_symbol}*.csv`` file found in ``<data_dir>/<stock_symbol>``
    is read (treating ``,`` as a thousands separator), its column names are
    stripped of surrounding whitespace and its ``Date`` column is parsed to
    datetimes. The files are then stacked and rows that are exact duplicates
    (for example, overlaps between two files) are kept only once.

    Parameters
    ----------
    stock_symbol : str
        Name of the symbol's sub-directory; also used inside the file pattern.
    data_dir : Path, optional
        Root directory containing one sub-directory per symbol. When omitted,
        the notebook-level ``STOCK_DATA_DIR`` is used.

    Returns
    -------
    pd.DataFrame or None
        All rows ordered by ``Date`` with a fresh ``0..n-1`` index, or
        ``None`` when no matching CSV file exists for the symbol.
    """
    root = STOCK_DATA_DIR if data_dir is None else Path(data_dir)

    # Sort the paths: glob order is unspecified, and a fixed order keeps the
    # result reproducible between runs and machines.
    csv_files = sorted(root.joinpath(stock_symbol).glob(f"*{stock_symbol}*.csv"))
    if not csv_files:
        return None

    frames = []
    for csv_file in csv_files:
        frame = pd.read_csv(csv_file, thousands=",")
        frame.columns = [c.strip() for c in frame.columns]
        frame["Date"] = pd.to_datetime(frame["Date"])
        frames.append(frame)

    # Stack once and de-duplicate instead of chaining outer merges: a merge on
    # every shared column raises when a column's dtype differs between files
    # and multiplies rows that are duplicated in both inputs.
    return (
        pd.concat(frames, ignore_index=True)
        .drop_duplicates()
        .sort_values("Date", kind="mergesort")  # stable: ties keep file order
        .reset_index(drop=True)
    )

# Try the helper on a single symbol; the returned frame is rendered as the
# cell output.
get_consolidate_stock_data("ITBEES")

Unnamed: 0,Date,series,OPEN,HIGH,LOW,PREV. CLOSE,ltp,close,vwap,52W H,52W L,VOLUME,VALUE,No of trades
0,2020-07-01,EQ,17.71,17.71,14.65,14.76,14.65,14.65,14.97,17.71,14.65,26187,3.919319e+05,55
1,2020-07-02,EQ,14.65,15.74,14.65,14.65,15.21,15.26,15.07,17.71,14.65,5602,8.443024e+04,31
2,2020-07-03,EQ,15.41,15.41,15.28,15.26,15.39,15.38,15.31,17.71,14.65,13559,2.076389e+05,18
3,2020-07-06,EQ,15.50,18.16,15.41,15.38,15.54,15.55,16.05,18.16,14.65,33643,5.400182e+05,168
4,2020-07-07,EQ,15.55,16.29,15.55,15.55,15.75,15.74,15.73,18.16,14.65,15727,2.473728e+05,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,2023-12-22,EQ,37.00,37.85,36.77,36.89,37.70,37.70,37.35,38.95,27.56,5761095,2.151888e+08,14112
867,2023-12-26,EQ,37.80,37.80,37.06,37.70,37.55,37.55,37.49,38.95,27.56,5384568,2.018781e+08,19101
868,2023-12-27,EQ,37.66,37.97,37.45,37.55,37.70,37.73,37.68,38.95,27.56,4912091,1.850791e+08,12177
869,2023-12-28,EQ,38.99,38.99,37.62,37.73,37.73,37.72,37.73,38.99,27.56,3567491,1.346162e+08,12787


In [5]:
# Consolidate every symbol folder under STOCK_DATA_DIR and store the result
# next to the source CSVs as ``consolidated.parquet``.
# Only directories are visited, so stray files in the data root are not
# mistaken for symbols; sorting makes the processing order reproducible.
for stock_dir in sorted(p for p in STOCK_DATA_DIR.glob("*") if p.is_dir()):
    # Use .name rather than .stem: .stem would cut a folder name at its last
    # dot and look up the wrong symbol.
    stock_symbol = stock_dir.name
    print(f"{stock_symbol} - {len(list(stock_dir.glob('*.csv')))} files")
    stock_df = get_consolidate_stock_data(stock_symbol)

    # None means no CSV matched this symbol; nothing to report or write.
    if stock_df is not None:
        print(f"\t{stock_df.shape[0]} records")
        print(f"\t{stock_df['Date'].min().date()} to {stock_df['Date'].max().date()}")
        stock_df.to_parquet(stock_dir.joinpath("consolidated.parquet"), index=False)

HDFCBANK - 4 files
	1009 records
	2020-01-01 to 2023-12-29
ITBEES - 4 files
	871 records
	2020-07-01 to 2023-12-29
