# 02 - Data Processing

In [1]:
# Sanity check: display the path of the Python interpreter backing this kernel.
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import numpy as np
import pandas as pd

import constants as cnst
import stock_utils as su

# Never elide columns when rendering wide frames in cell outputs.
pd.options.display.max_columns = None

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Ask the project helper for the symbols available under the NSE data directory.
stock_symbols = su.get_all_stock_symbols(cnst.NSE_DATA_DIR)

stock_symbols

['HDFCBANK', 'ITBEES']

In [4]:
# The rest of the notebook processes a single symbol: the first one returned above.
STOCK_SYMBOL = stock_symbols[0]
STOCK_SYMBOL

'HDFCBANK'

## Data loading

In [5]:
# Read the symbol's consolidated history, discard the two unused columns and
# relabel the remaining twelve.
# NOTE(review): the relabelling is positional, so it relies on the parquet
# file keeping its current column order - confirm against the file's producer.
stock_df = (
    pd.read_parquet(cnst.NSE_DATA_DIR / STOCK_SYMBOL / "consolidated.parquet")
    .drop(columns = ['series', 'PREV. CLOSE'])
    .set_axis(
        [
            'Date',
            'Open', 'High', 'Low', 'LTP', 'Close',
            'VWAP', '52W H', '52W L',
            'Volume', 'Value', '#Trades',
        ],
        axis = 1,
    )
)
stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades
0,2020-01-01,1276.10,1280.00,1270.60,1279.00,1278.60,1276.64,2503.3,1084.00,1836849,2.345001e+09,46625
1,2020-01-02,1279.00,1288.00,1279.00,1286.00,1286.75,1284.56,2503.3,1084.00,3068583,3.941792e+09,104570
2,2020-01-03,1282.20,1285.00,1263.60,1268.50,1268.40,1270.48,2503.3,1084.00,5427775,6.895886e+09,157066
3,2020-01-06,1260.00,1261.80,1236.00,1240.25,1240.95,1247.24,2503.3,1084.00,5445093,6.791348e+09,155007
4,2020-01-07,1258.90,1271.45,1252.25,1261.00,1260.60,1261.48,2503.3,1084.00,7362247,9.287302e+09,189026
...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2024-01-16,1673.00,1683.65,1658.10,1678.00,1679.15,1672.44,1757.5,1460.25,12661250,2.117514e+10,347404
1021,2024-01-17,1570.00,1596.80,1528.40,1542.15,1537.50,1565.65,1757.5,1460.25,85072618,1.331936e+11,2098772
1022,2024-01-18,1494.00,1515.00,1480.05,1490.00,1486.15,1495.03,1757.5,1460.25,80535465,1.204027e+11,1582497
1023,2024-01-19,1505.95,1510.25,1468.40,1474.90,1470.65,1483.88,1757.5,1460.25,54800269,8.131686e+10,1275220


In [6]:
# Column dtypes and null counts; "deep" asks pandas for the true memory footprint.
stock_df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     1025 non-null   datetime64[ns]
 1   Open     1025 non-null   float64       
 2   High     1025 non-null   float64       
 3   Low      1025 non-null   float64       
 4   LTP      1025 non-null   float64       
 5   Close    1025 non-null   float64       
 6   VWAP     1025 non-null   float64       
 7   52W H    1025 non-null   float64       
 8   52W L    1025 non-null   float64       
 9   Volume   1025 non-null   int64         
 10  Value    1025 non-null   float64       
 11  #Trades  1025 non-null   int64         
dtypes: datetime64[ns](1), float64(9), int64(2)
memory usage: 96.2 KB


## Feature engineering

### Daily candle based columns

In [7]:
# Spread between the session's high and low.
stock_df['Range'] = stock_df['High'].sub(stock_df['Low'])

# 1 when the session closed at or above its open, otherwise 0.
stock_df['IsGreen'] = stock_df['Close'].ge(stock_df['Open']).astype(int)

# 1 when the session's low/high lies within a 1e-4 relative tolerance of the
# reported 52-week low/high (no absolute tolerance), otherwise 0.
for flag_col, price_col, extreme_col in (
    ('Is52WLow', 'Low', '52W L'),
    ('Is52WHigh', 'High', '52W H'),
):
    stock_df[flag_col] = np.isclose(
        stock_df[price_col], stock_df[extreme_col], atol = 0, rtol = 1e-4
    ).astype(int)

stock_df.sample(10)

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh
712,2022-10-19,1464.0,1477.4,1453.4,1459.95,1458.65,1464.96,1725.0,1271.6,6483520,9498088000.0,186569,24.0,0,0,0
764,2023-01-03,1622.2,1643.0,1622.2,1640.5,1639.35,1634.63,1722.1,1271.6,4189015,6847496000.0,102047,20.8,1,0,0
749,2022-12-13,1650.0,1657.0,1645.0,1645.3,1648.3,1651.67,1722.1,1271.6,8515851,14065340000.0,165451,12.0,0,0,0
305,2021-02-25,1609.75,1636.25,1602.0,1605.4,1606.4,1620.67,1641.0,738.75,10054785,16295460000.0,231264,34.25,0,0,0
216,2020-10-20,1207.5,1227.4,1205.9,1224.6,1223.95,1218.32,1305.5,738.75,13705994,16698230000.0,278115,21.5,1,0,0
41,2020-02-19,1222.5,1230.0,1213.9,1227.2,1227.2,1221.68,2503.3,1084.0,5006134,6115909000.0,101692,16.1,1,0,0
412,2021-08-04,1441.0,1474.5,1440.0,1465.5,1465.3,1462.94,1641.0,993.0,11026974,16131830000.0,294722,34.5,1,0,0
575,2022-03-31,1471.0,1484.7,1465.9,1470.0,1470.35,1473.98,1725.0,1292.0,7147857,10535800000.0,175701,18.8,0,0,0
615,2022-05-31,1395.9,1404.8,1380.0,1384.0,1388.95,1396.23,1725.0,1278.3,6742694,9414335000.0,178695,24.8,0,0,0
696,2022-09-26,1425.35,1436.45,1418.0,1429.8,1426.65,1426.27,1725.0,1271.6,4461743,6363659000.0,183001,18.45,1,0,0


In [8]:
# Percentage of rows for which each binary flag is set.
flag_cols = ['IsGreen', 'Is52WLow', 'Is52WHigh']
stock_df[flag_cols].sum().mul(100).div(len(stock_df)).round(2)

IsGreen      49.07
Is52WLow      1.46
Is52WHigh     2.54
dtype: float64

In [9]:
# (rows, columns) after adding the candle-based features.
stock_df.shape

(1025, 16)

### Date based columns

In [10]:
trade_dates = stock_df['Date']

# Calendar components of the trade date, added in a fixed column order.
for feature_name, dt_attr in (
    ('Day', 'day'),
    ('Month', 'month'),
    ('Year', 'year'),
    ('Weekday', 'weekday'),
    ('DayOfYear', 'day_of_year'),
    ('Quarter', 'quarter'),
):
    stock_df[feature_name] = getattr(trade_dates.dt, dt_attr)

# Calendar-day gap to the previous row. The first row has no predecessor,
# so its missing gap is filled with 1.
stock_df['DaysSinceLastTradingSession'] = trade_dates.diff().dt.days.fillna(1).astype(int)

stock_df.sample(10)

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession
329,2021-04-05,1480.0,1485.0,1431.0,1450.2,1449.6,1451.04,1641.0,826.1,8003293,11613130000.0,238305,54.0,0,0,0,5,4,2021,0,95,2,4
207,2020-10-07,1144.9,1164.85,1137.6,1160.5,1162.25,1154.88,1305.5,738.75,10624527,12270100000.0,210164,27.25,1,0,0,7,10,2020,2,281,4,1
198,2020-09-23,1046.0,1055.0,1034.35,1048.95,1047.25,1046.38,1305.5,738.75,8039935,8412864000.0,181416,20.65,1,0,0,23,9,2020,2,267,3,1
705,2022-10-10,1408.0,1426.0,1398.2,1415.95,1415.0,1409.21,1725.0,1271.6,6554651,9236903000.0,215487,27.8,1,0,0,10,10,2022,0,283,4,3
182,2020-09-01,1128.0,1143.6,1108.0,1123.95,1127.3,1128.0,2288.8,738.75,14438949,16287150000.0,217329,35.6,0,0,0,1,9,2020,1,245,3,1
248,2020-12-04,1381.0,1401.45,1373.3,1390.1,1385.6,1386.77,1464.4,738.75,13456161,18660620000.0,287684,28.15,1,0,0,4,12,2020,4,339,4,1
13,2020-01-20,1304.85,1304.85,1252.5,1257.35,1254.9,1265.0,2503.3,1084.0,11089225,14027830000.0,245672,52.35,0,0,0,20,1,2020,0,20,1,3
1001,2023-12-19,1650.0,1658.65,1644.05,1650.5,1652.9,1651.23,1757.5,1460.25,12004223,19821700000.0,257002,14.6,1,0,0,19,12,2023,1,353,4,1
645,2022-07-12,1405.25,1405.25,1389.4,1391.2,1391.8,1395.12,1725.0,1271.6,4938487,6889788000.0,107652,15.85,0,0,0,12,7,2022,1,193,3,1
914,2023-08-11,1638.4,1638.4,1616.2,1620.9,1618.8,1625.76,1757.5,1365.0,14112911,22944250000.0,271050,22.2,0,0,0,11,8,2023,4,223,3,1


In [11]:
# (rows, columns) after adding the date-based features.
stock_df.shape

(1025, 23)

### Moving averages

#### `Close`

In [12]:
# Trailing mean of Close over each configured window (counted in rows).
# min_periods = 1 makes the earliest rows average over whatever history
# exists instead of producing NaN.
close_series = stock_df['Close']
for n_rows in cnst.ROLLING_WINDOWS:
    trailing_mean = close_series.rolling(n_rows, min_periods = 1).mean()
    stock_df[f'Close {n_rows}MA'] = trailing_mean.round(2)

stock_df.filter(regex = "Close.*")

Unnamed: 0,Close,Close 7MA,Close 15MA,Close 30MA,Close 60MA
0,1278.60,1278.60,1278.60,1278.60,1278.60
1,1286.75,1282.68,1282.68,1282.68,1282.68
2,1268.40,1277.92,1277.92,1277.92,1277.92
3,1240.95,1268.68,1268.68,1268.68,1268.68
4,1260.60,1267.06,1267.06,1267.06,1267.06
...,...,...,...,...,...
1020,1679.15,1658.86,1678.20,1664.56,1585.82
1021,1537.50,1640.87,1667.15,1661.69,1586.06
1022,1486.15,1617.39,1652.54,1656.97,1585.73
1023,1470.65,1590.92,1636.63,1651.64,1585.30


#### `Range`

In [13]:
# Trailing mean of the daily Range over each configured window (counted in
# rows); partial windows at the start are averaged over the rows available.
range_series = stock_df['Range']
for n_rows in cnst.ROLLING_WINDOWS:
    trailing_mean = range_series.rolling(n_rows, min_periods = 1).mean()
    stock_df[f'Range {n_rows}MA'] = trailing_mean.round(2)

stock_df.filter(regex = "Range.*")

Unnamed: 0,Range,Range 7MA,Range 15MA,Range 30MA,Range 60MA
0,9.40,9.40,9.40,9.40,9.40
1,9.00,9.20,9.20,9.20,9.20
2,21.40,13.27,13.27,13.27,13.27
3,25.80,16.40,16.40,16.40,16.40
4,19.20,16.96,16.96,16.96,16.96
...,...,...,...,...,...
1020,25.55,25.30,24.76,22.80,20.58
1021,68.40,31.88,27.46,24.33,21.35
1022,34.95,32.44,28.49,24.77,21.46
1023,41.85,35.76,30.02,25.57,21.63


#### `VWAP`

In [14]:
# Trailing mean of VWAP over each configured window (counted in rows);
# partial windows at the start are averaged over the rows available.
vwap_series = stock_df['VWAP']
for n_rows in cnst.ROLLING_WINDOWS:
    trailing_mean = vwap_series.rolling(n_rows, min_periods = 1).mean()
    stock_df[f'VWAP {n_rows}MA'] = trailing_mean.round(2)

stock_df.filter(regex = "VWAP.*")

Unnamed: 0,VWAP,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA
0,1276.64,1276.64,1276.64,1276.64,1276.64
1,1284.56,1280.60,1280.60,1280.60,1280.60
2,1270.48,1277.23,1277.23,1277.23,1277.23
3,1247.24,1269.73,1269.73,1269.73,1269.73
4,1261.48,1268.08,1268.08,1268.08,1268.08
...,...,...,...,...,...
1020,1672.44,1660.48,1678.64,1664.72,1585.57
1021,1565.65,1645.31,1669.96,1662.71,1586.34
1022,1495.03,1621.62,1655.54,1658.30,1585.93
1023,1483.88,1597.43,1640.64,1653.48,1585.56


#### `Volume`

In [15]:
# Trailing mean of Volume over each configured window (counted in rows).
# The integer cast drops the fractional part of the mean.
volume_series = stock_df['Volume']
for n_rows in cnst.ROLLING_WINDOWS:
    trailing_mean = volume_series.rolling(n_rows, min_periods = 1).mean()
    stock_df[f'Volume {n_rows}MA'] = trailing_mean.astype(int)

stock_df.filter(regex = "Volume.*")

Unnamed: 0,Volume,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA
0,1836849,1836849,1836849,1836849,1836849
1,3068583,2452716,2452716,2452716,2452716
2,5427775,3444402,3444402,3444402,3444402
3,5445093,3944575,3944575,3944575,3944575
4,7362247,4628109,4628109,4628109,4628109
...,...,...,...,...,...
1020,12661250,10797971,12592121,16089685,15332293
1021,85072618,21665597,17363326,18054925,16574386
1022,80535465,31361600,21263141,20095890,17730176
1023,54800269,38039080,24082779,21581725,18407581


#### `Value`

In [16]:
# Trailing mean of the traded Value over each configured window (counted in
# rows); partial windows at the start are averaged over the rows available.
#
# The averages are on the order of 1e10, which does not fit in 32 bits.
# A bare `int` resolves to a 32-bit integer on Windows with NumPy < 2, so the
# width is pinned to int64 to keep the cast correct on every platform.
# As before, the cast drops the fractional part of the mean.
for window in cnst.ROLLING_WINDOWS:
    stock_df[f'Value {window}MA'] = (
        stock_df['Value']
        .rolling(window = window, min_periods = 1)
        .mean()
        .astype('int64')
    )

stock_df.filter(regex = "Value.*")

Unnamed: 0,Value,Value 7MA,Value 15MA,Value 30MA,Value 60MA
0,2.345001e+09,2345000988,2345000988,2345000988,2345000988
1,3.941792e+09,3143396262,3143396262,3143396262,3143396262
2,6.895886e+09,4394226092,4394226092,4394226092,4394226092
3,6.791348e+09,4993506527,4993506527,4993506527,4993506527
4,9.287302e+09,5852265530,5852265530,5852265530,5852265530
...,...,...,...,...,...
1020,2.117514e+10,17939626497,21173698410,26756826044,24372218817
1021,1.331936e+11,34817957164,28526480964,29781095464,26325036311
1022,1.204027e+11,49013774376,34039123030,32747198969,28048301982
1023,8.131686e+10,58727408569,38036670502,34902710598,29048300599


#### `#Trades`

In [17]:
# Trailing mean of the trade count over each configured window (counted in
# rows). The integer cast drops the fractional part of the mean.
trades_series = stock_df['#Trades']
for n_rows in cnst.ROLLING_WINDOWS:
    trailing_mean = trades_series.rolling(n_rows, min_periods = 1).mean()
    stock_df[f'#Trades {n_rows}MA'] = trailing_mean.astype(int)

stock_df.filter(regex = "#Trades.*")

Unnamed: 0,#Trades,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA
0,46625,46625,46625,46625,46625
1,104570,75597,75597,75597,75597
2,157066,102753,102753,102753,102753
3,155007,115817,115817,115817,115817
4,189026,130458,130458,130458,130458
...,...,...,...,...,...
1020,347404,297593,324871,336519,323849
1021,2098772,544785,443365,393241,354372
1022,1582497,732341,520660,434624,376597
1023,1275220,889730,588066,469801,391650


### Target columns

In [18]:
# Each target is the Close found `horizon` rows further down the frame, so the
# horizon is counted in rows rather than calendar days. The last `horizon`
# rows have no future value and stay NaN.
close_series = stock_df['Close']
for horizon in cnst.TARGET_WINDOWS:
    stock_df[f'Target {horizon}D'] = close_series.shift(periods = -horizon)

stock_df.filter(regex = "Target.*")

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
0,1240.95,1282.70,1240.85,1240.60
1,1260.60,1286.00,1244.85,1249.00
2,1257.30,1289.50,1244.55,1241.40
3,1271.40,1284.25,1213.20,1219.35
4,1282.70,1287.65,1223.20,1217.15
...,...,...,...,...
1020,1470.65,,,
1021,1478.85,,,
1022,,,,
1023,,,,


### Saving processed data

In [19]:
# Persist the engineered (not yet standardized) frame; the index is not written.
processed_path = cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-processed.parquet'
stock_df.to_parquet(processed_path, index = False)
stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,2020-01-01,1276.10,1280.00,1270.60,1279.00,1278.60,1276.64,2503.3,1084.00,1836849,2.345001e+09,46625,9.40,1,0,0,1,1,2020,2,1,1,1,1278.60,1278.60,1278.60,1278.60,9.40,9.40,9.40,9.40,1276.64,1276.64,1276.64,1276.64,1836849,1836849,1836849,1836849,2345000988,2345000988,2345000988,2345000988,46625,46625,46625,46625,1240.95,1282.70,1240.85,1240.60
1,2020-01-02,1279.00,1288.00,1279.00,1286.00,1286.75,1284.56,2503.3,1084.00,3068583,3.941792e+09,104570,9.00,1,0,0,2,1,2020,3,2,1,1,1282.68,1282.68,1282.68,1282.68,9.20,9.20,9.20,9.20,1280.60,1280.60,1280.60,1280.60,2452716,2452716,2452716,2452716,3143396262,3143396262,3143396262,3143396262,75597,75597,75597,75597,1260.60,1286.00,1244.85,1249.00
2,2020-01-03,1282.20,1285.00,1263.60,1268.50,1268.40,1270.48,2503.3,1084.00,5427775,6.895886e+09,157066,21.40,0,0,0,3,1,2020,4,3,1,1,1277.92,1277.92,1277.92,1277.92,13.27,13.27,13.27,13.27,1277.23,1277.23,1277.23,1277.23,3444402,3444402,3444402,3444402,4394226092,4394226092,4394226092,4394226092,102753,102753,102753,102753,1257.30,1289.50,1244.55,1241.40
3,2020-01-06,1260.00,1261.80,1236.00,1240.25,1240.95,1247.24,2503.3,1084.00,5445093,6.791348e+09,155007,25.80,0,0,0,6,1,2020,0,6,1,3,1268.68,1268.68,1268.68,1268.68,16.40,16.40,16.40,16.40,1269.73,1269.73,1269.73,1269.73,3944575,3944575,3944575,3944575,4993506527,4993506527,4993506527,4993506527,115817,115817,115817,115817,1271.40,1284.25,1213.20,1219.35
4,2020-01-07,1258.90,1271.45,1252.25,1261.00,1260.60,1261.48,2503.3,1084.00,7362247,9.287302e+09,189026,19.20,1,0,0,7,1,2020,1,7,1,1,1267.06,1267.06,1267.06,1267.06,16.96,16.96,16.96,16.96,1268.08,1268.08,1268.08,1268.08,4628109,4628109,4628109,4628109,5852265530,5852265530,5852265530,5852265530,130458,130458,130458,130458,1282.70,1287.65,1223.20,1217.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2024-01-16,1673.00,1683.65,1658.10,1678.00,1679.15,1672.44,1757.5,1460.25,12661250,2.117514e+10,347404,25.55,1,0,0,16,1,2024,1,16,1,1,1658.86,1678.20,1664.56,1585.82,25.30,24.76,22.80,20.58,1660.48,1678.64,1664.72,1585.57,10797971,12592121,16089685,15332293,17939626497,21173698410,26756826044,24372218817,297593,324871,336519,323849,1470.65,,,
1021,2024-01-17,1570.00,1596.80,1528.40,1542.15,1537.50,1565.65,1757.5,1460.25,85072618,1.331936e+11,2098772,68.40,0,0,0,17,1,2024,2,17,1,1,1640.87,1667.15,1661.69,1586.06,31.88,27.46,24.33,21.35,1645.31,1669.96,1662.71,1586.34,21665597,17363326,18054925,16574386,34817957164,28526480964,29781095464,26325036311,544785,443365,393241,354372,1478.85,,,
1022,2024-01-18,1494.00,1515.00,1480.05,1490.00,1486.15,1495.03,1757.5,1460.25,80535465,1.204027e+11,1582497,34.95,0,0,0,18,1,2024,3,18,1,1,1617.39,1652.54,1656.97,1585.73,32.44,28.49,24.77,21.46,1621.62,1655.54,1658.30,1585.93,31361600,21263141,20095890,17730176,49013774376,34039123030,32747198969,28048301982,732341,520660,434624,376597,,,,
1023,2024-01-19,1505.95,1510.25,1468.40,1474.90,1470.65,1483.88,1757.5,1460.25,54800269,8.131686e+10,1275220,41.85,0,0,0,19,1,2024,4,19,1,1,1590.92,1636.63,1651.64,1585.30,35.76,30.02,25.57,21.63,1597.43,1640.64,1653.48,1585.56,38039080,24082779,21581725,18407581,58727408569,38036670502,34902710598,29048300599,889730,588066,469801,391650,,,,


## Data standardization

### Stock price based columns

In [20]:
# Columns expressed in price units: the raw price fields first, then every
# column whose name matches one of the price-derived families.
stock_price_cols = ['Open', 'High', 'Low', 'LTP', '52W H', '52W L']
for family_pattern in ("Close.*", "Range.*", "VWAP.*", "Target.*"):
    stock_price_cols += stock_df.filter(regex = family_pattern).columns.to_list()

stock_price_cols

['Open',
 'High',
 'Low',
 'LTP',
 '52W H',
 '52W L',
 'Close',
 'Close 7MA',
 'Close 15MA',
 'Close 30MA',
 'Close 60MA',
 'Range',
 'Range 7MA',
 'Range 15MA',
 'Range 30MA',
 'Range 60MA',
 'VWAP',
 'VWAP 7MA',
 'VWAP 15MA',
 'VWAP 30MA',
 'VWAP 60MA',
 'Target 3D',
 'Target 7D',
 'Target 15D',
 'Target 30D']

In [21]:
# Express every price-like column as a ratio to the same row's Close, rounded
# to three decimals. Close is itself in the list, so it becomes 1.0.
close_base = stock_df['Close']
price_ratios = stock_df[stock_price_cols].divide(close_base, axis = 'index')
stock_df[stock_price_cols] = price_ratios.round(3)
stock_df[stock_price_cols]

Unnamed: 0,Open,High,Low,LTP,52W H,52W L,Close,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,1.958,0.848,1.0,1.000,1.000,1.000,1.000,0.007,0.007,0.007,0.007,0.007,0.998,0.998,0.998,0.998,0.998,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,1.945,0.842,1.0,0.997,0.997,0.997,0.997,0.007,0.007,0.007,0.007,0.007,0.998,0.995,0.995,0.995,0.995,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.974,0.855,1.0,1.008,1.008,1.008,1.008,0.017,0.010,0.010,0.010,0.010,1.002,1.007,1.007,1.007,1.007,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,2.017,0.874,1.0,1.022,1.022,1.022,1.022,0.021,0.013,0.013,0.013,0.013,1.005,1.023,1.023,1.023,1.023,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.986,0.860,1.0,1.005,1.005,1.005,1.005,0.015,0.013,0.013,0.013,0.013,1.001,1.006,1.006,1.006,1.006,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,1.047,0.870,1.0,0.988,0.999,0.991,0.944,0.015,0.015,0.015,0.014,0.012,0.996,0.989,1.000,0.991,0.944,0.876,,,
1021,1.021,1.039,0.994,1.003,1.143,0.950,1.0,1.067,1.084,1.081,1.032,0.044,0.021,0.018,0.016,0.014,1.018,1.070,1.086,1.081,1.032,0.962,,,
1022,1.005,1.019,0.996,1.003,1.183,0.983,1.0,1.088,1.112,1.115,1.067,0.024,0.022,0.019,0.017,0.014,1.006,1.091,1.114,1.116,1.067,,,,
1023,1.024,1.027,0.998,1.003,1.195,0.993,1.0,1.082,1.113,1.123,1.078,0.028,0.024,0.020,0.017,0.015,1.009,1.086,1.116,1.124,1.078,,,,


### Volume based columns

In [22]:
# Volume and its moving averages (every column whose name matches the pattern).
volume_cols = list(stock_df.filter(regex = "Volume.*"))

volume_cols

['Volume', 'Volume 7MA', 'Volume 15MA', 'Volume 30MA', 'Volume 60MA']

In [23]:
# Express each volume column as a ratio to the same row's Volume, rounded to
# three decimals. Volume itself becomes 1.0.
volume_base = stock_df['Volume']
volume_ratios = stock_df[volume_cols].divide(volume_base, axis = 'index')
stock_df[volume_cols] = volume_ratios.round(3)
stock_df[volume_cols]

Unnamed: 0,Volume,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.799,0.799,0.799,0.799
2,1.0,0.635,0.635,0.635,0.635
3,1.0,0.724,0.724,0.724,0.724
4,1.0,0.629,0.629,0.629,0.629
...,...,...,...,...,...
1020,1.0,0.853,0.995,1.271,1.211
1021,1.0,0.255,0.204,0.212,0.195
1022,1.0,0.389,0.264,0.250,0.220
1023,1.0,0.694,0.439,0.394,0.336


### Value based columns

In [24]:
# Value and its moving averages (every column whose name matches the pattern).
value_cols = list(stock_df.filter(regex = "Value.*"))

value_cols

['Value', 'Value 7MA', 'Value 15MA', 'Value 30MA', 'Value 60MA']

In [25]:
# Express each value column as a ratio to the same row's Value, rounded to
# three decimals. Value itself becomes 1.0.
value_base = stock_df['Value']
value_ratios = stock_df[value_cols].divide(value_base, axis = 'index')
stock_df[value_cols] = value_ratios.round(3)
stock_df[value_cols]

Unnamed: 0,Value,Value 7MA,Value 15MA,Value 30MA,Value 60MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.797,0.797,0.797,0.797
2,1.0,0.637,0.637,0.637,0.637
3,1.0,0.735,0.735,0.735,0.735
4,1.0,0.630,0.630,0.630,0.630
...,...,...,...,...,...
1020,1.0,0.847,1.000,1.264,1.151
1021,1.0,0.261,0.214,0.224,0.198
1022,1.0,0.407,0.283,0.272,0.233
1023,1.0,0.722,0.468,0.429,0.357


### Trade count based columns

In [26]:
# #Trades and its moving averages (every column whose name matches the pattern).
trade_count_cols = list(stock_df.filter(regex = "#Trades.*"))

trade_count_cols

['#Trades', '#Trades 7MA', '#Trades 15MA', '#Trades 30MA', '#Trades 60MA']

In [27]:
# Express each trade-count column as a ratio to the same row's #Trades,
# rounded to three decimals. #Trades itself becomes 1.0.
trades_base = stock_df['#Trades']
trade_ratios = stock_df[trade_count_cols].divide(trades_base, axis = 'index')
stock_df[trade_count_cols] = trade_ratios.round(3)
stock_df[trade_count_cols]

Unnamed: 0,#Trades,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.723,0.723,0.723,0.723
2,1.0,0.654,0.654,0.654,0.654
3,1.0,0.747,0.747,0.747,0.747
4,1.0,0.690,0.690,0.690,0.690
...,...,...,...,...,...
1020,1.0,0.857,0.935,0.969,0.932
1021,1.0,0.260,0.211,0.187,0.169
1022,1.0,0.463,0.329,0.275,0.238
1023,1.0,0.698,0.461,0.368,0.307


### Dropping unnecessary columns

In [28]:
# Remove the date and the base columns. After standardization Close, Volume,
# Value and #Trades are each divided by themselves, and Range is dropped in
# favour of its moving averages.
# Note: running this cell a second time raises KeyError because the columns
# are already gone.
dropped_cols = ['Date', 'Close', 'Range', 'Volume', 'Value', '#Trades']
stock_df = stock_df.drop(dropped_cols, axis = 'columns')
stock_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,0.998,1.958,0.848,1,0,0,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.007,0.007,0.007,0.007,0.998,0.998,0.998,0.998,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,0.998,1.945,0.842,1,0,0,2,1,2020,3,2,1,1,0.997,0.997,0.997,0.997,0.007,0.007,0.007,0.007,0.995,0.995,0.995,0.995,0.799,0.799,0.799,0.799,0.797,0.797,0.797,0.797,0.723,0.723,0.723,0.723,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.002,1.974,0.855,0,0,0,3,1,2020,4,3,1,1,1.008,1.008,1.008,1.008,0.010,0.010,0.010,0.010,1.007,1.007,1.007,1.007,0.635,0.635,0.635,0.635,0.637,0.637,0.637,0.637,0.654,0.654,0.654,0.654,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,1.005,2.017,0.874,0,0,0,6,1,2020,0,6,1,3,1.022,1.022,1.022,1.022,0.013,0.013,0.013,0.013,1.023,1.023,1.023,1.023,0.724,0.724,0.724,0.724,0.735,0.735,0.735,0.735,0.747,0.747,0.747,0.747,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.001,1.986,0.860,1,0,0,7,1,2020,1,7,1,1,1.005,1.005,1.005,1.005,0.013,0.013,0.013,0.013,1.006,1.006,1.006,1.006,0.629,0.629,0.629,0.629,0.630,0.630,0.630,0.630,0.690,0.690,0.690,0.690,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,0.996,1.047,0.870,1,0,0,16,1,2024,1,16,1,1,0.988,0.999,0.991,0.944,0.015,0.015,0.014,0.012,0.989,1.000,0.991,0.944,0.853,0.995,1.271,1.211,0.847,1.000,1.264,1.151,0.857,0.935,0.969,0.932,0.876,,,
1021,1.021,1.039,0.994,1.003,1.018,1.143,0.950,0,0,0,17,1,2024,2,17,1,1,1.067,1.084,1.081,1.032,0.021,0.018,0.016,0.014,1.070,1.086,1.081,1.032,0.255,0.204,0.212,0.195,0.261,0.214,0.224,0.198,0.260,0.211,0.187,0.169,0.962,,,
1022,1.005,1.019,0.996,1.003,1.006,1.183,0.983,0,0,0,18,1,2024,3,18,1,1,1.088,1.112,1.115,1.067,0.022,0.019,0.017,0.014,1.091,1.114,1.116,1.067,0.389,0.264,0.250,0.220,0.407,0.283,0.272,0.233,0.463,0.329,0.275,0.238,,,,
1023,1.024,1.027,0.998,1.003,1.009,1.195,0.993,0,0,0,19,1,2024,4,19,1,1,1.082,1.113,1.123,1.078,0.024,0.020,0.017,0.015,1.086,1.116,1.124,1.078,0.694,0.439,0.394,0.336,0.722,0.468,0.429,0.357,0.698,0.461,0.368,0.307,,,,


### Saving standardized data

In [29]:
# Persist the standardized frame; the index is not written.
standardized_path = cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-standardized.parquet'
stock_df.to_parquet(standardized_path, index = False)
stock_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,0.998,1.958,0.848,1,0,0,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.007,0.007,0.007,0.007,0.998,0.998,0.998,0.998,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,0.998,1.945,0.842,1,0,0,2,1,2020,3,2,1,1,0.997,0.997,0.997,0.997,0.007,0.007,0.007,0.007,0.995,0.995,0.995,0.995,0.799,0.799,0.799,0.799,0.797,0.797,0.797,0.797,0.723,0.723,0.723,0.723,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.002,1.974,0.855,0,0,0,3,1,2020,4,3,1,1,1.008,1.008,1.008,1.008,0.010,0.010,0.010,0.010,1.007,1.007,1.007,1.007,0.635,0.635,0.635,0.635,0.637,0.637,0.637,0.637,0.654,0.654,0.654,0.654,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,1.005,2.017,0.874,0,0,0,6,1,2020,0,6,1,3,1.022,1.022,1.022,1.022,0.013,0.013,0.013,0.013,1.023,1.023,1.023,1.023,0.724,0.724,0.724,0.724,0.735,0.735,0.735,0.735,0.747,0.747,0.747,0.747,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.001,1.986,0.860,1,0,0,7,1,2020,1,7,1,1,1.005,1.005,1.005,1.005,0.013,0.013,0.013,0.013,1.006,1.006,1.006,1.006,0.629,0.629,0.629,0.629,0.630,0.630,0.630,0.630,0.690,0.690,0.690,0.690,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,0.996,1.047,0.870,1,0,0,16,1,2024,1,16,1,1,0.988,0.999,0.991,0.944,0.015,0.015,0.014,0.012,0.989,1.000,0.991,0.944,0.853,0.995,1.271,1.211,0.847,1.000,1.264,1.151,0.857,0.935,0.969,0.932,0.876,,,
1021,1.021,1.039,0.994,1.003,1.018,1.143,0.950,0,0,0,17,1,2024,2,17,1,1,1.067,1.084,1.081,1.032,0.021,0.018,0.016,0.014,1.070,1.086,1.081,1.032,0.255,0.204,0.212,0.195,0.261,0.214,0.224,0.198,0.260,0.211,0.187,0.169,0.962,,,
1022,1.005,1.019,0.996,1.003,1.006,1.183,0.983,0,0,0,18,1,2024,3,18,1,1,1.088,1.112,1.115,1.067,0.022,0.019,0.017,0.014,1.091,1.114,1.116,1.067,0.389,0.264,0.250,0.220,0.407,0.283,0.272,0.233,0.463,0.329,0.275,0.238,,,,
1023,1.024,1.027,0.998,1.003,1.009,1.195,0.993,0,0,0,19,1,2024,4,19,1,1,1.082,1.113,1.123,1.078,0.024,0.020,0.017,0.015,1.086,1.116,1.124,1.078,0.694,0.439,0.394,0.336,0.722,0.468,0.429,0.357,0.698,0.461,0.368,0.307,,,,
