# 02 - Data Processing

In [1]:
import sys
# Display the path of the Python interpreter backing this kernel,
# so the reader can confirm which environment the notebook ran in.
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import numpy as np
import pandas as pd

import constants as cnst
import stock_utils as su

# Remove the column cap so wide frames are displayed with every column visible.
pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Ask the project helper for the symbols available under the NSE data directory.
stock_symbols = su.get_all_stock_symbols(cnst.NSE_DATA_DIR)
stock_symbols

['HDFCBANK', 'ITBEES']

In [4]:
# The rest of the notebook processes a single symbol: the first one in the list.
STOCK_SYMBOL = stock_symbols[0]
STOCK_SYMBOL

'HDFCBANK'

## Data loading

In [5]:
# Read the consolidated parquet for the selected symbol, discard the two
# unused columns, and relabel the remaining twelve columns by position.
# NOTE(review): the relabelling is positional, so it relies on the parquet's
# column order staying exactly as it is today.
consolidated_path = cnst.NSE_DATA_DIR.joinpath(STOCK_SYMBOL, "consolidated.parquet")
renamed_columns = [
    'Date',
    'Open', 'High', 'Low', 'LTP', 'Close',
    'VWAP', '52W H', '52W L',
    'Volume', 'Value', '#Trades'
]
stock_df = (
    pd.read_parquet(consolidated_path)
    .drop(columns = ['series', 'PREV. CLOSE'])
    .set_axis(renamed_columns, axis = 1)
)
stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades
0,2020-01-01,1276.10,1280.00,1270.60,1279.00,1278.60,1276.64,2503.3,1084.00,1836849,2.345001e+09,46625
1,2020-01-02,1279.00,1288.00,1279.00,1286.00,1286.75,1284.56,2503.3,1084.00,3068583,3.941792e+09,104570
2,2020-01-03,1282.20,1285.00,1263.60,1268.50,1268.40,1270.48,2503.3,1084.00,5427775,6.895886e+09,157066
3,2020-01-06,1260.00,1261.80,1236.00,1240.25,1240.95,1247.24,2503.3,1084.00,5445093,6.791348e+09,155007
4,2020-01-07,1258.90,1271.45,1252.25,1261.00,1260.60,1261.48,2503.3,1084.00,7362247,9.287302e+09,189026
...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2024-01-16,1673.00,1683.65,1658.10,1678.00,1679.15,1672.44,1757.5,1460.25,12661250,2.117514e+10,347404
1021,2024-01-17,1570.00,1596.80,1528.40,1542.15,1537.50,1565.65,1757.5,1460.25,85072618,1.331936e+11,2098772
1022,2024-01-18,1494.00,1515.00,1480.05,1490.00,1486.15,1495.03,1757.5,1460.25,80535465,1.204027e+11,1582497
1023,2024-01-19,1505.95,1510.25,1468.40,1474.90,1470.65,1483.88,1757.5,1460.25,54800269,8.131686e+10,1275220


In [6]:
# Summarise dtypes and null counts; memory_usage="deep" requests an exact memory figure.
stock_df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     1025 non-null   datetime64[ns]
 1   Open     1025 non-null   float64       
 2   High     1025 non-null   float64       
 3   Low      1025 non-null   float64       
 4   LTP      1025 non-null   float64       
 5   Close    1025 non-null   float64       
 6   VWAP     1025 non-null   float64       
 7   52W H    1025 non-null   float64       
 8   52W L    1025 non-null   float64       
 9   Volume   1025 non-null   int64         
 10  Value    1025 non-null   float64       
 11  #Trades  1025 non-null   int64         
dtypes: datetime64[ns](1), float64(9), int64(2)
memory usage: 96.2 KB


## Feature engineering

### Daily candle based columns

In [7]:
# Candle colour flag: 1 when the session closed at or above its open, otherwise 0.
stock_df['IsGreen'] = (stock_df['Close'] >= stock_df['Open']).astype(int)

# Flag sessions whose Low / High matches the `52W L` / `52W H` column.
# The comparison is purely relative (atol = 0): values count as equal when they
# differ by at most 0.01% (rtol = 1e-4) of the 52-week figure.
stock_df['Is52WLow'] = np.isclose(stock_df['Low'], stock_df['52W L'], atol = 0, rtol = 1e-4).astype(int)
stock_df['Is52WHigh'] = np.isclose(stock_df['High'], stock_df['52W H'], atol = 0, rtol = 1e-4).astype(int)

# Preview a random handful of rows. The seed is fixed so that the displayed
# sample is the same on every Restart & Run All.
stock_df.sample(10, random_state = 42)

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,IsGreen,Is52WLow,Is52WHigh
536,2022-02-02,1511.95,1535.0,1505.6,1535.0,1531.2,1526.26,1725.0,1353.0,6984645,10660390000.0,182857,1,0,0
774,2023-01-17,1589.8,1611.0,1577.5,1607.45,1608.9,1602.4,1722.1,1271.6,5393177,8642052000.0,136682,1,0,0
593,2022-04-28,1372.0,1379.0,1362.1,1372.5,1371.35,1369.11,1725.0,1292.0,10386861,14220710000.0,182049,0,0,0
605,2022-05-17,1312.6,1317.0,1298.2,1316.0,1314.0,1309.07,1725.0,1285.3,8320722,10892430000.0,204025,1,0,0
190,2020-09-11,1096.45,1104.85,1071.0,1080.5,1078.65,1083.3,2288.8,738.75,10397545,11263630000.0,201271,0,0,0
639,2022-07-04,1353.7,1360.2,1342.25,1356.55,1355.65,1352.23,1725.0,1271.6,4243740,5738509000.0,122056,1,0,0
939,2023-09-18,1653.75,1655.0,1626.1,1629.65,1629.05,1636.5,1757.5,1365.0,18528144,30321220000.0,292658,0,0,0
242,2020-11-25,1451.3,1464.4,1397.0,1406.5,1402.8,1431.81,1464.4,738.75,12076301,17290970000.0,310836,0,0,1
140,2020-07-03,1093.0,1095.8,1070.0,1075.5,1073.95,1079.3,2503.3,738.75,13798879,14893160000.0,227183,0,0,0
1005,2023-12-26,1673.25,1685.95,1668.55,1684.1,1682.45,1678.46,1757.5,1460.25,9022928,15144580000.0,243596,1,0,0


In [8]:
# Percentage of sessions for which each binary flag is set.
flag_cols = ['IsGreen', 'Is52WLow', 'Is52WHigh']
flag_counts = stock_df[flag_cols].sum()
flag_share_pct = flag_counts * 100 / len(stock_df)
flag_share_pct.round(2)

IsGreen      49.07
Is52WLow      1.46
Is52WHigh     2.54
dtype: float64

In [9]:
# (rows, columns) of the frame after the candle-based flags were added.
stock_df.shape

(1025, 15)

### Date based columns

In [10]:
# Calendar features derived from the session date.
# `weekday` follows the pandas convention: Monday = 0 ... Sunday = 6.
stock_df['Day'] = stock_df['Date'].dt.day
stock_df['Month'] = stock_df['Date'].dt.month
stock_df['Year'] = stock_df['Date'].dt.year
stock_df['Weekday'] = stock_df['Date'].dt.weekday
stock_df['DayOfYear'] = stock_df['Date'].dt.day_of_year
stock_df['Quarter'] = stock_df['Date'].dt.quarter

# Calendar-day gap between each row and the row before it. The first row has no
# predecessor, so its gap is filled with 1.
# NOTE(review): assumes rows are sorted by Date ascending - confirm upstream.
stock_df['DaysSinceLastTradingSession'] = (stock_df['Date'] - stock_df['Date'].shift(1)).dt.days.fillna(1).astype(int)

# Preview a random handful of rows. The seed is fixed so that the displayed
# sample is the same on every Restart & Run All.
stock_df.sample(10, random_state = 42)

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession
270,2021-01-06,1435.0,1440.0,1413.1,1419.8,1420.55,1423.55,1464.4,738.75,11067025,15754510000.0,151854,0,0,0,6,1,2021,2,6,1,1
344,2021-04-28,1436.25,1479.0,1431.0,1475.0,1476.8,1463.19,1641.0,826.1,12051970,17634380000.0,197146,1,0,0,28,4,2021,2,118,2,1
417,2021-08-11,1514.9,1518.85,1491.05,1494.5,1494.95,1501.47,1641.0,1020.2,6292893,9448574000.0,236089,0,0,0,11,8,2021,2,223,3,1
440,2021-09-15,1535.0,1554.8,1535.0,1546.75,1546.8,1543.76,1641.0,1025.0,7411499,11441550000.0,196864,1,0,0,15,9,2021,2,258,3,1
606,2022-05-18,1324.05,1333.3,1310.0,1315.85,1313.9,1321.44,1725.0,1285.3,7866192,10394670000.0,206774,0,0,0,18,5,2022,2,138,2,1
123,2020-06-10,990.0,999.85,978.2,993.0,991.85,989.98,2503.3,738.75,18248880,18066100000.0,263364,1,0,0,10,6,2020,2,162,2,1
496,2021-12-07,1513.95,1532.0,1509.9,1524.65,1525.7,1523.82,1725.0,1342.0,6213767,9468635000.0,148533,1,0,0,7,12,2021,1,341,4,1
764,2023-01-03,1622.2,1643.0,1622.2,1640.5,1639.35,1634.63,1722.1,1271.6,4189015,6847496000.0,102047,1,0,0,3,1,2023,1,3,1,1
26,2020-02-05,1234.9,1248.0,1227.3,1246.0,1244.65,1237.42,2503.3,1084.0,9010341,11149560000.0,150882,1,0,0,5,2,2020,2,36,1,1
940,2023-09-20,1599.0,1599.0,1560.4,1566.0,1563.7,1570.63,1757.5,1365.0,57737324,90684080000.0,977717,0,0,0,20,9,2023,2,263,3,2


In [11]:
# (rows, columns) of the frame after the date-based columns were added.
stock_df.shape

(1025, 22)

### Moving averages

#### `Close`

In [12]:
# Trailing moving averages of Close, one column per configured window length,
# rounded to 2 decimals. With min_periods = 1 the earliest rows average over
# however many sessions exist so far instead of producing NaN.
close_prices = stock_df['Close']
for span in cnst.ROLLING_WINDOWS:
    rolling_close = close_prices.rolling(window = span, min_periods = 1)
    stock_df[f'Close {span}MA'] = rolling_close.mean().round(2)

stock_df.filter(regex = "Close.*")

Unnamed: 0,Close,Close 3MA,Close 7MA,Close 15MA,Close 30MA
0,1278.60,1278.60,1278.60,1278.60,1278.60
1,1286.75,1282.68,1282.68,1282.68,1282.68
2,1268.40,1277.92,1277.92,1277.92,1277.92
3,1240.95,1265.37,1268.68,1268.68,1268.68
4,1260.60,1256.65,1267.06,1267.06,1267.06
...,...,...,...,...,...
1020,1679.15,1664.38,1658.86,1678.20,1664.56
1021,1537.50,1629.82,1640.87,1667.15,1661.69
1022,1486.15,1567.60,1617.39,1652.54,1656.97
1023,1470.65,1498.10,1590.92,1636.63,1651.64


#### `VWAP`

In [13]:
# Trailing moving averages of VWAP, one column per configured window length,
# rounded to 2 decimals. min_periods = 1 keeps the first rows defined by
# averaging over the sessions available so far.
vwap_prices = stock_df['VWAP']
for n_sessions in cnst.ROLLING_WINDOWS:
    stock_df[f'VWAP {n_sessions}MA'] = (
        vwap_prices
        .rolling(window = n_sessions, min_periods = 1)
        .mean()
        .round(2)
    )

stock_df.filter(regex = "VWAP.*")

Unnamed: 0,VWAP,VWAP 3MA,VWAP 7MA,VWAP 15MA,VWAP 30MA
0,1276.64,1276.64,1276.64,1276.64,1276.64
1,1284.56,1280.60,1280.60,1280.60,1280.60
2,1270.48,1277.23,1277.23,1277.23,1277.23
3,1247.24,1267.43,1269.73,1269.73,1269.73
4,1261.48,1259.73,1268.08,1268.08,1268.08
...,...,...,...,...,...
1020,1672.44,1661.38,1660.48,1678.64,1664.72
1021,1565.65,1634.90,1645.31,1669.96,1662.71
1022,1495.03,1577.71,1621.62,1655.54,1658.30
1023,1483.88,1514.85,1597.43,1640.64,1653.48


#### `Volume`

In [14]:
# Trailing moving averages of Volume, one column per configured window length.
# min_periods = 1 keeps the first rows defined by averaging over the sessions
# available so far. The fractional part of the mean is dropped by the integer cast.
# The dtype is spelled 'int64' instead of `int` because `int` resolves to a
# 32-bit integer on Windows with NumPy < 2; 'int64' matches the source column
# on every platform.
for window in cnst.ROLLING_WINDOWS:
    stock_df[f'Volume {window}MA'] = (
        stock_df['Volume']
        .rolling(window = window, min_periods = 1)
        .mean()
        .astype('int64')
    )

stock_df.filter(regex = "Volume.*")

Unnamed: 0,Volume,Volume 3MA,Volume 7MA,Volume 15MA,Volume 30MA
0,1836849,1836849,1836849,1836849,1836849
1,3068583,2452716,2452716,2452716,2452716
2,5427775,3444402,3444402,3444402,3444402
3,5445093,4647150,3944575,3944575,3944575
4,7362247,6078371,4628109,4628109,4628109
...,...,...,...,...,...
1020,12661250,12236068,10797971,12592121,16089685
1021,85072618,37298015,21665597,17363326,18054925
1022,80535465,59423111,31361600,21263141,20095890
1023,54800269,73469450,38039080,24082779,21581725


#### `Value`

In [15]:
# Trailing moving averages of Value, one column per configured window length.
# min_periods = 1 keeps the first rows defined by averaging over the sessions
# available so far. The fractional part of the mean is dropped by the integer cast.
# The dtype is spelled 'int64' instead of `int`: traded values here are of the
# order of 1e9-1e11, which does not fit the 32-bit integer that `int` resolves
# to on Windows with NumPy < 2.
for window in cnst.ROLLING_WINDOWS:
    stock_df[f'Value {window}MA'] = (
        stock_df['Value']
        .rolling(window = window, min_periods = 1)
        .mean()
        .astype('int64')
    )

stock_df.filter(regex = "Value.*")

Unnamed: 0,Value,Value 3MA,Value 7MA,Value 15MA,Value 30MA
0,2.345001e+09,2345000988,2345000988,2345000988,2345000988
1,3.941792e+09,3143396262,3143396262,3143396262,3143396262
2,6.895886e+09,4394226092,4394226092,4394226092,4394226092
3,6.791348e+09,5876341707,4993506527,4993506527,4993506527
4,9.287302e+09,7658178376,5852265530,5852265530,5852265530
...,...,...,...,...,...
1020,2.117514e+10,20346476315,17939626497,21173698410,26756826044
1021,1.331936e+11,59322762592,34817957164,28526480964,29781095464
1022,1.204027e+11,91590480296,49013774376,34039123030,32747198969
1023,8.131686e+10,111637719963,58727408569,38036670502,34902710598


#### `#Trades`

In [16]:
# Trailing moving averages of #Trades, one column per configured window length.
# min_periods = 1 keeps the first rows defined by averaging over the sessions
# available so far. The fractional part of the mean is dropped by the integer cast.
# The dtype is spelled 'int64' instead of `int` because `int` resolves to a
# 32-bit integer on Windows with NumPy < 2; 'int64' matches the source column
# on every platform.
for window in cnst.ROLLING_WINDOWS:
    stock_df[f'#Trades {window}MA'] = (
        stock_df['#Trades']
        .rolling(window = window, min_periods = 1)
        .mean()
        .astype('int64')
    )

stock_df.filter(regex = "#Trades.*")

Unnamed: 0,#Trades,#Trades 3MA,#Trades 7MA,#Trades 15MA,#Trades 30MA
0,46625,46625,46625,46625,46625
1,104570,75597,75597,75597,75597
2,157066,102753,102753,102753,102753
3,155007,138881,115817,115817,115817
4,189026,167033,130458,130458,130458
...,...,...,...,...,...
1020,347404,346951,297593,324871,336519
1021,2098772,953645,544785,443365,393241
1022,1582497,1342891,732341,520660,434624
1023,1275220,1652163,889730,588066,469801


### Target columns

In [17]:
# Prediction targets: for each configured horizon, the Close value found that
# many rows further down the frame. The last `horizon` rows have no future row
# to read from and therefore hold NaN.
future_source = stock_df['Close']
for horizon in cnst.TARGET_WINDOWS:
    target_name = f'Target {horizon}D'
    stock_df[target_name] = future_source.shift(-horizon)

stock_df.filter(regex = "Target.*")

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
0,1240.95,1282.70,1240.85,1240.60
1,1260.60,1286.00,1244.85,1249.00
2,1257.30,1289.50,1244.55,1241.40
3,1271.40,1284.25,1213.20,1219.35
4,1282.70,1287.65,1223.20,1217.15
...,...,...,...,...
1020,1470.65,,,
1021,1478.85,,,
1022,,,,
1023,,,,


### Saving processed data

In [18]:
# Persist the feature-engineered (not yet normalised) frame, without the index,
# then display it.
processed_path = cnst.PROCESSED_DATA_DIR.joinpath(f'{STOCK_SYMBOL}-processed.parquet')
stock_df.to_parquet(processed_path, index = False)
stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 3MA,Close 7MA,Close 15MA,Close 30MA,VWAP 3MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,Volume 3MA,Volume 7MA,Volume 15MA,Volume 30MA,Value 3MA,Value 7MA,Value 15MA,Value 30MA,#Trades 3MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,Target 3D,Target 7D,Target 15D,Target 30D
0,2020-01-01,1276.10,1280.00,1270.60,1279.00,1278.60,1276.64,2503.3,1084.00,1836849,2.345001e+09,46625,1,0,0,1,1,2020,2,1,1,1,1278.60,1278.60,1278.60,1278.60,1276.64,1276.64,1276.64,1276.64,1836849,1836849,1836849,1836849,2345000988,2345000988,2345000988,2345000988,46625,46625,46625,46625,1240.95,1282.70,1240.85,1240.60
1,2020-01-02,1279.00,1288.00,1279.00,1286.00,1286.75,1284.56,2503.3,1084.00,3068583,3.941792e+09,104570,1,0,0,2,1,2020,3,2,1,1,1282.68,1282.68,1282.68,1282.68,1280.60,1280.60,1280.60,1280.60,2452716,2452716,2452716,2452716,3143396262,3143396262,3143396262,3143396262,75597,75597,75597,75597,1260.60,1286.00,1244.85,1249.00
2,2020-01-03,1282.20,1285.00,1263.60,1268.50,1268.40,1270.48,2503.3,1084.00,5427775,6.895886e+09,157066,0,0,0,3,1,2020,4,3,1,1,1277.92,1277.92,1277.92,1277.92,1277.23,1277.23,1277.23,1277.23,3444402,3444402,3444402,3444402,4394226092,4394226092,4394226092,4394226092,102753,102753,102753,102753,1257.30,1289.50,1244.55,1241.40
3,2020-01-06,1260.00,1261.80,1236.00,1240.25,1240.95,1247.24,2503.3,1084.00,5445093,6.791348e+09,155007,0,0,0,6,1,2020,0,6,1,3,1265.37,1268.68,1268.68,1268.68,1267.43,1269.73,1269.73,1269.73,4647150,3944575,3944575,3944575,5876341707,4993506527,4993506527,4993506527,138881,115817,115817,115817,1271.40,1284.25,1213.20,1219.35
4,2020-01-07,1258.90,1271.45,1252.25,1261.00,1260.60,1261.48,2503.3,1084.00,7362247,9.287302e+09,189026,1,0,0,7,1,2020,1,7,1,1,1256.65,1267.06,1267.06,1267.06,1259.73,1268.08,1268.08,1268.08,6078371,4628109,4628109,4628109,7658178376,5852265530,5852265530,5852265530,167033,130458,130458,130458,1282.70,1287.65,1223.20,1217.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2024-01-16,1673.00,1683.65,1658.10,1678.00,1679.15,1672.44,1757.5,1460.25,12661250,2.117514e+10,347404,1,0,0,16,1,2024,1,16,1,1,1664.38,1658.86,1678.20,1664.56,1661.38,1660.48,1678.64,1664.72,12236068,10797971,12592121,16089685,20346476315,17939626497,21173698410,26756826044,346951,297593,324871,336519,1470.65,,,
1021,2024-01-17,1570.00,1596.80,1528.40,1542.15,1537.50,1565.65,1757.5,1460.25,85072618,1.331936e+11,2098772,0,0,0,17,1,2024,2,17,1,1,1629.82,1640.87,1667.15,1661.69,1634.90,1645.31,1669.96,1662.71,37298015,21665597,17363326,18054925,59322762592,34817957164,28526480964,29781095464,953645,544785,443365,393241,1478.85,,,
1022,2024-01-18,1494.00,1515.00,1480.05,1490.00,1486.15,1495.03,1757.5,1460.25,80535465,1.204027e+11,1582497,0,0,0,18,1,2024,3,18,1,1,1567.60,1617.39,1652.54,1656.97,1577.71,1621.62,1655.54,1658.30,59423111,31361600,21263141,20095890,91590480296,49013774376,34039123030,32747198969,1342891,732341,520660,434624,,,,
1023,2024-01-19,1505.95,1510.25,1468.40,1474.90,1470.65,1483.88,1757.5,1460.25,54800269,8.131686e+10,1275220,0,0,0,19,1,2024,4,19,1,1,1498.10,1590.92,1636.63,1651.64,1514.85,1597.43,1640.64,1653.48,73469450,38039080,24082779,21581725,111637719963,58727408569,38036670502,34902710598,1652163,889730,588066,469801,,,,


## Data standardization

### Stock price based columns

In [19]:
# Every column expressed in price units: the raw OHLC-style columns plus all
# columns whose name matches the Close / VWAP / Target patterns (this includes
# 'Close' and 'VWAP' themselves and their moving averages).
raw_price_cols = ['Open', 'High', 'Low', 'LTP', '52W H', '52W L']
derived_price_cols = [
    col
    for pattern in ("Close.*", "VWAP.*", "Target.*")
    for col in stock_df.filter(regex = pattern).columns
]
stock_price_cols = raw_price_cols + derived_price_cols

stock_price_cols

['Open',
 'High',
 'Low',
 'LTP',
 '52W H',
 '52W L',
 'Close',
 'Close 3MA',
 'Close 7MA',
 'Close 15MA',
 'Close 30MA',
 'VWAP',
 'VWAP 3MA',
 'VWAP 7MA',
 'VWAP 15MA',
 'VWAP 30MA',
 'Target 3D',
 'Target 7D',
 'Target 15D',
 'Target 30D']

In [20]:
# Express every price column as a ratio to the same row's Close (3 decimals).
# 'Close' is itself in the list, so it becomes 1.0 on every row.
close_reference = stock_df['Close']
price_ratios = stock_df[stock_price_cols].div(close_reference, axis = 0)
stock_df[stock_price_cols] = price_ratios.round(3)
stock_df[stock_price_cols]

Unnamed: 0,Open,High,Low,LTP,52W H,52W L,Close,Close 3MA,Close 7MA,Close 15MA,Close 30MA,VWAP,VWAP 3MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,1.958,0.848,1.0,1.000,1.000,1.000,1.000,0.998,0.998,0.998,0.998,0.998,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,1.945,0.842,1.0,0.997,0.997,0.997,0.997,0.998,0.995,0.995,0.995,0.995,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.974,0.855,1.0,1.008,1.008,1.008,1.008,1.002,1.007,1.007,1.007,1.007,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,2.017,0.874,1.0,1.020,1.022,1.022,1.022,1.005,1.021,1.023,1.023,1.023,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.986,0.860,1.0,0.997,1.005,1.005,1.005,1.001,0.999,1.006,1.006,1.006,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,1.047,0.870,1.0,0.991,0.988,0.999,0.991,0.996,0.989,0.989,1.000,0.991,0.876,,,
1021,1.021,1.039,0.994,1.003,1.143,0.950,1.0,1.060,1.067,1.084,1.081,1.018,1.063,1.070,1.086,1.081,0.962,,,
1022,1.005,1.019,0.996,1.003,1.183,0.983,1.0,1.055,1.088,1.112,1.115,1.006,1.062,1.091,1.114,1.116,,,,
1023,1.024,1.027,0.998,1.003,1.195,0.993,1.0,1.019,1.082,1.113,1.123,1.009,1.030,1.086,1.116,1.124,,,,


### Volume based columns

In [21]:
# Names of the raw Volume column and its moving averages.
volume_cols = list(stock_df.filter(regex = "Volume.*").columns)
volume_cols

['Volume', 'Volume 3MA', 'Volume 7MA', 'Volume 15MA', 'Volume 30MA']

In [22]:
# Express each volume column as a ratio to the same row's Volume (3 decimals).
volume_reference = stock_df['Volume']
volume_ratios = stock_df[volume_cols].div(volume_reference, axis = 0)
stock_df[volume_cols] = volume_ratios.round(3)
stock_df[volume_cols]

Unnamed: 0,Volume,Volume 3MA,Volume 7MA,Volume 15MA,Volume 30MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.799,0.799,0.799,0.799
2,1.0,0.635,0.635,0.635,0.635
3,1.0,0.853,0.724,0.724,0.724
4,1.0,0.826,0.629,0.629,0.629
...,...,...,...,...,...
1020,1.0,0.966,0.853,0.995,1.271
1021,1.0,0.438,0.255,0.204,0.212
1022,1.0,0.738,0.389,0.264,0.250
1023,1.0,1.341,0.694,0.439,0.394


### Value based columns

In [23]:
# Names of the raw Value column and its moving averages.
value_frame = stock_df.filter(regex = "Value.*")
value_cols = value_frame.columns.to_list()
value_cols

['Value', 'Value 3MA', 'Value 7MA', 'Value 15MA', 'Value 30MA']

In [24]:
# Express each value column as a ratio to the same row's Value (3 decimals).
value_reference = stock_df['Value']
value_ratios = stock_df[value_cols].div(value_reference, axis = 0)
stock_df[value_cols] = value_ratios.round(3)
stock_df[value_cols]

Unnamed: 0,Value,Value 3MA,Value 7MA,Value 15MA,Value 30MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.797,0.797,0.797,0.797
2,1.0,0.637,0.637,0.637,0.637
3,1.0,0.865,0.735,0.735,0.735
4,1.0,0.825,0.630,0.630,0.630
...,...,...,...,...,...
1020,1.0,0.961,0.847,1.000,1.264
1021,1.0,0.445,0.261,0.214,0.224
1022,1.0,0.761,0.407,0.283,0.272
1023,1.0,1.373,0.722,0.468,0.429


### Trade count based columns

In [25]:
# Names of the raw #Trades column and its moving averages.
trade_count_frame = stock_df.filter(regex = "#Trades.*")
trade_count_cols = trade_count_frame.columns.to_list()
trade_count_cols

['#Trades', '#Trades 3MA', '#Trades 7MA', '#Trades 15MA', '#Trades 30MA']

In [26]:
# Express each trade-count column as a ratio to the same row's #Trades (3 decimals).
trades_reference = stock_df['#Trades']
trade_count_ratios = stock_df[trade_count_cols].div(trades_reference, axis = 0)
stock_df[trade_count_cols] = trade_count_ratios.round(3)
stock_df[trade_count_cols]

Unnamed: 0,#Trades,#Trades 3MA,#Trades 7MA,#Trades 15MA,#Trades 30MA
0,1.0,1.000,1.000,1.000,1.000
1,1.0,0.723,0.723,0.723,0.723
2,1.0,0.654,0.654,0.654,0.654
3,1.0,0.896,0.747,0.747,0.747
4,1.0,0.884,0.690,0.690,0.690
...,...,...,...,...,...
1020,1.0,0.999,0.857,0.935,0.969
1021,1.0,0.454,0.260,0.211,0.187
1022,1.0,0.849,0.463,0.329,0.275
1023,1.0,1.296,0.698,0.461,0.368


### Dropping unnecessary columns

In [27]:
# Remove the Date column and the four reference columns, which are identically
# 1.0 after being divided by themselves in the normalisation steps.
# NOTE(review): re-running this cell on the already-reduced frame raises
# KeyError, because the columns are gone by then.
redundant_cols = ['Date', 'Close', 'Volume', 'Value', '#Trades']
stock_df = stock_df.drop(columns = redundant_cols)
stock_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 3MA,Close 7MA,Close 15MA,Close 30MA,VWAP 3MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,Volume 3MA,Volume 7MA,Volume 15MA,Volume 30MA,Value 3MA,Value 7MA,Value 15MA,Value 30MA,#Trades 3MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,0.998,1.958,0.848,1,0,0,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.998,0.998,0.998,0.998,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,0.998,1.945,0.842,1,0,0,2,1,2020,3,2,1,1,0.997,0.997,0.997,0.997,0.995,0.995,0.995,0.995,0.799,0.799,0.799,0.799,0.797,0.797,0.797,0.797,0.723,0.723,0.723,0.723,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.002,1.974,0.855,0,0,0,3,1,2020,4,3,1,1,1.008,1.008,1.008,1.008,1.007,1.007,1.007,1.007,0.635,0.635,0.635,0.635,0.637,0.637,0.637,0.637,0.654,0.654,0.654,0.654,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,1.005,2.017,0.874,0,0,0,6,1,2020,0,6,1,3,1.020,1.022,1.022,1.022,1.021,1.023,1.023,1.023,0.853,0.724,0.724,0.724,0.865,0.735,0.735,0.735,0.896,0.747,0.747,0.747,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.001,1.986,0.860,1,0,0,7,1,2020,1,7,1,1,0.997,1.005,1.005,1.005,0.999,1.006,1.006,1.006,0.826,0.629,0.629,0.629,0.825,0.630,0.630,0.630,0.884,0.690,0.690,0.690,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,0.996,1.047,0.870,1,0,0,16,1,2024,1,16,1,1,0.991,0.988,0.999,0.991,0.989,0.989,1.000,0.991,0.966,0.853,0.995,1.271,0.961,0.847,1.000,1.264,0.999,0.857,0.935,0.969,0.876,,,
1021,1.021,1.039,0.994,1.003,1.018,1.143,0.950,0,0,0,17,1,2024,2,17,1,1,1.060,1.067,1.084,1.081,1.063,1.070,1.086,1.081,0.438,0.255,0.204,0.212,0.445,0.261,0.214,0.224,0.454,0.260,0.211,0.187,0.962,,,
1022,1.005,1.019,0.996,1.003,1.006,1.183,0.983,0,0,0,18,1,2024,3,18,1,1,1.055,1.088,1.112,1.115,1.062,1.091,1.114,1.116,0.738,0.389,0.264,0.250,0.761,0.407,0.283,0.272,0.849,0.463,0.329,0.275,,,,
1023,1.024,1.027,0.998,1.003,1.009,1.195,0.993,0,0,0,19,1,2024,4,19,1,1,1.019,1.082,1.113,1.123,1.030,1.086,1.116,1.124,1.341,0.694,0.439,0.394,1.373,0.722,0.468,0.429,1.296,0.698,0.461,0.368,,,,


### Saving standardized data

In [28]:
# Persist the normalised frame, without the index, then display it.
standardized_path = cnst.PROCESSED_DATA_DIR.joinpath(f'{STOCK_SYMBOL}-standardized.parquet')
stock_df.to_parquet(standardized_path, index = False)
stock_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 3MA,Close 7MA,Close 15MA,Close 30MA,VWAP 3MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,Volume 3MA,Volume 7MA,Volume 15MA,Volume 30MA,Value 3MA,Value 7MA,Value 15MA,Value 30MA,#Trades 3MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,0.998,1.958,0.848,1,0,0,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.998,0.998,0.998,0.998,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,0.998,1.945,0.842,1,0,0,2,1,2020,3,2,1,1,0.997,0.997,0.997,0.997,0.995,0.995,0.995,0.995,0.799,0.799,0.799,0.799,0.797,0.797,0.797,0.797,0.723,0.723,0.723,0.723,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.002,1.974,0.855,0,0,0,3,1,2020,4,3,1,1,1.008,1.008,1.008,1.008,1.007,1.007,1.007,1.007,0.635,0.635,0.635,0.635,0.637,0.637,0.637,0.637,0.654,0.654,0.654,0.654,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,1.005,2.017,0.874,0,0,0,6,1,2020,0,6,1,3,1.020,1.022,1.022,1.022,1.021,1.023,1.023,1.023,0.853,0.724,0.724,0.724,0.865,0.735,0.735,0.735,0.896,0.747,0.747,0.747,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.001,1.986,0.860,1,0,0,7,1,2020,1,7,1,1,0.997,1.005,1.005,1.005,0.999,1.006,1.006,1.006,0.826,0.629,0.629,0.629,0.825,0.630,0.630,0.630,0.884,0.690,0.690,0.690,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,0.996,1.047,0.870,1,0,0,16,1,2024,1,16,1,1,0.991,0.988,0.999,0.991,0.989,0.989,1.000,0.991,0.966,0.853,0.995,1.271,0.961,0.847,1.000,1.264,0.999,0.857,0.935,0.969,0.876,,,
1021,1.021,1.039,0.994,1.003,1.018,1.143,0.950,0,0,0,17,1,2024,2,17,1,1,1.060,1.067,1.084,1.081,1.063,1.070,1.086,1.081,0.438,0.255,0.204,0.212,0.445,0.261,0.214,0.224,0.454,0.260,0.211,0.187,0.962,,,
1022,1.005,1.019,0.996,1.003,1.006,1.183,0.983,0,0,0,18,1,2024,3,18,1,1,1.055,1.088,1.112,1.115,1.062,1.091,1.114,1.116,0.738,0.389,0.264,0.250,0.761,0.407,0.283,0.272,0.849,0.463,0.329,0.275,,,,
1023,1.024,1.027,0.998,1.003,1.009,1.195,0.993,0,0,0,19,1,2024,4,19,1,1,1.019,1.082,1.113,1.123,1.030,1.086,1.116,1.124,1.341,0.694,0.439,0.394,1.373,0.722,0.468,0.429,1.296,0.698,0.461,0.368,,,,
