### Data Preparation Notes
- The data is synthetic: it was generated with a controlled dummy generator because of time constraints.
- The column structure strictly follows the assignment specification.
- The pipeline logic mirrors real-world intraday processing.


In [15]:
from pathlib import Path

import pandas as pd

# All three raw files share the same layout for the timestamp column, so the
# reader options are defined once and reused for each load.
read_opts = {"parse_dates": ["datetime"]}

spot = pd.read_csv("../data/raw/nifty_spot_5min.csv", **read_opts)
futures = pd.read_csv("../data/raw/nifty_futures_5min.csv", **read_opts)
options = pd.read_csv("../data/raw/nifty_options_5min.csv", **read_opts)

# Preview the first rows of each frame as the cell's displayed value.
(spot.head(), futures.head(), options.head())


(             datetime          open          high           low         close  \
 0 2024-01-01 09:15:00  20006.955207  20017.434778  20006.213371  20006.955207   
 1 2024-01-01 09:20:00  20005.018928  20010.293137  20001.966296  20005.018928   
 2 2024-01-01 09:25:00  20014.090899  20016.924344  20008.144188  20014.090899   
 3 2024-01-01 09:30:00  20035.439718  20038.655468  20022.815517  20035.439718   
 4 2024-01-01 09:35:00  20032.156031  20049.522601  20012.706096  20032.156031   
 
    volume  
 0   97616  
 1  126783  
 2  102965  
 3  130099  
 4  118317  ,
              datetime          open          high           low         close  \
 0 2024-01-01 09:15:00  20004.742293  20013.328265  19993.910334  20004.742293   
 1 2024-01-01 09:20:00  20012.663385  20019.639741  20012.403551  20012.663385   
 2 2024-01-01 09:25:00  20028.145999  20028.452187  20017.704173  20028.145999   
 3 2024-01-01 09:30:00  20039.056747  20057.824216  20025.347076  20039.056747   
 4 2024-01-01 09:

In [16]:
# Discard every row that contains at least one missing value, for each of the
# three raw frames in turn.
spot, futures, options = (
    frame.dropna() for frame in (spot, futures, options)
)


In [17]:
# merge_asof requires both inputs to be sorted on the join key, so order each
# frame by timestamp before joining.
spot_sorted = spot.sort_values("datetime")
futures_sorted = futures.sort_values("datetime")

# As-of join of futures bars onto spot bars by timestamp; columns present in
# both frames are disambiguated with the _spot / _fut suffixes.
df = pd.merge_asof(
    spot_sorted,
    futures_sorted,
    on="datetime",
    suffixes=("_spot", "_fut"),
)


In [18]:
# Persist the merged spot/futures bars. The target folder is created first so
# the write does not fail with OSError when ../data/merged/ does not exist yet
# (for example on a fresh checkout).
merged_path = Path("../data/merged/nifty_merged_5min.csv")
merged_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(merged_path, index=False)


In [19]:
# Both close series are read several times below; pull them out once.
spot_close = df["close_spot"]
fut_close = df["close_fut"]

# Exponentially weighted means of the spot close over a short and a longer span.
df["ema_5"] = spot_close.ewm(span=5).mean()
df["ema_15"] = spot_close.ewm(span=15).mean()

# Bar-over-bar fractional change of the spot close (first row has no
# predecessor, so it is NaN).
df["spot_return"] = spot_close.pct_change()

# Futures premium/discount expressed as a fraction of the spot close.
df["futures_basis"] = (fut_close - spot_close) / spot_close


In [20]:
# Put/call ratios per timestamp, computed from the `options` frame loaded from
# nifty_options_5min.csv. The frame must be referenced as `options`: no `opt`
# variable is defined in this notebook, so using that name raises NameError on
# a fresh kernel.
by_time_and_type = options.groupby(["datetime", "option_type"])

# Total open interest per timestamp, one column per option type.
pcr_oi = (
    by_time_and_type["open_interest"]
    .sum()
    .unstack()
    .rename(columns={"CE": "CE_oi", "PE": "PE_oi"})
)

# Total traded volume per timestamp, one column per option type.
pcr_vol = (
    by_time_and_type["volume"]
    .sum()
    .unstack()
    .rename(columns={"CE": "CE_vol", "PE": "PE_vol"})
)

# Left joins keep every bar of df even when no option rows exist for it; the
# option totals are NaN for such bars.
df = df.merge(pcr_oi, on="datetime", how="left")
df = df.merge(pcr_vol, on="datetime", how="left")

# Ratio of put-side to call-side totals.
# NOTE(review): a zero call-side total produces inf here (NaN for 0/0); inf is
# not removed by a later dropna() — confirm whether that case needs handling.
df["pcr_oi"] = df["PE_oi"] / df["CE_oi"]
df["pcr_volume"] = df["PE_vol"] / df["CE_vol"]


In [21]:
# Mean implied volatility across all option rows sharing a timestamp. The
# options frame is bound to `options`; no `opt` variable is defined in this
# notebook, so using that name raises NameError on a fresh kernel.
iv_avg = options.groupby("datetime")["iv"].mean()

# Name the series "avg_iv" before joining so no in-place rename of df is
# needed afterwards. The join stays an inner join (merge's default), which
# keeps only the bars that have at least one option row.
df = df.merge(iv_avg.rename("avg_iv"), on="datetime", how="inner")


In [22]:
# Force-save the feature table into the raw data folder. Only rows without any
# missing value are written.
features_complete = df.dropna()
features_complete.to_csv("../data/raw/nifty_features_5min.csv", index=False)

print("Saved to data/raw/nifty_features_5min.csv")


Saved to data/raw/nifty_features_5min.csv
