In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import numpy as np
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from tqdm import tqdm


Melting train data into long format

In [2]:
import pandas as pd

# Load the dataset
train = pd.read_parquet("train_data.parquet")  # update path if needed

# Identify IV and X columns
iv_columns = [col for col in train.columns if col.startswith('call_iv_') or col.startswith('put_iv_')]
x_columns = [f'X{i}' for i in range(42)]  # X0 to X41

# Melt IV columns into long format
iv_long = train.melt(
    id_vars=['timestamp', 'underlying', 'expiry'] + x_columns,
    value_vars=iv_columns,
    var_name='iv_type_strike',
    value_name='iv'
)

# Extract option type and strike price
iv_long['option_type'] = iv_long['iv_type_strike'].str.extract(r'(call|put)')
iv_long['strike_price'] = iv_long['iv_type_strike'].str.extract(r'_(\d+)$').astype(float) 

# Drop combined column and reorder
iv_long = iv_long.drop(columns=['iv_type_strike'])
iv_long = iv_long[['timestamp', 'underlying', 'expiry', 'option_type', 'strike_price', 'iv'] + x_columns]



Melting test data into long format


In [3]:
test = pd.read_parquet("test_data.parquet")

# Identify IV and X columns
iv_columns = [col for col in test.columns if col.startswith('call_iv_') or col.startswith('put_iv_')]
x_columns = [f'X{i}' for i in range(42)]  # X0 to X41

# Melt IV columns into long format
iv_long_test = test.melt(
    id_vars=[ 'underlying'] + x_columns,
    value_vars=iv_columns,
    var_name='iv_type_strike',
    value_name='iv'
)

# Extract option type and strike price
iv_long_test['option_type'] = iv_long_test['iv_type_strike'].str.extract(r'(call|put)')
iv_long_test['strike_price'] = iv_long_test['iv_type_strike'].str.extract(r'_(\d+)$').astype(float) 

# Drop combined column and reorder
iv_long_test = iv_long_test.drop(columns=['iv_type_strike'])
iv_long_test = iv_long_test[[ 'underlying', 'option_type', 'strike_price', 'iv'] + x_columns]




Adding extra features to train and test data

In [4]:
iv_long['moneyness'] = iv_long['strike_price'] / iv_long['underlying']
iv_long['strike_price']=iv_long['strike_price'] - iv_long['underlying']
iv_long['option_type']  = iv_long['option_type'].map({'call':1, 'put':-1})

iv_long_test['moneyness'] = iv_long_test['strike_price'] / iv_long_test['underlying']
iv_long_test['strike_price']=iv_long_test['strike_price'] - iv_long_test['underlying']
iv_long_test['option_type']  = iv_long_test['option_type'].map({'call':1, 'put':-1})

In [5]:
feature_columns = [f'X{i}' for i in range(42)]
feature_columns.append('strike_price')
feature_columns.append('moneyness')
feature_columns.append('option_type')

feature_columns_test = [f'X{i}' for i in range(42)]
feature_columns_test.append('strike_price')
feature_columns_test.append('moneyness')
feature_columns_test.append('option_type')


In [6]:
iv_long_cleaned = iv_long[(iv_long['iv'] < 1) & (iv_long['iv'] > 0)].copy()


Scaling train(features and target) and test(only features)

In [7]:

train_scaled_df = iv_long_cleaned.copy()

test_scaled_df = iv_long_test.copy()


Splitting test data into validation and submission

In [8]:
# Split based on whether 'iv' is NaN
submission_df = test_scaled_df[test_scaled_df['iv'].isna()]
test_df = test_scaled_df[test_scaled_df['iv'].notna()]



In [9]:
CORRELATION_THRESHOLD = 0.8

### Removing feature with high avg and corr>0.80 with specific feature

In [10]:
correlation_matrix = train_scaled_df[feature_columns].corr().abs()

# Step 2: Compute average correlation for each feature (excluding self-correlation)
avg_corr = correlation_matrix.apply(lambda x: (x.sum() - 1) / (len(x) - 1), axis=1)

# Step 3: Get top 10 features with highest average correlation
top_10_high_avg_corr = avg_corr.sort_values(ascending=False).head(10)

# Step 4: Find features among top 10 that have correlation > 0.80 with **any** other feature
features_to_drop = []
for feature in top_10_high_avg_corr.index:
    # Drop self-correlation and check if any other correlation is > 0.80
    high_corr_features = correlation_matrix[feature][correlation_matrix[feature] > 0.80].drop(labels=feature)
    if not high_corr_features.empty:
        features_to_drop.append(feature)

# Output features to drop
print("Dropping features due to high average and pairwise correlation:", features_to_drop)

# Optional: Drop from your dataset
train_scaled_df_dropped = train_scaled_df.drop(columns=features_to_drop)
test_scaled_df_dropped = test_df.drop(columns=features_to_drop)
submission_df_dropped = submission_df.drop(columns=features_to_drop)



Dropping features due to high average and pairwise correlation: ['X1', 'X38', 'X29', 'X16', 'X28']


In [11]:
print(iv_long[feature_columns].shape)
print(iv_long_test[feature_columns_test].shape)
print(train_scaled_df.shape)
print(train_scaled_df_dropped.shape)
print(test_df.shape)
print(test_scaled_df_dropped.shape)

(9273680, 45)
(627380, 45)
(8770739, 49)
(8770739, 44)
(250876, 47)
(250876, 42)


TRAINING

In [12]:
#train 
train_dataset1 = train_scaled_df_dropped.drop(columns=['iv','timestamp', 'underlying', 'expiry'])
train_dataset2 = train_scaled_df.drop(columns=['iv','timestamp', 'underlying', 'expiry'])
train_target = train_scaled_df['iv']

#test
test_dataset1 = test_scaled_df_dropped.drop(columns=['iv', 'underlying'])
test_dataset2 = test_df.drop(columns=['iv', 'underlying'])
test_target = test_df['iv']

#submission
submission_dataset1 = submission_df_dropped.drop(columns=['iv', 'underlying'])
submission_dataset2 = submission_df.drop(columns=['iv', 'underlying'])

In [39]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler


df = train_dataset2.copy()
df_test = test_dataset2.copy()
df1 = train_dataset1.copy()
df_test1 = test_dataset1.copy()
df_submission2 = submission_dataset2.copy()
df_submission1 = submission_dataset1.copy()

categorical_cols = ['option_type']

numeric_cols1 = [col for col in df.columns if col not in (categorical_cols) + (features_to_drop)]
numeric_cols = [col for col in df.columns if col not in (categorical_cols) ]

In [40]:
scaler = MinMaxScaler()

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df_test[numeric_cols] = scaler.fit_transform(df_test[numeric_cols])

df1[numeric_cols1] = scaler.fit_transform(df1[numeric_cols1])
df_test1[numeric_cols1] = scaler.fit_transform(df_test1[numeric_cols1])

df_submission2[numeric_cols] = scaler.fit_transform(df_submission2[numeric_cols])
df_submission1[numeric_cols1] = scaler.fit_transform(df_submission1[numeric_cols1])


In [41]:
import numpy as np

selected_cols = ['strike_price', 'moneyness', 'option_type']

# Train
X_train1 = df1.to_numpy(dtype=np.float32)
X_train2 = df.to_numpy(dtype=np.float32)
y_train = train_target.to_numpy(dtype=np.float32).reshape(-1, 1)  # Make it (N,1) for regression

# Test
X_test1 = df_test1.to_numpy(dtype=np.float32)
X_test2 = df_test.to_numpy(dtype=np.float32)
y_test = test_target.to_numpy(dtype=np.float32).reshape(-1, 1)    # Optional if test_target exists

#submission
X_submission1 = df_submission1.to_numpy(dtype=np.float32)
X_submission2 = df_submission2.to_numpy(dtype=np.float32)



#hybrid
selected_cols = ['strike_price', 'moneyness', 'option_type']

# Train
X_train3 = df_test1[selected_cols].to_numpy(dtype=np.float32)


# Submission
X_submission3 = df_submission1[selected_cols].to_numpy(dtype=np.float32)




In [50]:
import importlib
import model  # ensure model is imported
importlib.reload(model)
from model import Trainer


In [51]:
trainer = Trainer(
    input_dim=X_test2.shape[1],  # number of features
    learning_rate=0.001,
    batch_size=2048,
    epochs=500
)

# Train the model
trainer.train(X_test2, y_test)

[INFO] Training on device: cuda
[INFO] X_tensor shape: torch.Size([250876, 45])
[INFO] y_tensor shape: torch.Size([250876, 1])

[INFO] Epoch 1/500
[DEBUG] First batch X shape: torch.Size([2048, 45])
[DEBUG] First batch y shape: torch.Size([2048, 1])
 Epoch 1 completed. Total Loss: 0.4273

[INFO] Epoch 2/500
 Epoch 2 completed. Total Loss: 0.2280

[INFO] Epoch 3/500
 Epoch 3 completed. Total Loss: 0.2082

[INFO] Epoch 4/500
 Epoch 4 completed. Total Loss: 0.1978

[INFO] Epoch 5/500
 Epoch 5 completed. Total Loss: 0.1902

[INFO] Epoch 6/500
 Epoch 6 completed. Total Loss: 0.1872

[INFO] Epoch 7/500
 Epoch 7 completed. Total Loss: 0.1833

[INFO] Epoch 8/500
 Epoch 8 completed. Total Loss: 0.1808

[INFO] Epoch 9/500
 Epoch 9 completed. Total Loss: 0.1774

[INFO] Epoch 10/500
 Epoch 10 completed. Total Loss: 0.1745

[INFO] Epoch 11/500
 Epoch 11 completed. Total Loss: 0.1728

[INFO] Epoch 12/500
 Epoch 12 completed. Total Loss: 0.1702

[INFO] Epoch 13/500
 Epoch 13 completed. Total Loss: 0.

In [52]:

metrics = trainer.evaluate(X_train2, y_train)
print("Evaluation metrics:", metrics)

Evaluation metrics: {'MSE': 0.17573031111263346, 'MAE': 0.10512927919626236, 'R² Score': 0.02036154270172119}


In [53]:
# Predict on new data

predictions = trainer.predict(X_submission2)

submission_df['iv'] = predictions.flatten()  

combined_df = pd.concat([submission_df['iv'], test_df['iv']])
combined_df = combined_df.sort_index()

print(combined_df.head())

0    0.280939
1    0.270276
2    0.243598
3    0.241888
4    0.235328
Name: iv, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['iv'] = predictions.flatten()


In [55]:
hhh = pd.read_csv("sample_submission.csv")

# Ensure combined_df has exactly 627380 rows (12065 * 52)
assert len(combined_df) == 12065 * 52, "combined_df does not have exactly 627380 rows."

# Define your column names
column_names = [
    'call_iv_24000', 'call_iv_24100', 'call_iv_24200', 'call_iv_24300', 'call_iv_24400',
    'call_iv_24500', 'call_iv_24600', 'call_iv_24700', 'call_iv_24800', 'call_iv_24900',
    'call_iv_25000', 'call_iv_25100', 'call_iv_25200', 'call_iv_25300', 'call_iv_25400',
    'call_iv_25500', 'call_iv_25600', 'call_iv_25700', 'call_iv_25800', 'call_iv_25900',
    'call_iv_26000', 'call_iv_26100', 'call_iv_26200', 'call_iv_26300', 'call_iv_26400',
    'call_iv_26500', 'put_iv_23000', 'put_iv_23100', 'put_iv_23200', 'put_iv_23300',
    'put_iv_23400', 'put_iv_23500', 'put_iv_23600', 'put_iv_23700', 'put_iv_23800',
    'put_iv_23900', 'put_iv_24000', 'put_iv_24100', 'put_iv_24200', 'put_iv_24300',
    'put_iv_24400', 'put_iv_24500', 'put_iv_24600', 'put_iv_24700', 'put_iv_24800',
    'put_iv_24900', 'put_iv_25000', 'put_iv_25100', 'put_iv_25200', 'put_iv_25300',
    'put_iv_25400', 'put_iv_25500'
]

# Reshape into (12065, 52)
reshaped_df = combined_df.values.reshape(52, 12065).T  # transpose to get (12065, 52)

# Create new DataFrame
final_df = pd.DataFrame(reshaped_df, columns=column_names)
final_df['timestamp'] = final_df.index

# Move 'timestamp' to the first column
final_df = final_df[["timestamp"] + [col for col in final_df.columns if col != "timestamp"]]

assert final_df.shape == hhh.shape, "final_df does not have the expected shape ."

# Save to CSV
final_df.to_csv("submission9_m2.csv", index=False)