In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [1]:
def remove_unneeded_columns(data_frame):
    """Return ``data_frame`` without the 'row_id', 'time_id' and 'date_id' columns.

    The remaining columns keep their original order. The input frame itself
    is not modified; the selection is returned as a separate frame.
    """
    excluded = ('row_id', 'time_id', 'date_id')
    kept = []
    for name in data_frame.columns:
        if name in excluded:
            continue
        kept.append(name)
    return data_frame[kept]

def calculate_outlier_limits(data_frame, col):
    """Return ``(lower_bound, upper_bound)`` outlier limits for ``data_frame[col]``.

    The limits are built from the 1st and 99th percentiles of the column:
    the distance between the two percentiles is scaled by 1.5 and subtracted
    from the 1st percentile / added to the 99th percentile.
    """
    low_percentile = data_frame[col].quantile(0.01)
    high_percentile = data_frame[col].quantile(0.99)
    margin = 1.5 * (high_percentile - low_percentile)
    return low_percentile - margin, high_percentile + margin

def adjust_outliers(data_frame, col_name):
    """Clamp ``data_frame[col_name]`` in place to its outlier limits.

    The limits come from ``calculate_outlier_limits``. Values above the upper
    limit are overwritten with the upper limit, then values below the lower
    limit are overwritten with the lower limit. Nothing is returned.
    """
    floor, ceiling = calculate_outlier_limits(data_frame, col_name)
    too_high = data_frame[col_name] > ceiling
    data_frame.loc[too_high, col_name] = ceiling
    too_low = data_frame[col_name] < floor
    data_frame.loc[too_low, col_name] = floor

def data_preprocessing(data_frame):
    """One-hot encode the buy/sell flag and add an imbalance ratio column.

    'imbalance_buy_sell_flag' is replaced by dummy columns prefixed with
    'flag' (the first category is dropped), and 'ratio_imbalance' is added as
    'imbalance_size' divided by 'matched_size'. A new frame is returned.
    """
    encoded = pd.get_dummies(
        data_frame,
        columns=['imbalance_buy_sell_flag'],
        prefix='flag',
        drop_first=True,
    )
    encoded['ratio_imbalance'] = encoded['imbalance_size'].div(encoded['matched_size'])
    return encoded


In [None]:
def engineered_features(df):
    """Add order-book, auction-clock, lag and per-stock aggregate columns to ``df``.

    The new columns are written onto ``df`` itself, so the caller's frame
    grows. The returned object is a separate frame in which every +inf / -inf
    value has been replaced by 0; NaN values (for example the first rows of
    each stock's diff / shift columns) are left as they are.

    Columns read: stock_id, date_id, seconds_in_bucket, imbalance_size,
    imbalance_buy_sell_flag, reference_price, matched_size, far_price,
    near_price, bid_price, bid_size, ask_price, ask_size, wap.
    """

    def per_stock(column):
        # A fresh groupby per call, so that columns created earlier in this
        # function (e.g. "price_spread") can be grouped as well.
        return df.groupby('stock_id')[column]

    # Input columns are only read, never overwritten, so these handles stay valid.
    bid_size, ask_size = df["bid_size"], df["ask_size"]
    imbalance_size, matched_size = df["imbalance_size"], df["matched_size"]
    elapsed = df["seconds_in_bucket"]

    # --- top-of-book aggregates ---
    for name, expression in (
        ("volume", "ask_size + bid_size"),
        ("mid_price", "(ask_price + bid_price) / 2"),
    ):
        df[name] = df.eval(expression)

    # --- size ratios and plain price differences ---
    df["bid_ask_size_ratio"] = bid_size / ask_size
    df["imbalance_bid_size_ratio"] = imbalance_size / bid_size
    df["imbalance_ask_size_ratio"] = imbalance_size / ask_size
    df["matched_size_ratio"] = matched_size / (bid_size + ask_size)
    df["ref_wap_difference"] = df["reference_price"] - df["wap"]
    df["bid_ask_spread"] = df["ask_price"] - df["bid_price"]
    df["near_far_price_difference"] = df["far_price"] - df["near_price"]

    # --- per-stock WAP dynamics ---
    df["wap_rate_of_change"] = per_stock("wap").pct_change()
    df["wap_momentum"] = per_stock("wap").diff()

    # --- auction clock ---
    df["auction_start"] = (elapsed == 0).astype(int)
    df["auction_end"] = (elapsed == 550).astype(int)
    flag_step = per_stock('imbalance_buy_sell_flag').diff(periods=1)
    # The running count is taken over the whole frame, not restarted per stock.
    df["time_since_last_change"] = flag_step.ne(0).cumsum()
    df["time_until_auction_close"] = 600 - elapsed

    # --- imbalance and pressure measures ---
    # NOTE(review): size_imbalance and price_spread repeat bid_ask_size_ratio
    # and bid_ask_spread under different names.
    for name, expression in (
        ("liquidity_imbalance", "(bid_size-ask_size)/(bid_size+ask_size)"),
        ("matched_imbalance", "(imbalance_size-matched_size)/(matched_size+imbalance_size)"),
        ("size_imbalance", "bid_size / ask_size"),
    ):
        df[name] = df.eval(expression)
    df["imbalance_momentum"] = per_stock('imbalance_size').diff(periods=1) / matched_size
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = per_stock('price_spread').diff()
    df['price_pressure'] = imbalance_size * (df['ask_price'] - df['bid_price'])
    df['depth_pressure'] = (ask_size - bid_size) * (df['far_price'] - df['near_price'])

    # --- per-stock lags and relative changes ---
    lagged_columns = ('matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag')
    lag_windows = (1, 2, 3, 10)
    for col, window in ((c, w) for c in lagged_columns for w in lag_windows):
        df[f"{col}_shift_{window}"] = per_stock(col).shift(window)
        df[f"{col}_ret_{window}"] = per_stock(col).pct_change(window)

    # --- urgency-style combinations ---
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['imbalance_ratio'] = imbalance_size / matched_size
    df['wap_askprice_diff'] = df['ask_price'] - df['wap']
    df['wap_bidprice_diff'] = df['wap'] - df['bid_price']
    df['wap_askprice_diff_urg'] = df['wap_askprice_diff'] * df['liquidity_imbalance']
    df['wap_bidprice_diff_urg'] = df['wap_bidprice_diff'] * df['liquidity_imbalance']
    # Same expressions as liquidity_imbalance / matched_imbalance, stored under other names.
    df['bid_size_ask_size_diff'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imbalance_size_matched_size_diff'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    # --- calendar / clock buckets ---
    df["dow"] = df["date_id"] % 5
    df["seconds"] = elapsed % 60
    df["minute"] = elapsed // 60

    # --- per-stock aggregates over the frame passed in, broadcast to every row ---
    # NOTE(review): ptp_size spans bid_size only, while ptp_price is
    # max(bid_price) - min(ask_price); confirm the asymmetry is intended.
    global_stock_id_feats = {
        "median_size": per_stock("bid_size").median() + per_stock("ask_size").median(),
        "std_size": per_stock("bid_size").std() + per_stock("ask_size").std(),
        "ptp_size": per_stock("bid_size").max() - per_stock("bid_size").min(),
        "median_price": per_stock("bid_price").median() + per_stock("ask_price").median(),
        "std_price": per_stock("bid_price").std() + per_stock("ask_price").std(),
        "ptp_price": per_stock("bid_price").max() - per_stock("ask_price").min(),
    }
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    cleaned = df.replace([np.inf, -np.inf], 0)
    return cleaned

In [None]:
# Load the training data and the example test file from the competition input folder.
# NOTE: `test` is rebound for every batch by the submission loop at the end of the notebook.
df = pd.read_csv("/input/optiver-trading-at-the-close/train.csv")
test = pd.read_csv("/input/optiver-trading-at-the-close/example_test_files/test.csv") 

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
df.info()

In [None]:
# Names of the raw training columns.
df.columns

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:

# Total number of missing cells across all columns of the training frame.
df.isnull().sum().sum()

In [None]:

# Rows with a missing ask_price: which stocks, days and time buckets are affected?
ask_p_na = df[df['ask_price'].isna()]
print(ask_p_na.stock_id.unique())
print(ask_p_na.date_id.unique())
print(ask_p_na.seconds_in_bucket.unique())
# Per affected day, list the stocks with a missing ask_price. Grouping the
# already-filtered rows avoids re-scanning the whole of `df` once per date;
# sort=False keeps the dates in order of first appearance, as unique() did.
for date, missing_on_date in ask_p_na.groupby('date_id', sort=False):
    print(date, missing_on_date['stock_id'].unique().tolist())

In [None]:

# Discard rows that have no ask_price.
df = df.dropna(subset=["ask_price"], axis=0)
# Zero both auction prices for every row at or before second 300 of the bucket.
first_300_seconds = df['seconds_in_bucket'] <= 300
df.loc[first_300_seconds, ["near_price", "far_price"]] = 0
# Fill the far_price gaps that remain by interpolating along the row order.
df['far_price'] = df['far_price'].interpolate()

In [None]:


# Total number of missing cells across all columns of `df` at this point in the notebook.
df.isnull().sum().sum()



In [None]:


# Add the engineered feature columns; `df` is rebound to the returned frame,
# in which +inf / -inf values have been replaced by 0.
df = engineered_features(df)

In [None]:


from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

In [None]:

# Feature matrix: every column except the label and the row identifier.
X = df.drop(["target", "row_id"], axis=1)
# Keep the label as a 1-D Series: an (n, 1) DataFrame makes the estimator
# emit a column-vector conversion warning on every fit.
y = df["target"]
# Seed the split so the hold-out set is the same on every run
# (17 matches the seed used for the final model).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

In [None]:


# Baseline LightGBM regressor with default hyper-parameters, fitted on the full
# feature matrix (not on the X_train split).
lgb_model = lgbm.LGBMRegressor()
lgb_model.fit(X, y)

In [None]:

def plot_importance(model, features, num=None):
    """Draw a horizontal bar chart of ``model.feature_importances_``, largest first.

    Uses matplotlib only: the notebook never imports seaborn, so the former
    ``sns`` calls raised NameError.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    features : DataFrame whose ``columns`` label the importances (assumed to
        be in the column order the model was fitted on).
    num : int or None
        Show only the ``num`` most important features. ``None`` (default)
        shows all of them; the default no longer depends on a global ``X``
        existing when the function is defined.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]
    # barh stacks bars from the bottom up, so reverse the ranking to put the
    # most important feature at the top of the chart.
    ranked = ranked.iloc[::-1]
    plt.figure(figsize=(10, 10))
    plt.barh(ranked["Feature"].astype(str), ranked["Value"])
    plt.xlabel("Value")
    plt.ylabel("Feature")
    plt.title('Features')
    plt.tight_layout()
    plt.show(block=True)

In [None]:
# Feature importances of the baseline model, labelled with the columns of X.
plot_importance(lgb_model, X)

In [None]:
# Hyper-parameter grid for the LightGBM search: 2 x 4 x 3 = 24 combinations.
lgb_params = dict(
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 300, 500, 1000],
    colsample_bytree=[0.5, 0.7, 1],
)

In [None]:
# Exhaustive 5-fold search over lgb_params on the full data, using all CPU cores.
lgbm_best_grid = GridSearchCV(
    estimator=lgb_model,
    param_grid=lgb_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
).fit(X, y)

In [None]:
# Apply the best grid-search parameters (plus a fixed seed) to the existing
# estimator and refit it on all rows. set_params and fit both return the
# estimator itself, so lgb_final and lgb_model name the same object.
lgb_model.set_params(**lgbm_best_grid.best_params_, random_state=17)
lgb_final = lgb_model.fit(X, y)

In [None]:
# Submission loop: the competition API yields one test batch at a time and
# expects a prediction for each batch before it hands out the next one.
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
ctr = 0  # number of batches processed so far
for (test, revealed_targets, sample_prediction) in iter_test:
    # Mirror the cleaning applied to the training frame. Without it the model
    # sees raw near/far prices here but zeroed / interpolated ones in training.
    first_300_seconds = test['seconds_in_bucket'] <= 300
    test.loc[first_300_seconds, ["near_price", "far_price"]] = 0
    test['far_price'] = test['far_price'].interpolate()

    # NOTE(review): the global_* per-stock aggregates are recomputed from this
    # batch alone, so they differ from the values seen during training.
    test = engineered_features(test)

    # Present exactly the training columns, in training order. Extra batch
    # columns are dropped; a column missing from the batch becomes all-NaN
    # instead of shifting the feature positions.
    test_df = test.reindex(columns=X.columns)

    # lgb_final is the tuned model (the same object as lgb_model after the refit).
    sample_prediction['target'] = lgb_final.predict(test_df)
    env.predict(sample_prediction)

    ctr += 1