In [2]:
#PERFORM LIGHTGM MODEL ..........

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# FUNTION TO HANDLE OUTLIERS 
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df



# TIME BASED FEATURE ENG..........
def time_based_features(df):
    df['date_id'] = pd.to_datetime(df['date_id'], format='%Y-%m-%d')
    df['day_of_week'] = df['date_id'].dt.dayofweek
    df['month'] = df['date_id'].dt.month
    df['year'] = df['date_id'].dt.year
    df['hour_of_day'] = (df['seconds_in_bucket'] / 3600).astype(int)
    return df



# PRICE BADES FEATURE ENG .........
def price_based_features(df):
    
    df['price_spread'] = df['ask_price'] - df['bid_price']
  
    df['price_ratio'] = df['ask_price'] / df['bid_price']
    
    
    
    df['price_pct_change_5'] = df.groupby('stock_id')['reference_price'].pct_change(periods=5)
    df['price_pct_change_10'] = df.groupby('stock_id')['reference_price'].pct_change(periods=10)
    df['price_pct_change_30'] = df.groupby('stock_id')['reference_price'].pct_change(periods=30)
    
    return df



# VOLUME BASED FEATURE ENG...
def volume_based_features(df):
    df['volume_spread'] = df['ask_size'] - df['bid_size']
    
    df['volume_ratio'] = df['ask_size'] / df['bid_size']
    
    df['volume_pct_change'] = df.groupby('stock_id')['matched_size'].pct_change()
    
    return df


# PEFORM IMBALANCED BASE FEATURE ENG......
def imbalance_features(df):
    # CRATE A FEATURE THAT CAPTURE THE RELATIONSHIP BETWEEN imbalance_size AND  imbalance_buy_sell_flag
    df['imbalance_relation'] = df['imbalance_size'] * (df['imbalance_buy_sell_flag'] == 'B') - df['imbalance_size'] * (df['imbalance_buy_sell_flag'] == 'S')
    
    
    # COMPUTE COMPULATIVE  IMBALANCE FEATURE OVER TIME 
    df['cumulative_imbalance_size'] = df.groupby(['stock_id', 'time_id'])['imbalance_size'].cumsum()
    
    return df



# FUNCTION TO PERFORM MOVING AVERAGE  AND EXPONENTIAL MOVING AVERAGE   

def moving_averages_ema(df):
    
    # CALCULATE MOVING AVERAGE  FOR PRICE  AND VOLUME RELATED COLUMNS OVER DIFFERENT TIME WINDOW 
    windows = [5, 10, 30]  # TIME WINDOW FOR MOVING AVERAGE 
    for window in windows:
        df[f'price_ma_{window}'] = df.groupby('stock_id')['wap'].transform(lambda x: x.rolling(window).mean())
        df[f'volume_ma_{window}'] = df.groupby('stock_id')['volume_spread'].transform(lambda x: x.rolling(window).mean())
    
    # CALCULATE THE EXPONENETIAL MOVING AVERAGE  
    alpha = [0.1, 0.2, 0.5]  # ALPHA VALUES FOR EMA 
    for a in alpha:
        df[f'price_ema_{a}'] = df.groupby('stock_id')['wap'].transform(lambda x: x.ewm(alpha=a, adjust=False).mean())
        df[f'volume_ema_{a}'] = df.groupby('stock_id')['volume_spread'].transform(lambda x: x.ewm(alpha=a, adjust=False).mean())
    
    return df


# FUNTION TO PERFORM PRICE TRENDS FEATURE ENGINNERRING 
def price_trends(df):
    periods = [5, 10, 30]  # NUMBER OF PREVIOUS TIME PERIODS
    for period in periods:
        df[f'price_trend_{period}'] = (df.groupby('stock_id')['reference_price'].shift(0) - df.groupby('stock_id')['reference_price'].shift(period)) > 0
    
    return df


# FUNTION TO CALCULATE KURTOSIS
def calculate_kurtosis(grouped_data):
    return grouped_data.kurtosis()

# FUNCTION TO CALCULATE STATISTICAL MEATURES
def calculate_statistics(df):
    # DEFINE NUMERIC COL
    numerical_cols = [
        'seconds_in_bucket', 'imbalance_size', 'reference_price', 'matched_size',
        'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target'
    ]

    # CALCULATE STATISTICS FOR EACH NUMERIC COLUMN
    for col in numerical_cols:
        df[f'{col}_mean'] = df.groupby('stock_id')[col].transform('mean')
        df[f'{col}_std'] = df.groupby('stock_id')[col].transform('std')
        df[f'{col}_skew'] = df.groupby('stock_id')[col].transform('skew')
        df[f'{col}_kurtosis'] = df.groupby('stock_id')[col].transform(calculate_kurtosis)
    
    return df





# DATASET
df_train = pd.read_csv(r"C:\Users\nh013\Desktop\predict us stock price movement compitetion\train.csv")

# FEATURE 
columns_to_keep = [
    'stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
    'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size',
    'ask_price', 'ask_size', 'wap', 'target', 'time_id', 'row_id'
]
df_train = df_train[columns_to_keep]

# DEFINE CATEGORICAL AND NUMERIC COLUMN
categorical_cols = [
    'stock_id', 'date_id', 'imbalance_buy_sell_flag', 'time_id', 'row_id'
]
numerical_cols = [
    'seconds_in_bucket', 'imbalance_size', 'reference_price', 'matched_size',
    'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'target'
]

#IDENTIFY MISSING VALIES WITH IMPUTE 
imputer = SimpleImputer(strategy='mean')
df_train[['target']] = imputer.fit_transform(df_train[['target']])

# DROP ROWS WITH MISSING VALUES
df_train.dropna(inplace=True)

# HANDLE OUTLIERS 
df_train = handle_outliers(df_train, 'target')



df_train = time_based_features(df_train)
df_train = price_based_features(df_train)
df_train = volume_based_features(df_train)
df_train = imbalance_features(df_train)
df_train = moving_averages_ema(df_train)
df_train = price_trends(df_train)
df_train = calculate_statistics(df_train)


# DROP ROWS WITH MISSING VALUES
df_train.dropna(inplace=True)

# PERFORM BINARY ENCODE FOR CATEGORICAL COL
encoder = ce.BinaryEncoder(cols=categorical_cols)
df_train_encoded = encoder.fit_transform(df_train)

# NORMALIZE AND SCALEING 
scaler_train = StandardScaler()
df_train_encoded[numerical_cols] = scaler_train.fit_transform(df_train_encoded[numerical_cols])


# DEFINE FEATURE AND TARGET VARIABLE 
X = df_train_encoded.drop(columns=['target'])
y = df_train_encoded['target']

# SPLIT DATA INTO TRAINING AND TESTING SET 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CREATE LIGHTGM DATASET FOR TRAINIGN 
train_data = lgb.Dataset(X_train, label=y_train)

# DEFINE HYPERPARAMETERS FOR LIGHTGM MODEL 
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# TRAIN THE LIGHTGM MODEL 
num_round = 100
bst = lgb.train(params, train_data, num_round)

# PREDICTION ON THE TESTSET 
y_pred_prob = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred = [1 if p > 0.5 else 0 for p in y_pred_prob]  # Convert probabilities to binary predictions

# CONVERT CONTINUOUS VALUES TO BINARY LABELS  USING A THRESHOLD
threshold = 0.5
y_test_binary = [1 if p > threshold else 0 for p in y_test]

# EVALUATE THE MODEL WITH BINARY LABEL 
accuracy = accuracy_score(y_test_binary, y_pred)
classification_rep = classification_report(y_test_binary, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)


[LightGBM] [Info] Number of positive: 887226, number of negative: 883232
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17682
[LightGBM] [Info] Number of data points in the train set: 1770458, number of used features: 138
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501128 -> initscore=0.004512
[LightGBM] [Info] Start training from score 0.004512
Accuracy: 0.5409757916021825
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.52      0.61    312179
           1       0.34      0.60      0.44    130436

    accuracy                           0.54    442615
   macro avg       0.55      0.56      0.52    442615
weighted avg       0.63      0.54      0.56    442615

