In [3]:
import pandas as pd
import numpy as np
import lightgbm 
from lightgbm import LGBMClassifier

## Dataframe after filtering and adding new features

In [None]:
# List the distinct values of the 'id' column available in the full frame.
alldf['id'].unique()

### Work on a stock of average difficulty (average accuracy compared to other stocks)

In [None]:
# Keep only the rows whose 'id' equals 7; copy so later edits do not touch alldf.
tmpdf = alldf.loc[alldf['id'].eq(7)].copy()
tmpdf

In [None]:
# (rows, columns) of the id == 7 subset selected above.
tmpdf.shape

In [None]:
# Count rows per 'target' value to check class balance.
# Original note: the stock label is balanced -- NOTE(review): confirm against the displayed counts.
tmpdf.value_counts('target')

In [None]:
# Swap every NaN in the frame for 0 (this applies to all columns, the last one included).
tmpdf = tmpdf.replace(to_replace=np.nan, value=0)
tmpdf.head()

### Function to reduce a dataframe to the desired number of dimensions using truncated SVD

In [None]:
def reduce(df, dim, fit_df=None):
    """Project ``df`` onto ``dim`` truncated-SVD components.

    The decomposition is fitted on ``fit_df`` and then applied to ``df``, so
    the projection can be learned on one set of rows and reused on another.

    Parameters
    ----------
    df : feature matrix handed to ``svd.transform``.
    dim : number of SVD components to keep.
    fit_df : optional data to fit the SVD on. When omitted, the first 3000
        rows of the notebook-level ``tmpdf`` (every column except the last)
        are used, which is the original behaviour.

    Returns
    -------
    The result of ``svd.transform(df)``.
    """
    # Imported locally: no earlier visible cell brings TruncatedSVD into scope,
    # so the function would otherwise fail with NameError on a fresh kernel.
    from sklearn.decomposition import TruncatedSVD

    if fit_df is None:
        # Backward-compatible default: fit on the leading 3000 rows only.
        fit_df = tmpdf.iloc[0:3000, :-1]

    svd = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
    svd.fit(fit_df)
    return svd.transform(df)

In [None]:
# Reduce every feature column (all but the last) to 100 SVD components, then
# wrap the result in a DataFrame whose columns are zero-padded position labels.
svd_all = reduce(tmpdf.iloc[:, :-1], 100)
sv_all = pd.DataFrame(
    svd_all,
    columns=[format(i, "02d") for i in range(svd_all.shape[1])],
)
sv_all

## Feature selection on the truncated SVD dataframe, using backward feature selection to keep the top 20 features

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from mlxtend.evaluate import PredefinedHoldoutSplit
# Customized hold-out split for feature-selection evaluation: every row from
# position 3000 to the end of sv_all is the validation fold. The upper bound
# is taken from the data instead of a hard-coded row count so that it always
# matches the `iloc[3000:]` slice used for the final evaluation.
piter = PredefinedHoldoutSplit(list(range(3000, len(sv_all))))

In [None]:
# Backward sequential selector: LightGBM estimator, scored by accuracy on the
# predefined hold-out split, configured to keep 20 features.
sfs = SequentialFeatureSelector(
    estimator=LGBMClassifier(seed=42),
    n_features_to_select=20,
    direction='backward',
    scoring='accuracy',
    cv=piter,
)

In [None]:
# Run the backward selection on the SVD features; the labels are the last
# column of tmpdf, passed as a flat array.
sfs.fit(sv_all, tmpdf.iloc[:, -1].to_numpy())

In [None]:
# Train a LightGBM classifier on the first 3000 rows, restricted to the
# columns that the selector marked as kept.
# NOTE(review): the name `lgb` is rebound to the lightgbm module by a later
# `import lightgbm as lgb` cell -- re-running cells out of order will break.
lgb = LGBMClassifier(seed=42)
y = tmpdf.iloc[:3000, -1:]
selected_cols = list(sv_all.columns[sfs.get_support()])
lgb.fit(sv_all[selected_cols].iloc[:3000], y)

In [None]:
# Model evaluation: predict on the rows from position 3000 onward, using the
# same selected columns the classifier was trained on.
holdout_features = sv_all[list(sv_all.columns[sfs.get_support()])].iloc[3000:]
y_pred = lgb.predict(holdout_features)
y_test = tmpdf.iloc[3000:, -1:]

In [None]:
from sklearn.metrics import accuracy_score
# Share of hold-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))

## Tuning the model for better accuracy using Optuna hyperparameter tuning

In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
import sklearn

In [None]:
import optuna
def objective(trial):
    """Optuna objective: train a LightGBM booster and return validation accuracy.

    The rows of ``sv_all`` that have a label in the notebook-level ``y`` are
    split 75/25 with a fixed seed, so every trial is scored on the same
    partition and the scores are comparable across trials.

    Parameters
    ----------
    trial : the Optuna trial used to sample hyperparameters.

    Returns
    -------
    Accuracy of the rounded predictions on the 25% validation part.
    """
    from sklearn.metrics import accuracy_score

    # `y` only covers the leading training rows while `sv_all` holds every
    # row; splitting them as-is raises an inconsistent-sample-count error.
    # Trim the features to the labelled rows and flatten the label column.
    labels = np.ravel(y)
    features = sv_all.iloc[:len(labels)]
    # NOTE(review): tuning uses all SVD components, not the subset chosen by
    # the feature selector -- confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=42
    )
    # Use the `lightgbm` module name directly: `lgb` is also used in this
    # notebook as the name of a fitted classifier.
    dtrain = lightgbm.Dataset(X_train, label=y_train)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lightgbm.train(param, dtrain)
    preds = gbm.predict(X_test)
    # Predictions are rounded to the nearest integer to obtain class labels.
    pred_labels = np.rint(preds)
    return accuracy_score(y_test, pred_labels)

In [None]:
# Maximize the objective's accuracy over 50 trials. The TPE sampler is seeded
# so the sequence of suggested hyperparameters is repeatable across runs.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)