# IST 5520 Competition Problem (HW 11)
## By: Sayantan Majumdar and Dawit Wolday Asfaw
### Contributions: 

#### Sayantan- Model implementation, Hyperparameter Tuning; Dawit- Data transformation, Model comparison
##### Supplementary notebook: spam_classifier_model_tuning.ipynb

In [1]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
import pandas as pd

In [2]:
train_df = pd.read_csv('Data/train_data.csv')
test_df = pd.read_csv('Data/test_data.csv')

In [3]:
y = train_df['class'].copy()
X = train_df.drop(columns='class')

In [4]:
y_test = test_df['class'].copy()
X_test = test_df.drop(columns='class')

In [5]:
# Data Normalization
scaler = MinMaxScaler()
train_norm_arr = scaler.fit_transform(X)
test_norm_arr = scaler.transform(X_test)

In [6]:
# These are the best parameters obtained using GridSearchCV
param_dict = {
    'colsample_bynode': 1, 
    'colsample_bytree': 0.5, 
    'learning_rate': 0.05, 
    'max_bin': 127, 
    'max_depth': 10, 
    'n_estimators': 300, 
    'num_leaves': 63, 
    'path_smooth': 0.2, 
    'subsample': 1,
}

In [7]:
model = LGBMClassifier(
            boosting_type='gbdt',
            tree_learner='feature', 
            class_weight='balanced',
            random_state=0,
            deterministic=True, force_row_wise=True
        )

In [8]:
model.set_params(**param_dict)

LGBMClassifier(class_weight='balanced', colsample_bynode=1,
               colsample_bytree=0.5, deterministic=True, force_row_wise=True,
               learning_rate=0.05, max_bin=127, max_depth=10, n_estimators=300,
               num_leaves=63, path_smooth=0.2, random_state=0, subsample=1,
               tree_learner='feature')

In [9]:
model.fit(train_norm_arr, y)

LGBMClassifier(class_weight='balanced', colsample_bynode=1,
               colsample_bytree=0.5, deterministic=True, force_row_wise=True,
               learning_rate=0.05, max_bin=127, max_depth=10, n_estimators=300,
               num_leaves=63, path_smooth=0.2, random_state=0, subsample=1,
               tree_learner='feature')

In [10]:
y_pred = model.predict(test_norm_arr)
print('Test ROC_AUC:', roc_auc_score(y_test, y_pred))

Test ROC_AUC: 0.9612147726474103
