In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score

In [2]:
train = pd.read_csv("Train_added_features_final.csv", index_col=0)
test = pd.read_csv("Test_added_features_final.csv", index_col=0)

In [3]:
train.head()

Unnamed: 0_level_0,comp,left,Reused_id,1_ratings,2_ratings,3_ratings,4_ratings,total_num_ratings,lastrating,Delta_days,...,2_reviews_opposed,3_reviews_supported,3_reviews_opposed,4_reviews_supported,4_reviews_opposed,total_opp,total_supp,Reliability,NonReliability,Reliability_factor
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.066176,0,0.0,0.25,0.25,0.25,0.25,4.0,4,283,...,0.25,0.25,0.25,0.25,0.25,0.0,0.0,15,2,0.882353
4,0.066176,0,0.0,0.0,0.136364,0.272727,0.590909,22.0,4,92,...,0.0,0.0,0.5,0.666667,0.5,2.0,3.0,30,8,0.789474
6,0.066176,0,0.0,0.015385,0.025641,0.046154,0.912821,195.0,4,0,...,0.083333,0.212121,0.333333,0.545455,0.416667,12.0,66.0,80,10,0.888889
7,0.066176,0,0.0,0.137931,0.172414,0.448276,0.241379,29.0,4,0,...,0.0,0.222222,0.0,0.388889,0.666667,3.0,18.0,69,32,0.683168
8,0.066176,1,1.0,0.066667,0.2,0.6,0.133333,15.0,3,2,...,0.083333,0.083333,0.0,0.5,0.416667,0.0,12.0,65,72,0.474453


In [4]:
X = train.drop('left', axis=1) 
y = train['left']

In [5]:
np.unique(y.values, return_counts=True)

(array([0, 1], dtype=int64), array([2932,  581], dtype=int64))

#### Due to the presence of class imbalance, negative sampling is done 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_data = pd.DataFrame.copy(X_train)
train_data['left']  = y_train 

In [7]:
# Over Sampling Negative Class

pos_class = train_data[train_data['left']==1]
neg_class = train_data[train_data['left']==0]
neg_resampled = neg_class.sample(n=int(len(pos_class)), replace=False)
    
train_data_resampled = pd.concat([pos_class, neg_resampled])

# Use these for train_dataings

X_train_resampled = train_data_resampled.drop('left', axis=1).values
y_train_resampled = train_data_resampled['left']

In [8]:
train_data_resampled.columns

Index(['comp', 'Reused_id', '1_ratings', '2_ratings', '3_ratings', '4_ratings',
       'total_num_ratings', 'lastrating', 'Delta_days', 'Supported', 'Opposed',
       '1_reviews_supported', '1_reviews_opposed', '2_reviews_supported',
       '2_reviews_opposed', '3_reviews_supported', '3_reviews_opposed',
       '4_reviews_supported', '4_reviews_opposed', 'total_opp', 'total_supp',
       'Reliability', 'NonReliability', 'Reliability_factor', 'left'],
      dtype='object')

### GridSearch CV

In [27]:
model = lgb.LGBMClassifier()
param_grid = {
    'n_estimators': [100, 500],
#     'max_depth': [15,20,25, -1],
    'num_leaves': [30, 50, 100],
    'reg_alpha': [1, 10, 20],
    'reg_lambda': [1, 10, 20],
    'feature_fraction': [0.5, 0.6, 0.7, 0.8],
    'bagging_fraction': [0.8, 0.9, 1],
    'subsample_freq': [10, 20, 50],
    'metric': ['auc']
}



cv = 5

gs = GridSearchCV(estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring='balanced_accuracy',
        verbose=2)
gs.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 808 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 1493 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done 1971 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 2480 tasks      | elapsed:   49.9s
[Parallel(n_jobs=-1)]: Done 3185 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3991 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 4859 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 5841 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 7048 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 8175 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 9543 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 9720 out of 9720 | elapsed:  3.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambd...
                                      subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'bagging_fraction': [0.8, 0.9, 1],
                         'feature_fraction': [0.5, 0.6, 0.7, 0.8],
                         'metric': ['auc'], 'n_estimat

In [28]:
LGB_models2 = pd.DataFrame(gs.cv_results_)
best_LGB_model2 = LGB_models2.sort_values(by='mean_test_score', ascending=False).iloc[:5]

best_LGB_model2.loc[:, ['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
850,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
877,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
823,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
39,"{'bagging_fraction': 0.8, 'feature_fraction': ...",0.857595
66,"{'bagging_fraction': 0.8, 'feature_fraction': ...",0.857595


In [29]:
best_LGB_model2['params'].iloc[0]

{'bagging_fraction': 0.9,
 'feature_fraction': 0.6,
 'metric': 'auc',
 'n_estimators': 100,
 'num_leaves': 50,
 'reg_alpha': 10,
 'reg_lambda': 10,
 'subsample_freq': 20}

In [20]:
LGB_models1 = pd.DataFrame(gs.cv_results_)
best_LGB_model1 = LGB_models1.sort_values(by='mean_test_score', ascending=False).iloc[:5]

best_LGB_model1.loc[:, ['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
7801,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
8071,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
8341,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
7828,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865
8530,"{'bagging_fraction': 0.9, 'feature_fraction': ...",0.85865


In [22]:
best_LGB_model1['params'].iloc[0]

{'bagging_fraction': 0.9,
 'feature_fraction': 0.6,
 'max_depth': 15,
 'metric': 'auc',
 'n_estimators': 100,
 'num_leaves': 50,
 'reg_alpha': 10,
 'reg_lambda': 10,
 'subsample_freq': 20}

In [26]:
best_LGB_model1['params'].iloc[4]

{'bagging_fraction': 0.9,
 'feature_fraction': 0.6,
 'max_depth': -1,
 'metric': 'auc',
 'n_estimators': 100,
 'num_leaves': 50,
 'reg_alpha': 10,
 'reg_lambda': 10,
 'subsample_freq': 20}

In [12]:
# LGB_models = pd.DataFrame(gs.cv_results_)
# best_LGB_model = LGB_models.sort_values(by='mean_test_score', ascending=False).iloc[:5]

# best_LGB_model.loc[:, ['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
2235,"{'colsample_bytree': 0.8, 'max_depth': 20, 'me...",0.853376
2009,"{'colsample_bytree': 0.8, 'max_depth': 20, 'me...",0.853376
1658,"{'colsample_bytree': 0.8, 'max_depth': 15, 'me...",0.853376
2441,"{'colsample_bytree': 0.8, 'max_depth': 25, 'me...",0.853376
2883,"{'colsample_bytree': 0.8, 'max_depth': 25, 'me...",0.853376


In [17]:
best_LGB_model['params'].iloc[2]

{'colsample_bytree': 0.8,
 'max_depth': 15,
 'metric': 'auc',
 'min_split_gain': 0.3,
 'n_estimators': 1000,
 'num_leaves': 100,
 'reg_alpha': 1.2,
 'reg_lambda': 1.1,
 'subsample': 0.9,
 'subsample_freq': 20}

Notes :
1. Try bagging of models? (mean probability and then classify)

### Day 2 params - with bagging

In [9]:
n_models = 50

In [11]:
cv_params = {
    
    'objective': 'binary',   # 'cross_entropy',
#     'n_estimators': 1000,     # 100
#     'learning_rate':  0.01,   # 0.1
#     'num_leaves' : 31,        # 31
    'num_threads' : -1, 
    
    'max_depth': -1,          #-1 (no limit)
 
    'feature_fraction': 0.6,  # 1
    'bagging_fraction': 0.8,  # 1
    'reg_lambda': 10,

    'metric': 'auc',     
    }

models = []
y_train_preds = []
y_test_preds = []

for i in range(n_models):
    
    # Over Sampling Negative Class
    pos_class = train_data[train_data['left']==1]
    neg_class = train_data[train_data['left']==0]
    neg_resampled = neg_class.sample(n=int(len(pos_class)), replace=False)

    train_data_resampled = pd.concat([pos_class, neg_resampled])

    # Use these for train_dataings
    X_train_resampled = train_data_resampled.drop('left', axis=1).values
    y_train_resampled = train_data_resampled['left']    
        
    lgb_train = lgb.Dataset(X_train_resampled, y_train_resampled)

    model = lgb.train(cv_params, lgb_train,)

    y_train_prob = model.predict(X_train_resampled)
    y_test_prob = model.predict(X_test)

    models.append(model)
    y_train_preds.append(y_train_prob)
    y_test_preds.append(y_test_prob)
    
    thres = 0.5
    y_pred_train = np.zeros(len(y_train_prob))
    y_pred_train[np.argwhere(y_train_prob>thres)] = 1
    y_pred_test = np.zeros(len(y_test_prob))
    y_pred_test[np.argwhere(y_test_prob>thres)] = 1

    
    print('\n Model : ', i)
    print('Train Accuracy =', accuracy_score(y_train_resampled, y_pred_train))
    print('Train Balanced Accuracy =', balanced_accuracy_score(y_train_resampled, y_pred_train))
    print('Train F1 Score =', f1_score(y_train_resampled, y_pred_train))

    print('\n')

    print('Test Accuracy =', accuracy_score(y_test, y_pred_test))
    print('Test Balanced Accuracy =', balanced_accuracy_score(y_test, y_pred_test))
    print('Test F1 Score =', f1_score(y_test, y_pred_test))


 Model :  0
Train Accuracy = 0.9630801687763713
Train Balanced Accuracy = 0.9630801687763713
Train F1 Score = 0.9631966351209252


Test Accuracy = 0.8605974395448079
Test Balanced Accuracy = 0.8679436116163833
Test F1 Score = 0.6573426573426573

 Model :  1
Train Accuracy = 0.9588607594936709
Train Balanced Accuracy = 0.9588607594936709
Train F1 Score = 0.9591623036649215


Test Accuracy = 0.8805120910384068
Test Balanced Accuracy = 0.8796885780593364
Test F1 Score = 0.6911764705882354

 Model :  2
Train Accuracy = 0.9630801687763713
Train Balanced Accuracy = 0.9630801687763713
Train F1 Score = 0.9635796045785641


Test Accuracy = 0.8534850640113798
Test Balanced Accuracy = 0.8599150097221351
Test F1 Score = 0.643598615916955

 Model :  3
Train Accuracy = 0.9462025316455697
Train Balanced Accuracy = 0.9462025316455696
Train F1 Score = 0.9465968586387434


Test Accuracy = 0.8662873399715505
Test Balanced Accuracy = 0.8597974032490748
Test F1 Score = 0.6594202898550725

 Model :  4
Trai


 Model :  34
Train Accuracy = 0.9567510548523207
Train Balanced Accuracy = 0.9567510548523207
Train F1 Score = 0.956887486855941


Test Accuracy = 0.8705547652916074
Test Balanced Accuracy = 0.8661481527943298
Test F1 Score = 0.6690909090909092

 Model :  35
Train Accuracy = 0.9567510548523207
Train Balanced Accuracy = 0.9567510548523207
Train F1 Score = 0.956887486855941


Test Accuracy = 0.8605974395448079
Test Balanced Accuracy = 0.8641096405946183
Test F1 Score = 0.6549295774647887

 Model :  36
Train Accuracy = 0.9588607594936709
Train Balanced Accuracy = 0.9588607594936709
Train F1 Score = 0.9590766002098636


Test Accuracy = 0.8549075391180654
Test Balanced Accuracy = 0.8530859938531017
Test F1 Score = 0.6408450704225352

 Model :  37
Train Accuracy = 0.959915611814346
Train Balanced Accuracy = 0.9599156118143459
Train F1 Score = 0.9603340292275575


Test Accuracy = 0.8677098150782361
Test Balanced Accuracy = 0.8683042714671015
Test F1 Score = 0.6666666666666667

 Model :  38
T

In [14]:
y_test_preds = np.array(y_test_preds)

In [18]:
y_test_prob_final = np.mean(y_test_preds, axis=0)


thres = 0.5

y_pred_test_final = np.zeros(len(y_test_prob_final))
y_pred_test_final[np.argwhere(y_test_prob_final>thres)] = 1


print('Test Accuracy =', accuracy_score(y_test, y_pred_test_final))
print('Test Balanced Accuracy =', balanced_accuracy_score(y_test, y_pred_test_final))
print('Test F1 Score =', f1_score(y_test, y_pred_test_final))

Test Accuracy = 0.8648648648648649
Test Balanced Accuracy = 0.8627924480963431
Test F1 Score = 0.6594982078853047


### Training 

In [35]:
# Over Sampling Negative Class

pos_class = train[train['left']==1]
neg_class = train[train['left']==0]
neg_resampled = neg_class.sample(n=int(len(pos_class)), replace=False)
    
train_resampled = pd.concat([pos_class, neg_resampled])

# Use these for trainings

X_train_resampled = train_resampled.drop('left', axis=1).values
y_train_resampled = train_resampled['left']

In [36]:
X_test = test.values

In [37]:
# cv_params = {
#     'max_depth': -1,
#     'objective': 'binary',
#     'metric':'auc',  
#     'feature_fraction': 0.6, 
#     'bagging_fraction': 0.8,
#     'reg_lambda': 10,
#     'n_estimators':1000
    
#     }

# cv_params = {
    
#     'objective': 'binary',   # 'cross_entropy',
# #     'n_estimators': 1000,     # 100
# #     'learning_rate':  0.01,   # 0.1
# #     'num_leaves' : 31,        # 31
#     'num_threads' : -1, 
    
#     'max_depth': -1,          #-1 (no limit)
    
 
#     'feature_fraction': 0.6,  # 1
#     'bagging_fraction': 0.8,  # 1
#     'reg_lambda': 10,

    
#     'metric': 'auc',     
#     }

cv_params = {'bagging_fraction': 0.9,
 'feature_fraction': 0.6,
 'metric': 'auc',
 'n_estimators': 100,
 'num_leaves': 50,
 'reg_alpha': 10,
 'reg_lambda': 10,
 'subsample_freq': 20}


lgb_train = lgb.Dataset(X_train_resampled, y_train_resampled)

model = lgb.train(cv_params, 
                lgb_train, 
                verbose_eval=1)

y_train_prob = model.predict(X_train_resampled)
y_test_prob = model.predict(X_test)

     
thres = 0.5
y_pred_train = np.zeros(len(y_train_prob))
y_pred_train[np.argwhere(y_train_prob>thres)] = 1
y_pred_test = np.zeros(len(y_test_prob))
y_pred_test[np.argwhere(y_test_prob>thres)] = 1
     
print('Train Accuracy =', accuracy_score(y_train_resampled, y_pred_train))
print('Train Balanced Accuracy =', balanced_accuracy_score(y_train_resampled, y_pred_train))
print('Train F1 Score =', f1_score(y_train_resampled, y_pred_train))

# print('\n\n')

# print('Test Accuracy =', accuracy_score(y_test, y_pred_test))
# print('Test Balanced Accuracy =', balanced_accuracy_score(y_test, y_pred_test))
# print('Test F1 Score =', f1_score(y_test, y_pred_test))
    


Train Accuracy = 0.9561101549053356
Train Balanced Accuracy = 0.9561101549053357
Train F1 Score = 0.9565957446808512


In [38]:
predictions = pd.DataFrame({'left':y_pred_test.astype(int)}, index=test.index)

In [39]:
baseline = pd.read_csv('Dataset/baseline_0.csv')

In [41]:
predictions_ord = predictions.loc[baseline['id'].values, :]

In [42]:
# Change this: format _day<no.>_<attempt no.>

# predictions_ord.to_csv('predictions_lgb_day2.csv')