**All data used in this notebook comes from Feature Engineering**

In [None]:
def draw_feature_importance(feature_importance, features):
    """Show a bar chart of features ordered from most to least important.

    Args:
        feature_importance: Iterable of importance values, aligned
            element-wise with ``features``.
        features: Iterable of feature names.

    The figure is rendered with ``plt.show()``; nothing is returned.
    """
    ranked = pd.DataFrame(
        zip(feature_importance, features),
        columns=['Value', 'Feature'],
    ).sort_values(by="Value", ascending=False)

    # Very tall canvas so that every feature gets its own readable bar.
    plt.figure(figsize=(20, 500))
    sns.barplot(data=ranked, x="Value", y="Feature")
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()

In [None]:
def get_important_features(feature_importance, feature):
    """Return features ranked by importance, most important first.

    Args:
        feature_importance: Iterable of importance values, aligned
            element-wise with ``feature``.
        feature: Iterable of feature names.

    Returns:
        A DataFrame with columns ``Value`` and ``Feature``, sorted by
        ``Value`` in descending order.
    """
    # Pair with the `feature` argument. The previous body read a name
    # `features` that is not a parameter of this function, so it either
    # raised NameError or silently used an unrelated global.
    feature_imp = pd.DataFrame(
        list(zip(feature_importance, feature)),
        columns=['Value', 'Feature'],
    )
    return feature_imp.sort_values(by='Value', ascending=False)

**Using Bayesian Optimization to find the best parameters**

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

In [None]:
def model(reduce_train, reduce_test, useful_features, n_splits, num_leaves, max_depth, min_child_weight, feature_fraction, lambda_l1, lambda_l2, 
          bagging_fraction, min_data_in_leaf, learning_rate, reg_alpha, reg_lambda, n_estimators):
    """Objective for Bayesian optimization: negated mean cross-validated RMSE.

    A LightGBM regressor is trained on each stratified fold of
    ``reduce_train`` and scored on the held-out part. The mean RMSE is
    negated so that an optimizer which *maximizes* its target ends up
    minimizing RMSE.

    Args:
        reduce_train: Training frame; must contain the ``target`` column and
            the (sanitized) feature columns.
        reduce_test: Test frame. It is only passed through ``change_json``;
            no prediction is made on it here.
        useful_features: Feature names to train on. Every non-alphanumeric
            character in a name is replaced by ``_`` before column lookup.
        n_splits: Number of stratified folds.
        num_leaves, max_depth, min_data_in_leaf, n_estimators: LightGBM
            settings; truncated with ``int()`` because the optimizer
            proposes floats.
        min_child_weight, feature_fraction, bagging_fraction, learning_rate,
        lambda_l1, lambda_l2, reg_alpha, reg_lambda: LightGBM settings
            passed through unchanged.

    Returns:
        ``-mean(rmse per fold)`` as a float.

    Relies on names defined elsewhere in the notebook: ``change_json``,
    ``StratifiedKFold``, ``lgb``, ``np``, ``target`` and ``categoricals``.
    """
    # NOTE(review): LightGBM documents `reg_alpha`/`lambda_l1` and
    # `reg_lambda`/`lambda_l2` as aliases of the same settings; both are kept
    # because callers pass all four — confirm which one should win.
    params = {
        'num_leaves': int(num_leaves),
        'max_depth': int(max_depth),
        'min_child_weight': min_child_weight,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_data_in_leaf': int(min_data_in_leaf),
        'objective': 'regression',
        "metric": 'rmse',
        'learning_rate': learning_rate,
        "boosting_type": "gbdt",
        "bagging_seed": 11,
        "verbosity": -1,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'random_state': 46,
        'num_threads': 16,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'n_estimators': int(n_estimators),
        'early_stopping': 150,
    }

    # Replace every non-alphanumeric character in the feature names by "_".
    # Presumably this mirrors what change_json does to the column names —
    # TODO confirm.
    useful_features = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in useful_features]
    reduce_train = change_json(reduce_train)
    # The result is unused here; the call is kept in case change_json has
    # side effects on the frame — TODO confirm and drop if it is pure.
    reduce_test = change_json(reduce_test)

    kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    rmse_score_list = []
    for train_index, val_index in kf.split(reduce_train, reduce_train[target]):
        X_train = reduce_train[useful_features].iloc[train_index]
        X_val = reduce_train[useful_features].iloc[val_index]
        y_train = reduce_train[target].iloc[train_index]
        y_val = reduce_train[target].iloc[val_index]

        train_set = lgb.Dataset(X_train, y_train, categorical_feature=categoricals)
        val_set = lgb.Dataset(X_val, y_val, categorical_feature=categoricals)
        # NOTE(review): the `early_stopping_rounds` keyword of lgb.train was
        # removed in LightGBM 4.0; switch to
        # `callbacks=[lgb.early_stopping(...)]` when upgrading.
        lgb_model = lgb.train(
            params,
            train_set,
            num_boost_round=params['n_estimators'],
            valid_sets=[train_set, val_set],
            early_stopping_rounds=params['early_stopping'],
        )

        val_predict = lgb_model.predict(X_val)
        # (y_true, y_pred) order, as the sklearn signature expects.
        rmse_score_list.append(np.sqrt(mean_squared_error(y_val, val_predict)))

    return -np.mean(rmse_score_list)

In [None]:
from functools import partial
# Bind the data frames, the feature list and a 2-fold CV to `model`, leaving
# only the hyperparameters as free arguments for the optimizer.
# `reduce_train`, `reduce_test` and `new_features` are defined outside this section.
partial_model = partial(model, reduce_train, reduce_test, new_features, n_splits = 2)

In [None]:
# Search space handed to BayesianOptimization: one (lower, upper) pair per
# hyperparameter. The keys must match the keyword parameters of `model`.
# `num_leaves`, `max_depth`, `min_data_in_leaf` and `n_estimators` are
# truncated to int inside `model`.
bounds_LGB = {
    'num_leaves' : (50, 100),
    'max_depth': (8, 30),
    'min_child_weight' : (0.01, 0.6),
    'min_data_in_leaf' : (80, 120),
    'feature_fraction' : (0.1, 0.8),
    'lambda_l1': (0, 10),
    'lambda_l2': (0, 10),
    'bagging_fraction': (0.2, 1),
    'learning_rate': (0.01, 0.8),
    'reg_alpha' : (0.1 , 5), 
    'reg_lambda' : (0.1, 5),
    'n_estimators' : (5000,8000)
}

In [None]:
import warnings

In [None]:
# Search budget passed to `maximize` below.
init_points = 16
n_iter = 16
# `partial_model` returns the negated mean RMSE, so maximizing it minimizes RMSE.
LGB_BO = BayesianOptimization(partial_model, bounds_LGB, random_state=1029)
# Silence every warning raised while the optimizer runs.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # NOTE(review): passing `acq`/`alpha` to maximize() depends on the installed
    # bayes_opt version — confirm against the version in use.
    LGB_BO.maximize(init_points = init_points, n_iter = n_iter, acq='ei', alpha=1e-6)

In [None]:
# Hyperparameters of the best trial recorded by the optimizer.
best_LGB_BO_params = LGB_BO.max['params']

**Use the selected parameters for cross-validation and find the best features**

In [None]:
# Hard-coded LightGBM parameter set used for the cross-validation run below.
# The keys are the same as the dict built inside `model`.
# NOTE(review): these values are typed in by hand, not read from
# `best_LGB_BO_params` — presumably copied from an earlier optimizer run; confirm.
# NOTE(review): 'reg_alpha': 50 lies outside the (0.1, 5) range searched in
# `bounds_LGB` — confirm it is intentional.
bayesian_params =  {'num_leaves': 50,  
            'max_depth' : 30,
           'min_child_weight': 0.01,
           'feature_fraction': 0.8,
           'bagging_fraction': 0.2,
           'min_data_in_leaf': 80, 
           'objective': 'regression',
           "metric": 'rmse',
           'learning_rate': 0.01, 
           "boosting_type": "gbdt",
           "bagging_seed": 11,
           "verbosity": -1,
           'reg_alpha': 50,
           'reg_lambda': 0.1,
           'random_state': 46,
           'num_threads': 16,
           'lambda_l1': 10,  
           'lambda_l2': 0, 
           'n_estimators': 5149,
           'early_stopping': 150
    }

In [None]:
# Run the regression with the fixed parameter set on all `new_features`.
# `run_lgb_regression` is defined outside this section; the meaning of the
# positional arguments 5 and 10 cannot be seen from here — TODO document.
y_pred_bayes, oof_predict_bayes, feature_importance_bayes = run_lgb_regression(reduce_train, reduce_test, new_features, 5, 10, bayesian_params)

In [None]:
# Display the second result of run_lgb_regression (presumably the out-of-fold predictions).
oof_predict_bayes

In [None]:
# Score the predictions against the training target.
# `eval_qwk_lgb_regr` is defined outside this section (presumably a quadratic weighted kappa — confirm).
eval_qwk_lgb_regr(reduce_train[target], oof_predict_bayes)

In [None]:
# Rank the features separately for each of the five folds.
feature_imp_fold_list = [
    get_important_features(feature_importance_bayes[fold_key], new_features)
    for fold_key in ('fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5')
]
# Keep the per-fold names available for later cells.
(
    feature_imp_fold_1,
    feature_imp_fold_2,
    feature_imp_fold_3,
    feature_imp_fold_4,
    feature_imp_fold_5,
) = feature_imp_fold_list

In [None]:
def merge_feature_imp(feature_imp_fold_list):
    """Combine per-fold feature importances into one table with their mean.

    Args:
        feature_imp_fold_list: Non-empty list of DataFrames, one per fold,
            each with a ``Feature`` column and a ``Value`` column.

    Returns:
        A DataFrame indexed by ``Feature`` with one column per fold
        (``fold_1`` ... ``fold_N``) and an ``average`` column holding the
        row-wise mean. Only features present in every fold are kept.

    Raises:
        IndexError: If ``feature_imp_fold_list`` is empty.
    """
    fold_columns = [f'fold_{i}' for i in range(1, len(feature_imp_fold_list) + 1)]

    # Name each fold's values up front. Merging frames that all carry a
    # `Value` column yields clashing `_x`/`_y` suffixes from the fourth fold
    # on, which recent pandas versions reject.
    per_fold = [
        fold_df.set_index('Feature')['Value'].rename(column)
        for fold_df, column in zip(feature_imp_fold_list, fold_columns)
    ]

    final_df = per_fold[0].to_frame()
    for fold_values in per_fold[1:]:
        # Inner join keeps only features seen in all folds so far.
        final_df = final_df.join(fold_values, how='inner')

    final_df['average'] = final_df[fold_columns].mean(axis=1)
    return final_df

In [None]:
# One row per feature, one column per fold, plus the mean across folds.
feature_importance_from_all_folders = merge_feature_imp(feature_imp_fold_list)

In [None]:
# Order features from highest to lowest average importance.
feature_importance_from_all_folders = feature_importance_from_all_folders.sort_values('average', ascending = False)

In [None]:
# Keep the features whose average importance lies strictly between 5 and 600.
top_features = feature_importance_from_all_folders.loc[
    feature_importance_from_all_folders['average'].gt(5)
    & feature_importance_from_all_folders['average'].lt(600),
    :,
].index

In [None]:
# Convert the index to a plain list so that entries can be appended below.
top_features = list(top_features.values)

In [None]:
# Always keep 'session_title', even if the importance filter dropped it.
if 'session_title' not in top_features:
    top_features.append('session_title')

**Using the top features, we run LightGBM and tune the parameters**

In [None]:
# Same objective as before, restricted to `top_features`; the trailing 5 binds
# `n_splits` positionally (5-fold CV).
partial_model_top_features = partial(model, reduce_train, reduce_test, top_features, 5)

In [None]:
# Search budget passed to `maximize` below.
init_points = 16
n_iter = 16
# Re-run the search over the same bounds, now with the reduced feature set.
LGB_BO_top_features = BayesianOptimization(partial_model_top_features, bounds_LGB, random_state = 1029)
# Silence every warning raised while the optimizer runs.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    # NOTE(review): passing `acq`/`alpha` to maximize() depends on the installed
    # bayes_opt version — confirm against the version in use.
    LGB_BO_top_features.maximize(init_points = init_points, n_iter = n_iter, acq='ei', alpha=1e-6)

In [None]:
# Hyperparameters of the best trial on the reduced feature set.
best_LGB_BO_top_features_params = LGB_BO_top_features.max['params']

In [None]:
# Display the optimizer's best recorded result.
LGB_BO_top_features.max

In [None]:
# Display the best hyperparameters found on the reduced feature set.
best_LGB_BO_top_features_params

In [None]:
# Run the regression on the reduced feature set.
# NOTE(review): `bayesian_params_top_features` is not assigned anywhere in the
# visible notebook (only `best_LGB_BO_top_features_params` is) — confirm it is
# defined elsewhere, otherwise this cell raises NameError.
y_pred_bayes_top_features, oof_predict_bayes_top_features, feature_importance_bayes_top_features = run_lgb_regression(reduce_train, reduce_test, 
                                                                                             top_features, 5, 10, bayesian_params_top_features)

In [None]:
# Display the second result of run_lgb_regression (presumably the out-of-fold predictions).
oof_predict_bayes_top_features

In [None]:
# Score the reduced-feature predictions against the training target.
# `eval_qwk_lgb_regr` is defined outside this section.
eval_qwk_lgb_regr(reduce_train[target], oof_predict_bayes_top_features)

In [None]:
def reg_to_cat(y_regress):
    """Turn continuous predictions into the classes 0, 1, 2 and 3.

    The three cut points are percentiles of ``y_regress`` taken at the
    cumulative share of classes 0, 1 and 2 in
    ``reduce_train['accuracy_group']``, so the predicted classes follow
    roughly the same proportions as the training labels.

    Args:
        y_regress: Array-like of regression outputs.

    Returns:
        A NumPy integer array with one class label per input value.
    """
    n_rows = len(reduce_train)
    label_counts = Counter(reduce_train['accuracy_group'])

    cumulative_share = 0
    cutoffs = []
    for label in range(3):
        cumulative_share += label_counts[label] / n_rows
        cutoffs.append(np.percentile(y_regress, cumulative_share * 100))

    # side='left' gives the index of the first cutoff that is >= x, i.e.
    # x <= cutoffs[0] -> 0, x <= cutoffs[1] -> 1, x <= cutoffs[2] -> 2, else 3.
    return np.searchsorted(cutoffs, y_regress, side='left')

In [None]:
# Convert the reduced-feature test predictions into class labels 0-3.
y_pred = reg_to_cat(y_pred_bayes_top_features)

In [None]:
# Write the predicted classes into the submission frame and save it.
# `sample_submission` is defined outside this section.
sample_submission['accuracy_group'] = y_pred.astype(int)
sample_submission.to_csv('./submission.csv', index=False)
# Display the share of each predicted class as a sanity check.
sample_submission['accuracy_group'].value_counts(normalize=True)