In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score, accuracy_score,confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
   

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
# calculate class weights based on the training data

In [24]:
test = pd.read_parquet('/Users/natalie/Desktop/DS Thesis/Code/data/test.parquet')
train = pd.read_parquet('/Users/natalie/Desktop/DS Thesis/Code/data/train.parquet')

In [25]:
# drop column province
test = test.drop(columns=['province'])
train = train.drop(columns=['province'])

In [27]:
TARGET ='churn_user'
CATEGORICAL_FEATURES  = ['os_name', 'age_group','gender', 'country', 'region', 'province_type']
DATETIME_FEATURES  = ['first_date', 'lastest_active_day']
SEARCH_CC_FEATURES = [ 'clicks', 'search_volume', 'dating_search', 'videoclip_search', 'technical_search', 'housekeeping_family_search', 'marketing_search', 'other_search']
SEARCH_GG_FEATURES = [ 'serp_click', 'search_volume_gg', 'search_clicks_gg', 'other_search_gg','housekeeping_family_search_gg','videoclip_search_gg', 'dating_search_gg', 'marketing_search_gg', 'technical_search_gg']
ACTIVE_FEATURES = ['active_day', 'life_time',  'not_active_day', 'total_active_time']
ADS_FEATURES =  ['ads_impression', 'ads_click', 'ads_revenue']
OTHERS_FEATURES =[ 'newtab_count', 'download_count', 'pip_count', 'sidebar_count', 'incognito_count', 'signin_count', 'youtube_count',
                    'work_count', 'social_count', 'news_count', 'entertainment_count', 'ecommerce_count']
NUMERICAL_FEATURES = SEARCH_CC_FEATURES + SEARCH_GG_FEATURES + ACTIVE_FEATURES + ADS_FEATURES + OTHERS_FEATURES
MODEL_NAMES = ['log_reg', 'randomforest','lightgbm', 'xgboost', 'mlp']


In [30]:
def find_best_threshold_f1_score(train_labels, oofs, average='macro'):
    scores = []
    thresholds = []
    best_score = 0
    best_threshold = 0
    for threshold in np.arange(0.1, 0.9, 0.01):
        print(f'{threshold:.02f}, ', end='')
        preds = (oofs > threshold).astype('int')
        m = f1_score(train_labels, preds, average=average)
        scores.append(m)
        thresholds.append(threshold)
        if m > best_score:
            best_score = m
            best_threshold = threshold
    return best_threshold, best_score


In [31]:
def fillna(df):
    df['total_active_time'] = df['total_active_time'].fillna(0)
    df['ads_impression'] = df['ads_impression'].fillna(0)
    df['ads_click'] = df['ads_click'].fillna(0)
    df['ads_revenue'] = df['ads_revenue'].fillna(0)
    df['clicks'] = df['clicks'].fillna(0)
    for c in OTHERS_FEATURES:
        df[c] = df[c].fillna(0)
    return df

def process_data(df,oh_encoder=None, robust_scaler=None,agg_features=None):
    if not oh_encoder:
        print("fit train OneHotEncoder")
        oh_encoder = OneHotEncoder()
        oh_encoder.fit(df[CATEGORICAL_FEATURES])
    else:
        print("loadd onehot encoder")
    if not robust_scaler:
        print("fit train RobustScaler")
        robust_scaler = RobustScaler()
        robust_scaler.fit(df[NUMERICAL_FEATURES])
    else:
        print("loadd robust scaler")
    df_cat = pd.DataFrame(oh_encoder.transform(df[CATEGORICAL_FEATURES]).toarray())
    new_cat_cols = oh_encoder.get_feature_names_out(CATEGORICAL_FEATURES)
    df_cat.columns = new_cat_cols
    df_num = pd.DataFrame(robust_scaler.transform(df[NUMERICAL_FEATURES]))
    df_num.columns = NUMERICAL_FEATURES
    new_df = pd.concat([df_cat.reset_index(drop=True), df_num.reset_index(drop=True)], axis=1)
    new_df = fillna(new_df)
    return new_df, oh_encoder, robust_scaler

In [39]:
N_ESTIMATORS = 200
SEED=42

LGBM_Hyperparameters = {
    "n_estimators": N_ESTIMATORS,
    'learning_rate': 0.09249779656872704,
    'max_depth': 8,
    'colsample_bytree': 1.0,
    'subsample': 0.1,
    'reg_lambda': 0.9648085464373999,
    'reg_alpha': 0.9903394475964249,
    'random_state': SEED
    #'device':'gpu',
    #"class_weight": "balanced"
}

XGBoost_Hyperparameters = {
    'objective': 'binary:logistic',
    'eval_metric': ['logloss', 'auc'],
    'n_estimators': N_ESTIMATORS,
    'learning_rate': 0.034630277480196384,
    'max_depth': 9,
    'colsample_bytree': 0.8,
    'subsample': 0.30000000000000004,
    'reg_alpha': 0.0020136244579038245,
    'reg_lambda': 1.3270228907353322e-06,
    'seed': SEED,
    # 'scale_pos_weight':3,
    'enable_categorical':True,
    'early_stopping_rounds': 50,
    #'tree_method':'gpu_hist'
}

RF_Hyperparameters = {
    'n_estimators': N_ESTIMATORS,
    'max_depth': 9,
    'max_features': 0.5, #rf_items['colsample_bytree']
    'random_state':SEED,
    'n_jobs': -1
}

MLP_Hyperparametesr = {
    'hidden_layer_sizes':(5,125), #'hidden_layer_sizes': 5
    'random_state':SEED, 
    'max_iter':min(N_ESTIMATORS,100)}


Logreg_Hyperparameters = {'max_iter':N_ESTIMATORS,'random_state':SEED}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

def cross_validate(train, USE_SMOTE=False, USE_CLASS_WEIGHT=False, USE_UNDER_SAMPLING=False):
    oofs = np.zeros((train.shape[0], len(MODEL_NAMES)))
    for i, (train_index, valid_index) in enumerate(kfold.split(train, train[TARGET])):
        print(f"===========fold {i}================")
        X_train, oh_encoder, robust_scaler = process_data(train.iloc[train_index])
        X_valid, _, _  = process_data(train.iloc[valid_index], oh_encoder,robust_scaler)
        print(X_train.isnull().sum())
        y_train = train.iloc[train_index][TARGET].values
        y_valid = train.iloc[valid_index][TARGET].values
        logreg_hyperparameters = Logreg_Hyperparameters.copy()
        lgb_hyperparameters = LGBM_Hyperparameters.copy()
        xgboost_hyperparameters = XGBoost_Hyperparameters.copy()
        rf_hyperparameters = RF_Hyperparameters.copy()
        mlp_hyperparameters = MLP_Hyperparametesr.copy()
        if USE_SMOTE:
            print("SMOTEEEE")
            sm = SMOTE(random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)
        elif USE_CLASS_WEIGHT:
            print("CLASS_WEIGHTTTT")
           
            class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
            class_weights =  {0: class_weights[0], 1: class_weights[1]}
            lgb_hyperparameters['class_weight'] = class_weights
            logreg_hyperparameters['class_weight'] = class_weights
            xgboost_hyperparameters['scale_pos_weight'] = class_weights[1] /  class_weights[0]
            rf_hyperparameters['class_weight'] = class_weights
        elif USE_UNDER_SAMPLING:
            print("UNDER SAMPLING")
            rus = RandomUnderSampler(random_state=42)
            X_train, y_train = rus.fit_resample(X_train, y_train)
        print("LOGREG--------------")
        logreg_model = LogisticRegression(**logreg_hyperparameters)
        logreg_model.fit(X_train, y_train)
        logreg_y_pred_proba = logreg_model.predict_proba(X_valid)[:,1]
        best_threshold, best_score = find_best_threshold_f1_score(y_valid, logreg_y_pred_proba)
        print(f"\n best_threshold {best_threshold} best_score {best_score}")
        y_pred = [1 if y_hat >=best_threshold else 0 for y_hat in logreg_y_pred_proba]

        print(roc_auc_score(y_valid, logreg_y_pred_proba))

        print(classification_report(y_valid, y_pred, digits=4))
        oofs[valid_index,0] = logreg_y_pred_proba

        print("Random Forest--------------")
        rf_model = RandomForestClassifier(**rf_hyperparameters)
        rf_model.fit(X_train, y_train)
        rf_y_pred_proba = rf_model.predict_proba(X_valid)[:,1]
        best_threshold, best_score = find_best_threshold_f1_score(y_valid, rf_y_pred_proba)
        print(f"\n best_threshold {best_threshold} best_score {best_score}")
        y_pred = [1 if y_hat >=best_threshold else 0 for y_hat in rf_y_pred_proba]

        print(roc_auc_score(y_valid, rf_y_pred_proba))

        print(classification_report(y_valid, y_pred, digits=4))
        oofs[valid_index,1] = rf_y_pred_proba
    #     models.append(model)
        print("LGBModel--------------")
        lgb_model = LGBMClassifier(**lgb_hyperparameters)
        callbacks = [lgb.early_stopping(200, verbose=50), lgb.log_evaluation(period=50)]
        lgb_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric=["logloss", "auc"],
                  callbacks=callbacks)

        lgb_y_pred_proba = lgb_model.predict_proba(X_valid)[:,1]
        best_threshold, best_score = find_best_threshold_f1_score(y_valid, lgb_y_pred_proba)
        print(f"\n best_threshold {best_threshold} best_score {best_score}")
        y_pred = [1 if y_hat >=best_threshold else 0 for y_hat in lgb_y_pred_proba]
        print(roc_auc_score(y_valid, lgb_y_pred_proba))
        print(classification_report(y_valid, y_pred, digits=4))
        oofs[valid_index,2] = lgb_y_pred_proba
    #     models.append(model)
        # display(pd.DataFrame({'score': lgb_model.feature_importances_, 'feature': lgb_model.feature_name_}).sort_values('score',ascending=False))

        print("XGBoost--------------")
        xgb_model = XGBClassifier(**xgboost_hyperparameters)
        xgb_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=50)
        xgb_y_pred_proba = xgb_model.predict_proba(X_valid)[:,1]
        best_threshold, best_score = find_best_threshold_f1_score(y_valid, xgb_y_pred_proba)
        print(f"\n best_threshold {best_threshold} best_score {best_score}")
        y_pred = [1 if y_hat >=best_threshold else 0 for y_hat in xgb_y_pred_proba]
        print(roc_auc_score(y_valid, xgb_y_pred_proba))
        print(classification_report(y_valid, y_pred, digits=4))
        oofs[valid_index, 3] = xgb_y_pred_proba
        
        print("MLP------------------")
        mlp_model = MLPClassifier(**mlp_hyperparameters)
        mlp_model.fit(X_train, y_train)
        mlp_y_pred_proba = mlp_model.predict_proba(X_valid)[:,1]
        best_threshold, best_score = find_best_threshold_f1_score(y_valid, mlp_y_pred_proba)
        print(f"\n best_threshold {best_threshold} best_score {best_score}")
        y_pred = [1 if y_hat >=best_threshold else 0 for y_hat in mlp_y_pred_proba]
        print(roc_auc_score(y_valid, mlp_y_pred_proba))
        print(classification_report(y_valid, y_pred, digits=4))
        oofs[valid_index, 4] = mlp_y_pred_proba
    return oofs

In [35]:
accuracy_scores = []
f1_scores = []
auc_scores = []
specificity_scores = []
sensitivity_scores = []
def scoring(y_test,y_pred_proba, best_threshold):
    y_pred = [1 if y_hat >= best_threshold else 0 for y_hat in y_pred_proba]
    acc = accuracy_score(y_test, y_pred)
    _f1_score = f1_score(y_test, y_pred, average='macro')
    auc_score = roc_auc_score(y_test, y_pred_proba)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    print("accuracy", acc)
    print("f1_score", _f1_score)
    print("auc", auc_score)
    print("sensitivity", sensitivity, "specificity", specificity)
    print(classification_report(y_test, y_pred, digits=4))
    return acc, _f1_score, auc_score, specificity, sensitivity

In [36]:
def train_model(oofs,X_train, y_train, X_test, y_test, USE_SMOTE=False,USE_CLASS_WEIGHT=False, USE_UNDER_SAMPLING=False):
    models = []
    predictions = []
    accuracy_scores = []
    f1_scores = []
    auc_scores = []
    specificity_scores = []
    sensitivity_scores = []
    best_thresholds = []
    for i in range(len(MODEL_NAMES)):
        best_threshold, best_score = find_best_threshold_f1_score(train[TARGET].values, oofs[:,i])
        best_thresholds.append(best_threshold)
        print('\n',best_threshold, best_score)
    logreg_hyperparameters = Logreg_Hyperparameters.copy()
    lgb_hyperparameters = LGBM_Hyperparameters.copy()
    xgboost_hyperparameters = XGBoost_Hyperparameters.copy()
    del xgboost_hyperparameters['early_stopping_rounds']
    rf_hyperparameters = RF_Hyperparameters.copy()
    mlp_hyperparameters = MLP_Hyperparametesr.copy()
    if USE_SMOTE:
        print("SMOTEEEE")
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    elif USE_CLASS_WEIGHT:
        print("CLASS_WEIGHTTTT")
        class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
        class_weights =  {0: class_weights[0], 1: class_weights[1]}
        lgb_hyperparameters['class_weight'] = class_weights
        logreg_hyperparameters['class_weight'] = class_weights
        xgboost_hyperparameters['scale_pos_weight'] = class_weights[1]/ class_weights[0]
        rf_hyperparameters['class_weight'] = class_weights
    elif USE_UNDER_SAMPLING:
        print("UNDER SAMPLING")
        rus = RandomUnderSampler(random_state=42)
        X_train, y_train = rus.fit_resample(X_train, y_train)

    print("LOGREG--------------")
    logreg_model = LogisticRegression(**logreg_hyperparameters)
    logreg_model.fit(X_train, y_train)
    logreg_y_pred_proba = logreg_model.predict_proba(X_test)[:,1]
    acc, _f1_score, auc_score, specificity, sensitivity = scoring(y_test,logreg_y_pred_proba,best_thresholds[0])
    accuracy_scores.append(acc)
    f1_scores.append(_f1_score)
    auc_scores.append(auc_score)
    specificity_scores.append(specificity)
    sensitivity_scores.append(sensitivity)
    predictions.append(logreg_y_pred_proba)
    models.append(logreg_model)

    print("Random Forest--------------")
    rf_model = RandomForestClassifier(**rf_hyperparameters)
    rf_model.fit(X_train, y_train)
    rf_y_pred_proba = rf_model.predict_proba(X_test)[:,1]
    acc, _f1_score, auc_score, specificity, sensitivity = scoring(y_test,rf_y_pred_proba,best_thresholds[1])
    accuracy_scores.append(acc)
    f1_scores.append(_f1_score)
    auc_scores.append(auc_score)
    specificity_scores.append(specificity)
    sensitivity_scores.append(sensitivity)
    predictions.append(rf_y_pred_proba)
    models.append(rf_model)

    print("LGBModel--------------")
    lgb_model = LGBMClassifier(**lgb_hyperparameters)
    lgb_model.fit(X_train, y_train)
    lgb_y_pred_proba = lgb_model.predict_proba(X_test)[:,1]
    acc, _f1_score, auc_score, specificity, sensitivity = scoring(y_test,lgb_y_pred_proba,best_thresholds[2])
    accuracy_scores.append(acc)
    f1_scores.append(_f1_score)
    auc_scores.append(auc_score)
    specificity_scores.append(specificity)
    sensitivity_scores.append(sensitivity)
    predictions.append(lgb_y_pred_proba)
    models.append(lgb_model)

    print("XGBoost--------------")
    print(xgboost_hyperparameters)
    xgb_model = XGBClassifier(**xgboost_hyperparameters)
    xgb_model.fit(X_train, y_train)
    xgb_y_pred_proba = xgb_model.predict_proba(X_test)[:,1]
    acc, _f1_score, auc_score, specificity, sensitivity = scoring(y_test,xgb_y_pred_proba,best_thresholds[3])
    accuracy_scores.append(acc)
    f1_scores.append(_f1_score)
    auc_scores.append(auc_score)
    specificity_scores.append(specificity)
    sensitivity_scores.append(sensitivity)
    predictions.append(xgb_y_pred_proba)
    models.append(xgb_model)
    
    print("MLP--------------")
    mlp_model = MLPClassifier(**mlp_hyperparameters)
    mlp_model.fit(X_train, y_train)
    mlp_y_pred_proba = mlp_model.predict_proba(X_test)[:,1]
    acc, _f1_score, auc_score, specificity, sensitivity = scoring(y_test,mlp_y_pred_proba,best_thresholds[4])
    accuracy_scores.append(acc)
    f1_scores.append(_f1_score)
    auc_scores.append(auc_score)
    specificity_scores.append(specificity)
    sensitivity_scores.append(sensitivity)
    predictions.append(mlp_y_pred_proba)
    models.append(mlp_model)

    print(MODEL_NAMES)
    print(accuracy_scores)
    print(f1_scores)
    print(auc_scores)
    print(specificity_scores)
    print(sensitivity_scores)
    score_df = pd.DataFrame({'model_name': MODEL_NAMES,
                         'accuracy_score':accuracy_scores, 
                         'f1_score': f1_scores, 
                         'auc_score': auc_scores, 
                         'specificity_score': specificity_scores, 
                         'sensitivity_score': sensitivity_scores})
    return score_df,models, predictions

In [37]:
X_train,oh_encoder,robust_scaler = process_data(train)
X_test, _,_ = process_data(test,oh_encoder,robust_scaler)
y_train = train[TARGET].values
y_test = test[TARGET].values

fit train OneHotEncoder
fit train RobustScaler
loadd onehot encoder
loadd robust scaler


In [40]:
print("Not handle Balanced")
nothing_oofs = cross_validate(train)
nothing_score_df, nothing_models, nothing_predictions = train_model(nothing_oofs,X_train, y_train, X_test, y_test)

Not handle Balanced
fit train OneHotEncoder
fit train RobustScaler
loadd onehot encoder
loadd robust scaler
os_name_macos                    0
os_name_windows                  0
age_group_15-17                  0
age_group_18-24                  0
age_group_25-34                  0
age_group_35-44                  0
age_group_45-54                  0
age_group_55+                    0
age_group_under 14               0
age_group_unknown                0
gender_female                    0
gender_male                      0
gender_unknown                   0
country_IN                       0
country_UNKNOWN                  0
country_VN                       0
region_Central Vietnam           0
region_Northern Vietnam          0
region_Southern Vietnam          0
region_None                      0
province_type_rural              0
province_type_unknown            0
province_type_urban              0
clicks                           0
search_volume                    0
dating_search    

In [41]:
import pickle

In [42]:
with open('/Users/natalie/Desktop/DS Thesis/user-churn-prediction/checkpoints/nothing_model_after_tune.pkl', 'wb') as f:
    pickle.dump({
        "score_df": nothing_score_df,
        "oofs": nothing_oofs,
        "models": nothing_models,
        "model_names": MODEL_NAMES,
        "predictions":nothing_predictions},f,protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

In [44]:
print("Use SMOTE")
smote_oofs = cross_validate(train, USE_SMOTE=True)
smote_score_df, smote_models, smote_predictions = train_model(smote_oofs,X_train, y_train, X_test, y_test, USE_SMOTE=True)

Use SMOTE
fit train OneHotEncoder
fit train RobustScaler
loadd onehot encoder
loadd robust scaler
os_name_macos                    0
os_name_windows                  0
age_group_15-17                  0
age_group_18-24                  0
age_group_25-34                  0
age_group_35-44                  0
age_group_45-54                  0
age_group_55+                    0
age_group_under 14               0
age_group_unknown                0
gender_female                    0
gender_male                      0
gender_unknown                   0
country_IN                       0
country_UNKNOWN                  0
country_VN                       0
region_Central Vietnam           0
region_Northern Vietnam          0
region_Southern Vietnam          0
region_None                      0
province_type_rural              0
province_type_unknown            0
province_type_urban              0
clicks                           0
search_volume                    0
dating_search              

In [45]:
with open('/Users/natalie/Desktop/DS Thesis/user-churn-prediction/checkpoints/smote_model_after_tune.pkl', 'wb') as f:
     pickle.dump({
             "score_df":smote_score_df,
             "oofs": smote_oofs,
             "models": smote_models,
             "model_names": MODEL_NAMES,
             "predictions":smote_predictions},f,protocol=pickle.HIGHEST_PROTOCOL)  

In [46]:
print("Use Class weight")
class_weight_oofs = cross_validate(train, USE_CLASS_WEIGHT=True)
class_weight_score_df, class_weight_models, class_weight_predictions = train_model(class_weight_oofs,X_train, y_train, X_test, y_test, USE_CLASS_WEIGHT=True)

Use Class weight
fit train OneHotEncoder
fit train RobustScaler
loadd onehot encoder
loadd robust scaler
os_name_macos                    0
os_name_windows                  0
age_group_15-17                  0
age_group_18-24                  0
age_group_25-34                  0
age_group_35-44                  0
age_group_45-54                  0
age_group_55+                    0
age_group_under 14               0
age_group_unknown                0
gender_female                    0
gender_male                      0
gender_unknown                   0
country_IN                       0
country_UNKNOWN                  0
country_VN                       0
region_Central Vietnam           0
region_Northern Vietnam          0
region_Southern Vietnam          0
region_None                      0
province_type_rural              0
province_type_unknown            0
province_type_urban              0
clicks                           0
search_volume                    0
dating_search       

In [47]:
with open('/Users/natalie/Desktop/DS Thesis/user-churn-prediction/checkpoints/class_weight_model_after_tune.pkl', 'wb') as f:
    pickle.dump({
            "score_df":class_weight_score_df,
            "oofs":   class_weight_oofs,
            "models": class_weight_models,
            "model_names": MODEL_NAMES,
            "predictions":class_weight_predictions},f,protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
print("Use Under sampling")
under_sampling_oofs = cross_validate(train, USE_UNDER_SAMPLING=True)
under_sampling_score_df, under_sampling_models, under_sampling_predictions = train_model(under_sampling_oofs,X_train, y_train, X_test, y_test, USE_UNDER_SAMPLING=True)

Use Under sampling
fit train OneHotEncoder
fit train RobustScaler
loadd onehot encoder
loadd robust scaler
os_name_macos                    0
os_name_windows                  0
age_group_15-17                  0
age_group_18-24                  0
age_group_25-34                  0
age_group_35-44                  0
age_group_45-54                  0
age_group_55+                    0
age_group_under 14               0
age_group_unknown                0
gender_female                    0
gender_male                      0
gender_unknown                   0
country_IN                       0
country_UNKNOWN                  0
country_VN                       0
region_Central Vietnam           0
region_Northern Vietnam          0
region_Southern Vietnam          0
region_None                      0
province_type_rural              0
province_type_unknown            0
province_type_urban              0
clicks                           0
search_volume                    0
dating_search     

In [49]:
with open('/Users/natalie/Desktop/DS Thesis/user-churn-prediction/checkpoints/under_sampling_model_after_tune.pkl', 'wb') as f:
     pickle.dump({
             "score_df":under_sampling_score_df,
             "oofs": under_sampling_oofs,
             "models": under_sampling_models,
             "model_names": MODEL_NAMES,
             "predictions":under_sampling_predictions},f,protocol=pickle.HIGHEST_PROTOCOL)