## Credit Card Fraud Standalone Functions

In [5]:
def split_data(test_size,val_size,x_features,y_target):
    """Split the data into a hold-out test set and a combined train/validation set.

    The split is stratified on ``y_target`` and uses a fixed seed
    (``random_state=42``), so repeated calls on the same data give the same
    partition.

    Args:
        test_size: Size of the test set, forwarded unchanged to
            ``train_test_split``.
        val_size: Not used. No separate validation split is carved out here;
            the parameter is kept so existing callers keep working.
        x_features: Feature matrix to split.
        y_target: Target labels, also used as the stratification key.

    Returns:
        Tuple ``(X_test, y_test, X_trainVal, y_trainVal)``.
    """
    # No validation fraction is derived from val_size: it would be dead code
    # and val_size / (1 - test_size) divides by zero when test_size == 1.
    X_trainVal, X_test, y_trainVal, y_test = train_test_split(
        x_features, y_target, test_size=test_size, random_state=42, stratify=y_target
    )

    return X_test,y_test,X_trainVal,y_trainVal

In [14]:
def cv_learning_curve(x_data,y_data,folds,penalty,step):
    """Plot an F1 learning curve for logistic regression using cross validation.

    Stratified subsets of growing size (``step``, ``2*step``, ... strictly
    below 100%) are cross validated, followed by one run on the full data
    set. The mean train and validation F1 scores are plotted against the
    subset size in percent.

    Args:
        x_data: Feature matrix.
        y_data: Target labels, also used to stratify the subsets.
        folds: ``cv`` argument passed to ``cross_validate``.
        penalty: Passed to ``LogisticRegression`` as ``C`` (despite the name;
            the penalty type itself is fixed to ``'l2'``).
        step: Fraction of the data added per step, e.g. ``0.1``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    clf1 = LogisticRegression(random_state=0,penalty='l2',C=penalty,solver='lbfgs')

    # Floating point drift in np.arange can produce a last element that is
    # (almost) 1.0; that would leave train_test_split with nothing to hold
    # out, and the full data set is evaluated separately below anyway.
    fractions = [size for size in np.arange(step,1.00,step) if size < 1.0 - 1e-9]
    total_runs = len(fractions) + 1  # partial subsets plus the final full run

    def _mean_f1(features, target, data_size):
        # Cross validate once and reduce the per-fold F1 scores to their means.
        scores = cross_validate(clf1, features, target, cv=folds, scoring='f1', return_train_score=True)
        return {'data_size':data_size,'train_score':np.mean(scores['train_score']),'test_score':np.mean(scores['test_score'])}

    learning_stats = []
    for run, size in enumerate(fractions, start=1):
        clear_output()
        # Report the run number, not the percentage, so "x of y" is consistent.
        display("Cross Validation Test {} of {}...".format(run, total_runs))

        X_cv, _, y_cv, _ = train_test_split(x_data, y_data, test_size=1-size, random_state=42, stratify=y_data)
        learning_stats.append(_mean_f1(X_cv, y_cv, size*100))

    clear_output()
    display("Cross Validation Test {} of {}...".format(total_runs, total_runs))
    learning_stats.append(_mean_f1(x_data, y_data, 100))

    learning_df = pd.DataFrame(learning_stats)

    fig, ax1 = plt.subplots()
    fig.set_size_inches(15,10)

    ax1.plot(learning_df['data_size'],learning_df['train_score'],color='red')
    ax1.plot(learning_df['data_size'],learning_df['test_score'],color='green')
    ax1.set_ylabel('F1 Score')
    ax1.set_xlabel('Data Set Percentage of Total Training Validation Set')
    ax1.set_title('Learning Curve')

    fig.legend(['Train Score','Validation Score'],loc=(0.75, 0.82), prop={'size':12})

    ax1.grid()
    plt.show()

In [4]:
def polynomial_test(x_data,y_data,folds,degree_list,reg_list):
    """Grid-search polynomial degree and regularisation strength with cross validation.

    For every combination of ``degree_list`` and ``reg_list`` the features are
    expanded with ``PolynomialFeatures`` and an L2 logistic regression is
    cross validated on the F1 score. The mean train and test scores per
    combination are displayed as a DataFrame.

    Args:
        x_data: Feature matrix to expand.
        y_data: Target labels.
        folds: ``cv`` argument passed to ``cross_validate``.
        degree_list: Polynomial degrees to try.
        reg_list: Values of ``C`` to try.

    Returns:
        None. The result table is shown with ``display``.
    """
    search_space = ParameterGrid({"degree":degree_list,"penalty": ['l2'], 'C': reg_list})
    n_combos = len(search_space)
    rows = []

    for run, combo in enumerate(search_space, start=1):
        clear_output()
        print(f"Polynomial test {run} of {n_combos}...")

        degree = combo['degree']
        strength = combo['C']

        # Expand the raw features for this degree, then score the model on them.
        expander = PolynomialFeatures(degree = degree, interaction_only=False, include_bias=True)
        expanded = expander.fit_transform(x_data)
        model = LogisticRegression(random_state=0,penalty='l2',C=strength,solver='lbfgs')
        cv_out = cross_validate(model, expanded, y_data, cv=folds, scoring='f1', return_train_score=True)

        rows.append({
            'Polynomial_Degree': degree,
            'Regularize': strength,
            'train_score': np.mean(cv_out['train_score']),
            'test_score': np.mean(cv_out['test_score']),
        })

    display(pd.DataFrame(rows))

In [5]:
def cv_test(x_data,y_data,folds,regularize,penalty, drop):
    """Evaluate a logistic regression with stratified K-fold cross validation.

    For each fold a model is fitted on the training part and precision,
    recall and F1 for the positive class (label 1, probability threshold 0.5)
    are computed on both the training and the held-out part.

    Args:
        x_data: Feature DataFrame (indexed positionally with ``.iloc``).
        y_data: Target Series (indexed positionally with ``.iloc``).
        folds: Number of stratified folds.
        regularize: Value passed to ``LogisticRegression`` as ``C``.
        penalty: Penalty type passed to ``LogisticRegression``.
        drop: Free-form tag that is only recorded in the ``Drop`` column.

    Returns:
        Tuple ``(scores, mean_scores)``: the per-fold results and a one-row
        DataFrame holding the mean of the numeric columns.
    """
    # Folds are unshuffled and therefore deterministic. random_state must not
    # be passed together with shuffle=False: it has no effect and recent
    # scikit-learn versions raise a ValueError for that combination.
    skf = StratifiedKFold(n_splits=folds, shuffle=False)
    strat_scores = []

    for train_index, test_index in skf.split(x_data, y_data):
        x_train, y_train = x_data.iloc[train_index], y_data.iloc[train_index]
        x_test, y_test = x_data.iloc[test_index], y_data.iloc[test_index]

        clf2 = LogisticRegression(random_state=0,penalty=penalty,C=regularize,solver='lbfgs').fit(x_train,y_train)

        # Classify as positive when the predicted probability exceeds 0.5.
        y_pred_train = np.array(clf2.predict_proba(x_train)[:,1] > 0.5).astype(int)
        y_pred_test = np.array(clf2.predict_proba(x_test)[:,1] > 0.5).astype(int)

        train_result = precision_recall_fscore_support(y_train, y_pred_train, average='binary',pos_label=1)
        test_result = precision_recall_fscore_support(y_test, y_pred_test, average='binary',pos_label=1)

        strat_scores.append({'Train_Score':train_result[2],'Train_Recall':train_result[1], 'Train_Precision':train_result[0],
        'Test_Score':test_result[2], 'Test_Recall':test_result[1], 'Test_Precision':test_result[0],'Regularization':regularize,
        'Penalty':penalty, 'Drop':drop})

    scores = pd.DataFrame(strat_scores)

    # numeric_only=True skips the string/list columns ('Penalty', possibly
    # 'Drop'); without it pandas >= 2.0 raises a TypeError on them.
    return scores,pd.DataFrame(scores.mean(numeric_only=True)).T

In [7]:
def feature_importance(x_data,y_data,folds):
    """Score every column by scrambling it and re-evaluating a fitted model.

    For each stratified fold an L2 logistic regression (``C=0.01``) is fitted
    on the training part. Then, one column at a time, that column's values
    are randomly permuted in the held-out part and precision, recall and F1
    for the positive class are recorded. The per-fold tables are averaged and
    the ten rows with the highest F1 are returned.

    The permutation uses ``np.random.permutation`` without a fixed seed, so
    results vary between calls.

    Args:
        x_data: Feature DataFrame. The averaging step also sums and divides
            the ``Column`` values, so the column labels must be numeric.
        y_data: Target Series.
        folds: Number of stratified folds.

    Returns:
        DataFrame with columns ``Test_Score``, ``Test_Recall``,
        ``Test_Precision`` and ``Column``, sorted by ``Test_Score`` descending
        and limited to ten rows.
    """
    # random_state is omitted: with shuffle=False it has no effect and recent
    # scikit-learn versions reject the combination.
    skf = StratifiedKFold(n_splits=folds, shuffle=False)
    cv_results = []

    for train_index, test_index in skf.split(x_data, y_data):

        clf2 = LogisticRegression(random_state=0,penalty='l2',C=0.01,solver='lbfgs').fit(x_data.iloc[train_index],y_data.iloc[train_index])

        x_test = x_data.iloc[test_index]
        y_test = y_data.iloc[test_index]
        fold_scores = []

        for col in x_data.columns:

            # Scramble a single column on a fresh copy of the held-out fold.
            X = x_test.copy()
            X[col] = np.random.permutation(X[col].values)

            y_prob_test = clf2.predict_proba(X)
            y_pred_test = np.array(y_prob_test[:,1] > 0.5).astype(int)

            test_result = precision_recall_fscore_support(y_test, y_pred_test, average='binary',pos_label=1)
            fold_scores.append({'Test_Score':test_result[2], 'Test_Recall':test_result[1], 'Test_Precision':test_result[0],'Column':col})

        cv_results.append(pd.DataFrame(fold_scores))

    # Average the fold tables element-wise (every table has one row per column
    # in the same order, so rows line up by position).
    cv_results_total = (reduce(lambda x, y: x.add(y, fill_value=0), cv_results))/(len(cv_results))
    cv_sorted = cv_results_total.sort_values(by = 'Test_Score', ascending= False, axis = 0).head(10)
    cv_sorted.reset_index(drop=True, inplace=True)

    return cv_sorted

In [8]:
def feat_select_recursive_scrambling(x_data,y_data,runs,cv_folds):
    """Iteratively drop the column whose scrambling yields the best F1 score.

    In every run ``feature_importance`` ranks the remaining columns, the
    top-ranked column is removed, and the reduced feature set is scored with
    ``cv_test`` (``C=0.01``, L2 penalty).

    The caller's ``x_data`` is not modified; columns are dropped from a
    local, non-inplace result.

    Args:
        x_data: Feature DataFrame with integer column labels (the label of
            the column to drop is recovered with ``int(...)``).
        y_data: Target Series.
        runs: Number of columns to drop, one per run.
        cv_folds: Number of folds used by both helper functions.

    Returns:
        DataFrame with one row per run: the mean ``cv_test`` scores plus
        ``Dropped_Columns`` (all labels dropped so far) and
        ``Hypothetical_Score`` (the F1 reported by ``feature_importance``
        for the dropped column).
    """
    dropped_columns = []
    run_results = []

    for count in range(1, runs + 1):

        clear_output()
        print(f"Recursive Scrambling Test {count} of {runs}...")

        # Scores obtained when each remaining column is scrambled in turn.
        result = feature_importance(x_data,y_data,folds=cv_folds)

        # The best-scoring scramble comes first; its column is dropped.
        col_to_drop = int(result['Column'].iloc[0])
        dropped_columns.append(col_to_drop)

        # Drop by label. Using the label as a position would hit the wrong
        # column as soon as earlier drops have shifted the positions.
        x_data = x_data.drop(columns=[col_to_drop])

        total_results_df, results_df = cv_test(x_data, y_data, folds=cv_folds, regularize=0.01, penalty = 'l2',drop=['Fixed'])

        results_df['Dropped_Columns'] = [dropped_columns.copy()]
        results_df['Hypothetical_Score'] = result['Test_Score'].iloc[0]
        run_results.append(results_df)

    if not run_results:
        return pd.DataFrame()

    return pd.concat(run_results)

In [8]:
def feat_select_mutual_info(x_data,y_data,fit,plot):
    """Compute or load mutual information feature scores and optionally plot them.

    Args:
        x_data: Feature matrix (only used when ``fit`` is true).
        y_data: Target labels (only used when ``fit`` is true).
        fit: If true, compute the scores with ``mutual_info_classif`` and
            write them to ``mutual_info2.csv``. Otherwise read that file; if
            it cannot be read a hint is printed and the function returns.
        plot: If true, plot the scores sorted from most to least important.

    Returns:
        None.
    """
    if fit:
        mu = mutual_info_classif(x_data,y_data)
        # reset_index() must not be called in place (that returns None); it
        # turns the positional feature index into an explicit 'index' column.
        mutual_info = pd.DataFrame(mu).reset_index()
        mutual_info.columns = ['index','feat_importance']
        mutual_info.to_csv("mutual_info2.csv")
    else:
        try:
            mutual_info = pd.read_csv("mutual_info2.csv").drop(['Unnamed: 0'],axis=1)
        except (OSError, KeyError, ValueError):
            # Missing, empty or unexpectedly shaped file.
            print("Mutual information feature selection has not yet been conducted. Please set 'fit=True'")
            return

    # Plot features in order of importance to visualise an intuitive cut off.
    if plot:

        mi_sorted = mutual_info.sort_values(by="feat_importance",ascending=False)
        mi_sorted.reset_index(drop=True, inplace=True)

        fig, ax1 = plt.subplots()
        fig.set_size_inches(20,5)

        ax1.plot(mi_sorted.index[:-1],mi_sorted['feat_importance'].iloc[:-1],color='red')

        ax1.set_xlabel("Reordered Polynomial Feature Index")
        ax1.set_ylabel("Feature Importance")
        ax1.set_title("Mutual Information Feature Importance")

        plt.xticks(np.arange(0, mi_sorted.shape[0]+1, 50))
        ax1.set_xlim(0,mi_sorted.shape[0])
        ax1.grid()
        plt.show()
        

In [11]:
def cv_mutual_info(x_data,y_data,reg_list,drop_list,cv_folds):
    """Cross validate after dropping the least informative features.

    Feature scores are read from ``mutual_info2.csv``. For every combination
    of ``reg_list`` and ``drop_list`` the given number of lowest-scoring
    features is removed and the rest is scored with ``cv_test``. The mean
    scores per combination are displayed.

    Args:
        x_data: Feature DataFrame whose column labels match the ``index``
            column of the CSV file.
        y_data: Target Series.
        reg_list: Values of ``C`` to try.
        drop_list: Numbers of least important features to drop.
        cv_folds: Number of folds passed to ``cv_test``.

    Returns:
        None. If the CSV file cannot be read a hint is printed instead.
    """
    try:
        mi_df = pd.read_csv("mutual_info2.csv")
    except (OSError, ValueError):
        # Missing or unparsable file; a bare except would also have hidden
        # KeyboardInterrupt and programming errors.
        print("Mutual Information feature selection has not been conducted, please run function 'feat_select_mutual_info' first.")
        return

    # Least important features first, so that a prefix is what gets dropped.
    mi_sorted = mi_df.sort_values(by="feat_importance",ascending=True)
    mi_sorted = mi_sorted.reset_index(drop=True)
    mi_sorted = mi_sorted.drop(columns='Unnamed: 0', errors='ignore')
    least_important = mi_sorted['index'].tolist()

    grid = ParameterGrid({"penalty": ['l2'], 'C':reg_list, 'drop':drop_list})
    fold_means = []

    for count, params in enumerate(grid, start=1):

        x_data_dropped = x_data.drop(least_important[0:params['drop']],axis=1)

        clear_output()
        print(f"Test {count} of {len(grid)}...")

        total_results_df, results_df = cv_test(x_data_dropped, y_data, folds=cv_folds, regularize=params['C'], penalty = params['penalty'], drop = params['drop'])
        fold_means.append(results_df)

    results_mi = pd.concat(fold_means) if fold_means else pd.DataFrame()
    display(results_mi)

In [12]:
def feat_select_permutation_importance(x_data,y_data,fit,plot):
    """Compute or load permutation importances and optionally plot them.

    Args:
        x_data: Feature matrix (only used when ``fit`` is true).
        y_data: Target labels (only used when ``fit`` is true).
        fit: If true, fit an L2 logistic regression (``C=0.01``), compute the
            permutation importances on the same data and write them to
            ``permutation_imp.csv``. Otherwise read that file; if it cannot
            be read a hint is printed and the function returns.
        plot: If true, report how many features have non-positive importance
            and plot the positive ones in ascending order.

    Returns:
        None.
    """
    if fit:
        clf = LogisticRegression(random_state=0,penalty='l2',C=0.01,solver='lbfgs').fit(x_data, y_data)
        pi = permutation_importance(estimator=clf,X=x_data,y=y_data)
        # scikit-learn names the result 'importances_mean'; the capitalised
        # key is kept as a fallback for any other implementation in scope.
        mean_key = 'importances_mean' if 'importances_mean' in pi else 'Importances_Mean'
        perm_imp = pd.DataFrame(pi[mean_key]).reset_index()
        perm_imp.columns = ['index','Importances_Mean']
        perm_imp.to_csv("permutation_imp.csv")
    else:
        try:
            perm_imp = pd.read_csv("permutation_imp.csv").drop(['Unnamed: 0'],axis=1)
        except (OSError, KeyError, ValueError):
            # Missing, empty or unexpectedly shaped file.
            print("Permutation Importance feature selection has not yet been conducted. Please set 'fit=True'")
            return

    if plot:
        # Counter keys are True/False; True hashes like 1 and False like 0.
        imp_count = Counter(perm_imp['Importances_Mean'] <= 0)
        print(f"There are {imp_count[1]} features with zero or negative importance and {imp_count[0]} features with a positive importance.")

        perm_imp_filter = perm_imp[perm_imp['Importances_Mean'] > 0]

        perm_imp_sorted = perm_imp_filter.sort_values(by="Importances_Mean", ascending=True)
        perm_imp_sorted.reset_index(drop=True, inplace=True)

        fig, ax1 = plt.subplots()
        fig.set_size_inches(20,5)

        ax1.plot(perm_imp_sorted.index[:-1],perm_imp_sorted['Importances_Mean'].iloc[:-1],color='red')

        ax1.set_xlabel("Reordered Polynomial Feature Index")
        ax1.set_ylabel("Feature Importance")
        ax1.set_title("Permutation Feature Importance for Positive Importance Features")

        plt.xticks(np.arange(0, perm_imp_sorted.shape[0]+1, 50))
        ax1.set_xlim(0,perm_imp_sorted.shape[0])
        ax1.grid()
        plt.show()

In [60]:
def cv_permutation_importance(x_data,y_data,reg_list,cv_folds,select,feat_num_filter):
    """Cross validate on features chosen by permutation importance.

    Importances are read from ``permutation_imp.csv``. Only features with a
    positive mean importance are considered.

    Args:
        x_data: Feature DataFrame whose column labels match the feature
            index stored in the CSV file.
        y_data: Target Series.
        reg_list: Values of ``C`` to try.
        cv_folds: Number of folds passed to ``cv_test``.
        select: ``1`` keeps every feature with positive importance; ``2``
            keeps only the ``feat_num_filter`` most important of those.
        feat_num_filter: Number of features kept when ``select == 2``.

    Returns:
        None. The mean scores per value of ``C`` are displayed. If the CSV
        file cannot be read a hint is printed instead.

    Raises:
        ValueError: If ``select`` is neither 1 nor 2.
    """
    try:
        permutation_df = pd.read_csv("permutation_imp.csv")
    except (OSError, ValueError):
        print("Permutation Importance feature selection has not been conducted, please run function 'feat_select_permutation_importance' first.")
        return

    positive = permutation_df[permutation_df["Importances_Mean"] > 0]

    if select == 1:
        # Row labels of the CSV double as feature labels here.
        permutation_cols_keep = list(positive.index)
    elif select == 2:
        perm_imp_sorted = positive.sort_values(by="Importances_Mean", ascending=True)
        perm_imp_sorted = perm_imp_sorted.reset_index(drop=False)
        permutation_cols_keep = perm_imp_sorted['index'].tail(feat_num_filter).tolist()
    else:
        # Previously this fell through to a NameError further down.
        raise ValueError(f"select must be 1 or 2, got {select!r}")

    # The kept columns do not depend on the grid, so filter once up front.
    x_data_dropped = x_data.loc[:,permutation_cols_keep]

    grid = ParameterGrid({"penalty": ['l2'], 'C': reg_list, 'drop':['Fixed']})
    fold_means = []

    for count, params in enumerate(grid, start=1):

        clear_output()
        print(f"Test {count} of {len(grid)}...")

        total_results_df, results_df = cv_test(x_data_dropped, y_data, folds=cv_folds, regularize=params['C'], penalty = params['penalty'], drop = params['drop'])
        fold_means.append(results_df)

    results_perm = pd.concat(fold_means) if fold_means else pd.DataFrame()
    display(results_perm)

In [69]:
def feat_select_select_from_model(x_data,y_data,fit):
    """Compute or load a ``SelectFromModel`` feature mask and report its size.

    Args:
        x_data: Feature matrix; its column count is reported, and it is used
            for fitting when ``fit`` is true.
        y_data: Target labels (only used when ``fit`` is true).
        fit: If true, fit ``SelectFromModel`` around an L2 logistic
            regression (``C=0.01``) and write the boolean support mask to
            ``select_from_model.csv``. Otherwise read that file; if it cannot
            be read a hint is printed and the function returns.

    Returns:
        None. The number of selected features is printed.
    """
    if fit:
        clf3 = LogisticRegression(random_state=0,penalty='l2',C=0.01,solver='lbfgs')
        SFM = SelectFromModel(estimator=clf3).fit(x_data,y_data)
        # Name the column '0' (a string) so the freshly fitted frame has the
        # same layout as the one read back from CSV; the file is unchanged.
        sfm_df = pd.DataFrame({'0': list(SFM.get_support())})
        sfm_df.to_csv("select_from_model.csv")
    else:
        try:
            sfm_df = pd.read_csv("select_from_model.csv")
        except (OSError, ValueError):
            print("Select from model feature selection has not yet been conducted. Please set 'fit=True'")
            return

    selected = int(sfm_df['0'].sum())  # True entries mark kept features
    print(f"The filtered data set has been reduced to {selected} from {x_data.shape[1]} features")

In [62]:
def cv_select_from_model(x_data,y_data,reg_list,cv_folds):
    """Cross validate on the features kept by ``SelectFromModel``.

    The boolean mask is read from column ``"0"`` of
    ``select_from_model.csv`` and applied to the columns of ``x_data``.

    Args:
        x_data: Feature DataFrame with one column per mask entry.
        y_data: Target Series.
        reg_list: Values of ``C`` to try.
        cv_folds: Number of folds passed to ``cv_test``.

    Returns:
        None. The mean scores per value of ``C`` are displayed. If the mask
        cannot be read a hint is printed instead.
    """
    try:
        sfm_df = pd.read_csv("select_from_model.csv")
        sfm_cols_to_keep = list(sfm_df["0"])
    except (OSError, KeyError, ValueError):
        # Missing, empty or unexpectedly shaped file.
        print("Select from model feature selection has not been conducted, please run function 'feat_select_select_from_model' first.")
        return

    X_poly_filter = x_data.loc[:,sfm_cols_to_keep]

    grid = ParameterGrid({"penalty": ['l2'], 'C': reg_list, 'drop':['Fixed']})
    fold_means = []

    for count, params in enumerate(grid, start=1):

        clear_output()
        print(f"Test {count} of {len(grid)}...")

        total_results_df, results_df = cv_test(X_poly_filter, y_data, folds=cv_folds, regularize=params['C'], penalty = params['penalty'], drop = params['drop'])
        fold_means.append(results_df)

    results_sfm = pd.concat(fold_means) if fold_means else pd.DataFrame()
    display(results_sfm)

In [1]:
def feat_select_variance_threshold(x_data,y_data,var_thresh,fit):
    """Compute or load a ``VarianceThreshold`` feature mask and report its size.

    Args:
        x_data: Feature matrix; its column count is reported, and it is used
            for fitting when ``fit`` is true.
        y_data: Not used; kept for a signature parallel to the other
            ``feat_select_*`` functions.
        var_thresh: Threshold passed to ``VarianceThreshold``.
        fit: If true, fit the selector and write its boolean support mask to
            ``var_threshold.csv``. Otherwise read that file; if it cannot be
            read a hint is printed and the function returns.

    Returns:
        None. The number of selected features is printed.
    """
    if fit:
        X_VT = VarianceThreshold(threshold = var_thresh).fit(x_data)
        X_VarThresh = pd.DataFrame({'Column_Filter': X_VT.get_support()})
        X_VarThresh.to_csv("var_threshold.csv")
    else:
        try:
            X_VarThresh = pd.read_csv("var_threshold.csv")
            X_VarThresh.drop("Unnamed: 0",axis=1,inplace=True)
        except (OSError, KeyError, ValueError):
            # Missing, empty or unexpectedly shaped file.
            print("Variance threshold feature selection has not yet been conducted. Please set 'fit=True'")
            return

    selected = int(X_VarThresh['Column_Filter'].sum())  # True entries mark kept features
    print(f"The filtered data set has been reduced to {selected} from {x_data.shape[1]} features")

In [3]:
def cv_variance_threshold(x_data,y_data,reg_list,cv_folds):
    """Cross validate on the features kept by the variance threshold.

    The boolean mask is read from ``var_threshold.csv`` and applied
    positionally to the columns of ``x_data``; the kept columns are then
    relabelled ``0 .. n-1``.

    Args:
        x_data: Feature DataFrame with one column per mask entry.
        y_data: Target Series.
        reg_list: Values of ``C`` to try.
        cv_folds: Number of folds passed to ``cv_test``.

    Returns:
        None. The mean scores per value of ``C`` are displayed. If the mask
        cannot be read a hint is printed instead.
    """
    try:
        X_VarThresh = pd.read_csv("var_threshold.csv")
        X_VarThresh.drop("Unnamed: 0",axis=1,inplace=True)
    except (OSError, KeyError, ValueError):
        # Missing, empty or unexpectedly shaped file.
        print("Variance threshold feature selection has not been conducted, please run function 'feat_select_variance_threshold' first.")
        return

    X_poly_filter = x_data.iloc[:,X_VarThresh.iloc[:,0].tolist()].copy()
    # Assign a flat array of labels. Wrapping the array in a list would make
    # pandas build a one-level MultiIndex instead of plain integer labels.
    X_poly_filter.columns = np.arange(X_poly_filter.shape[1])

    grid = ParameterGrid({"penalty": ['l2'], 'C': reg_list, 'drop':['Fixed']})
    fold_means = []

    for count, params in enumerate(grid, start=1):

        clear_output()
        print(f"Test {count} of {len(grid)}...")

        total_results_df, results_df = cv_test(X_poly_filter, y_data, folds=cv_folds, regularize=params['C'], penalty = params['penalty'], drop = params['drop'])
        fold_means.append(results_df)

    results_vs = pd.concat(fold_means) if fold_means else pd.DataFrame()
    display(results_vs)

In [4]:
def feat_select_RFECV(x_data,y_data,fit):
    """Compute or load an RFECV feature ranking and report how many features it keeps.

    Args:
        x_data: Feature matrix; its column count is reported, and it is used
            for fitting when ``fit`` is true.
        y_data: Target labels (only used when ``fit`` is true).
        fit: If true, run ``RFECV`` (step 1, 10 stratified folds) around an
            L2 logistic regression (``C=1``) and write the ranking to
            ``RECV_Feature_Rank_cv10.csv``. Otherwise read that file; if it
            cannot be read a hint is printed and the function returns.

    Returns:
        None. The number of features with rank 1 is printed.
    """
    if fit:
        clf5 = LogisticRegression(random_state=0,penalty='l2',C=1,solver='lbfgs')
        trans2 = RFECV(estimator=clf5, step=1, cv=StratifiedKFold(10))
        # Fit on the arguments rather than on a notebook-level global, so the
        # function works for whatever data the caller passes in.
        selector2 = trans2.fit(x_data,y_data)
        recv_df = pd.DataFrame(selector2.ranking_,columns=["Feature_Ranking"])
        recv_df.to_csv("RECV_Feature_Rank_cv10.csv")
    else:
        try:
            recv_df = pd.read_csv("RECV_Feature_Rank_cv10.csv")
        except (OSError, ValueError):
            print("RFECV feature selection has not yet been conducted. Please set 'fit=True'")
            return

    selected = int((recv_df['Feature_Ranking'] == 1).sum())  # rank 1 marks kept features
    print(f"The filtered data set has been reduced to {selected} from {x_data.shape[1]} features")

In [5]:
def cv_rfecv(x_data,y_data,reg_list,cv_folds):
    """Cross validate on the features ranked first by RFECV.

    The ranking is read from ``RECV_Feature_Rank_cv10.csv``; the row labels
    of the entries with ``Feature_Ranking == 1`` are used as column labels
    into ``x_data``.

    Args:
        x_data: Feature DataFrame whose column labels match the row labels
            of the ranking file.
        y_data: Target Series.
        reg_list: Values of ``C`` to try.
        cv_folds: Number of folds passed to ``cv_test``.

    Returns:
        None. The mean scores per value of ``C`` are displayed. If the
        ranking cannot be read a hint is printed instead.
    """
    try:
        recv_df = pd.read_csv("RECV_Feature_Rank_cv10.csv")
    except (OSError, ValueError):
        # Point at the function that actually produces this file.
        print("RFECV feature selection has not been conducted, please run function 'feat_select_RFECV' first.")
        return

    recv_cols_keep = list(recv_df[recv_df['Feature_Ranking'] == 1].index)
    X_poly_filter = x_data.loc[:,recv_cols_keep]

    grid = ParameterGrid({"penalty": ['l2'], 'C': reg_list, 'drop':['Fixed']})
    fold_means = []

    for count, params in enumerate(grid, start=1):

        clear_output()
        print(f"Test {count} of {len(grid)}...")

        total_results_df, results_df = cv_test(X_poly_filter, y_data, folds=cv_folds, regularize=params['C'], penalty = params['penalty'], drop = params['drop'])
        fold_means.append(results_df)

    results_rfecv = pd.concat(fold_means) if fold_means else pd.DataFrame()
    display(results_rfecv)

In [6]:
def cv_rfecv_learning_curve(x_data,y_data,folds,penalty,step):
    """Plot the learning curve using only the features ranked first by RFECV.

    The ranking is read from ``RECV_Feature_Rank_cv10.csv`` (a missing file
    raises the error from ``pd.read_csv``). The row labels of the entries
    with ``Feature_Ranking == 1`` select the columns of ``x_data``; the
    filtered data is then handed to ``cv_learning_curve``, which performs
    the cross validation and plotting.

    Args:
        x_data: Feature DataFrame whose column labels match the row labels
            of the ranking file.
        y_data: Target labels.
        folds: ``cv`` argument forwarded to ``cv_learning_curve``.
        penalty: Forwarded to ``cv_learning_curve``, where it is used as the
            logistic regression ``C``.
        step: Fraction of the data added per learning-curve step.

    Returns:
        None. The figure is shown by ``cv_learning_curve``.
    """
    recv_df = pd.read_csv("RECV_Feature_Rank_cv10.csv")
    recv_cols_keep = list(recv_df[recv_df['Feature_Ranking'] == 1].index)
    X_poly_filter = x_data.loc[:,recv_cols_keep]

    # The curve itself is identical to the unfiltered one, so reuse it
    # instead of keeping a second copy of the loop and plotting code.
    cv_learning_curve(X_poly_filter, y_data, folds=folds, penalty=penalty, step=step)

In [3]:
def cv_test_resampling(x_data,y_data,folds,regularize, ratio_list,method):
    """Cross validate logistic regression with a resampled training fold.

    For every ratio in ``ratio_list`` and every stratified fold, the
    training part is resampled (random under-sampling or SMOTE), an L2
    logistic regression is fitted on it, and precision, recall and F1 for
    the positive class (threshold 0.5) are computed on the resampled
    training data and on the untouched held-out part.

    Args:
        x_data: Feature DataFrame.
        y_data: Target Series.
        folds: Number of stratified folds.
        regularize: Value passed to ``LogisticRegression`` as ``C``.
        ratio_list: ``sampling_strategy`` values to try.
        method: ``'under'`` for ``RandomUnderSampler`` or ``'over'`` for
            ``SMOTE``.

    Returns:
        Tuple ``(scores, mean_scores)``: one row per (ratio, fold) and a
        one-row DataFrame holding the mean of the numeric columns.

    Raises:
        ValueError: If ``method`` is neither ``'under'`` nor ``'over'``.
    """
    if method not in ('under', 'over'):
        # Fail early instead of hitting an unbound variable inside the loop.
        raise ValueError(f"method must be 'under' or 'over', got {method!r}")

    # random_state is omitted: with shuffle=False it has no effect and recent
    # scikit-learn versions reject the combination.
    skf = StratifiedKFold(n_splits=folds, shuffle=False)
    strat_scores = []

    for count, split in enumerate(ratio_list, start=1):
        clear_output()
        print("Test {} of {}...".format(count, len(ratio_list)))

        for train_index, test_index in skf.split(x_data, y_data):

            if method == 'under':
                sampler = RandomUnderSampler(sampling_strategy=split,random_state=42)
            else:
                sampler = SMOTE(sampling_strategy=split,random_state=42)

            # Only the training fold is resampled; the held-out fold keeps
            # its original class balance.
            re_sample_feats, re_sample_target = sampler.fit_resample(x_data.iloc[train_index], y_data.iloc[train_index])

            clf2 = LogisticRegression(random_state=0,penalty='l2',C=regularize,solver='lbfgs').fit(re_sample_feats,re_sample_target)

            y_prob_train = clf2.predict_proba(re_sample_feats)
            y_prob_test = clf2.predict_proba(x_data.iloc[test_index])

            y_pred_train = np.array(y_prob_train[:,1] > 0.5).astype(int)
            y_pred_test = np.array(y_prob_test[:,1] > 0.5).astype(int)

            train_result = precision_recall_fscore_support(re_sample_target, y_pred_train, average='binary',pos_label=1)
            test_result = precision_recall_fscore_support(y_data.iloc[test_index], y_pred_test, average='binary',pos_label=1)

            strat_scores.append({'Train_Score':train_result[2],'Train_Recall':train_result[1], 'Train_Precision':train_result[0],
            'Test_Score':test_result[2], 'Test_Recall':test_result[1], 'Test_Precision':test_result[0],'Regularization':regularize,
            'Penalty':'l2', 'Ratio':split})

    scores = pd.DataFrame(strat_scores)

    # numeric_only=True skips the string 'Penalty' column; without it
    # pandas >= 2.0 raises a TypeError.
    return scores,pd.DataFrame(scores.mean(numeric_only=True)).T

In [5]:
def cv_test_oversampling(x_data,y_data,folds,regularize, ratio_list,cluster_list):
    """Cross validate logistic regression over a grid of SMOTE settings.

    For every combination of ``ratio_list`` and ``cluster_list`` and every
    stratified fold, the training part is over-sampled with SMOTE, an L2
    logistic regression is fitted on it, and precision, recall and F1 for
    the positive class (threshold 0.5) are computed on the resampled
    training data and on the untouched held-out part.

    Args:
        x_data: Feature DataFrame.
        y_data: Target Series.
        folds: Number of stratified folds.
        regularize: Value passed to ``LogisticRegression`` as ``C``.
        ratio_list: ``sampling_strategy`` values to try.
        cluster_list: ``k_neighbors`` values to try (recorded in the
            ``Clusters`` column).

    Returns:
        Tuple ``(scores, mean_scores)``: one row per (setting, fold) and a
        one-row DataFrame holding the mean of the numeric columns.
    """
    # random_state is omitted: with shuffle=False it has no effect and recent
    # scikit-learn versions reject the combination.
    skf = StratifiedKFold(n_splits=folds, shuffle=False)
    strat_scores = []

    grid = ParameterGrid({"Ratio":ratio_list,"Clusters": cluster_list})

    for count, params in enumerate(grid, start=1):

        clear_output()
        print(f"Over sampling test {count} of {len(grid)}...")

        for train_index, test_index in skf.split(x_data, y_data):

            # Only the training fold is over-sampled; the held-out fold keeps
            # its original class balance.
            sm = SMOTE(sampling_strategy=params['Ratio'], k_neighbors=params['Clusters'], random_state=42)
            re_sample_feats, re_sample_target = sm.fit_resample(x_data.iloc[train_index], y_data.iloc[train_index])

            clf2 = LogisticRegression(random_state=0,penalty='l2',C=regularize,solver='lbfgs').fit(re_sample_feats,re_sample_target)

            y_prob_train = clf2.predict_proba(re_sample_feats)
            y_prob_test = clf2.predict_proba(x_data.iloc[test_index])

            y_pred_train = np.array(y_prob_train[:,1] > 0.5).astype(int)
            y_pred_test = np.array(y_prob_test[:,1] > 0.5).astype(int)

            train_result = precision_recall_fscore_support(re_sample_target, y_pred_train, average='binary',pos_label=1)
            test_result = precision_recall_fscore_support(y_data.iloc[test_index], y_pred_test, average='binary',pos_label=1)

            strat_scores.append({'Train_Score':train_result[2],'Train_Recall':train_result[1], 'Train_Precision':train_result[0],
            'Test_Score':test_result[2], 'Test_Recall':test_result[1], 'Test_Precision':test_result[0],'Regularization':regularize,
            'Penalty':'l2', 'Ratio':params['Ratio'],'Clusters':params['Clusters']})

    scores = pd.DataFrame(strat_scores)

    # numeric_only=True skips the string 'Penalty' column; without it
    # pandas >= 2.0 raises a TypeError.
    return scores,pd.DataFrame(scores.mean(numeric_only=True)).T

In [5]:
def oos_test(x_train,y_train,x_test,y_test,folds,regularize, ratio,clusters):
    """Fit on SMOTE-resampled training data and score on a separate test set.

    The training data is over-sampled with SMOTE, an L2 logistic regression
    is fitted on it, and precision, recall and F1 for the positive class
    (threshold 0.5) are computed on the resampled training data and on the
    given test data.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        folds: Not used; kept so existing callers keep working.
        regularize: Value passed to ``LogisticRegression`` as ``C``.
        ratio: ``sampling_strategy`` passed to ``SMOTE``.
        clusters: ``k_neighbors`` passed to ``SMOTE`` (recorded in the
            ``Clusters`` column).

    Returns:
        Tuple ``(scores, mean_scores, confusion)``: a one-row score table,
        the mean of its numeric columns as a one-row DataFrame, and the
        confusion matrix of the test predictions.
    """
    sm = SMOTE(sampling_strategy=ratio, k_neighbors=clusters, random_state=42)
    re_sample_feats, re_sample_target = sm.fit_resample(x_train, y_train)

    clf2 = LogisticRegression(random_state=0,penalty='l2',C=regularize,solver='lbfgs').fit(re_sample_feats,re_sample_target)

    y_prob_train = clf2.predict_proba(re_sample_feats)
    y_prob_test = clf2.predict_proba(x_test)

    # Classify as positive when the predicted probability exceeds 0.5.
    y_pred_train = np.array(y_prob_train[:,1] > 0.5).astype(int)
    y_pred_test = np.array(y_prob_test[:,1] > 0.5).astype(int)

    train_result = precision_recall_fscore_support(re_sample_target, y_pred_train, average='binary',pos_label=1)
    test_result = precision_recall_fscore_support(y_test, y_pred_test, average='binary',pos_label=1)

    scores = pd.DataFrame([{'Train_Score':train_result[2],'Train_Recall':train_result[1], 'Train_Precision':train_result[0],
    'Test_Score':test_result[2], 'Test_Recall':test_result[1], 'Test_Precision':test_result[0],'Regularization':regularize,
    'Penalty':'l2', 'Ratio':ratio,'Clusters':clusters}])

    # numeric_only=True skips the string 'Penalty' column; without it
    # pandas >= 2.0 raises a TypeError.
    return scores,pd.DataFrame(scores.mean(numeric_only=True)).T,confusion_matrix(y_test,y_pred_test)