# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode
import seaborn as sns
sns.set()
%matplotlib inline
import pickle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split, KFold, cross_val_score, learning_curve
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

  import pandas.util.testing as tm


# Loading the dataset into a pandas DataFrame

In [None]:
# NOTE(review): unpickling executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
users_pickle_path = 'existing_users_2classes_df.pickle'
existing_users_df = pd.read_pickle(users_pickle_path)
# Cell output: (rows, columns) of the loaded frame.
existing_users_df.shape

In [None]:
from sklearn.utils import shuffle
seed = 42

# Shuffle once so the per-class truncation below keeps a random subset.
df = shuffle(existing_users_df, random_state=seed)

# Keep at most 100,000 rows of each class.
df0 = df.loc[df.next_purchase_day_2class == 0].head(100000)
df1 = df.loc[df.next_purchase_day_2class == 1].head(100000)
df = pd.concat([df1, df0])

# Cell output: summary statistics of next_purchase_day for each class.
df.groupby('next_purchase_day_2class')['next_purchase_day'].describe()

# Splitting data into training 80% and test data 20%

In [None]:
from sklearn.model_selection import train_test_split

# Target: the two-class label, stored as integers.
target_col = "next_purchase_day_2class"
df[target_col] = df[target_col].astype(int)
y = df[target_col].values

# Features: every column except the label, the user id and next_purchase_day.
X = df.drop(columns=[target_col, "user_id", "next_purchase_day"])

# Create Train & Test Data (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# X and y are shuffled only after the split, so the split itself is unaffected.
X, y = shuffle(X, y, random_state=seed)
X.head()

# Applying cross-validation with multiple models and scalers

This code was borrowed from an open-source repository.

In [None]:
from sklearn import model_selection
def print_results(names, results, test_scores):
    """
        Print the CV mean/std and the test score of every pipeline, and
        highlight (bold) the row with the highest CV mean for each classifier.

    :param names: pipeline names shaped "<scaler>_<classifier>" (the scaler
                  part may be empty, e.g. "_LR"); the second "_"-separated
                  token is used as the classifier key
    :param results: one array of CV scores per pipeline; each item must
                    provide ``mean()`` and ``std()``
    :param test_scores: one test score per pipeline, aligned with ``names``
    :return: None, everything is written to stdout
    """
    bold = '\033[1m'
    end = '\033[0m'

    print()
    print("#" * 30 + "Results" + "#" * 30)

    # Nothing to report; avoids indexing into an empty list below.
    if not names:
        return

    clf_names = [name.split("_")[1] for name in names]

    # Find, per classifier, the row with the highest CV mean. Starting from
    # -inf (not 0) keeps this working for scorers that are zero or negative.
    best_mean = dict.fromkeys(clf_names, float("-inf"))
    best_row = {}
    for row, (clf_name, result) in enumerate(zip(clf_names, results)):
        mean = result.mean()
        if mean > best_mean[clf_name]:
            best_mean[clf_name] = mean
            best_row[clf_name] = row

    # Print every row, with a blank line between classifiers.
    prev_clf_name = clf_names[0]
    for row, (name, result, score) in enumerate(zip(names, results, test_scores)):
        clf_name = clf_names[row]
        if clf_name != prev_clf_name:
            print()
            prev_clf_name = clf_name
        msg = "%s: %f (%f) [test_score:%.3f]" % (name, result.mean(), result.std(), score)
        if best_row.get(clf_name) == row:
            # Reset right after the message so the bold attribute cannot
            # leak into whatever is printed next.
            print(bold + msg + end)
        else:
            print(msg)


def create_pipelines(seed, verbose=1):
    """
         Creates a list of (name, Pipeline) pairs: every model on its own,
         followed by the same model preceded by each scaler.

         Names are "<scaler>_<model>"; the model-only pipelines have an empty
         scaler part (e.g. "_LR"). No PCA step is added.

    :param seed: Random seed for the estimators and transformers that accept one
    :param verbose: when truthy, print the name of every created pipeline
    :return: list of (name, Pipeline) tuples, grouped by model
    """
    # Imported locally because the notebook's import cell does not provide
    # these names; without them the function fails with NameError.
    from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler, Normalizer,
                                       PowerTransformer, QuantileTransformer,
                                       RobustScaler, StandardScaler)
    from sklearn.svm import LinearSVC

    models = [
              ('LR', LogisticRegression(solver='lbfgs',multi_class ='multinomial')),
              ('LDA', LinearDiscriminantAnalysis()),
              ('KNN', KNeighborsClassifier()),
              ('CART', DecisionTreeClassifier(random_state=seed)),
              ('NB', GaussianNB()),
              ('SVM', SVC(random_state=seed, probability=True)),
              ('LinearSVC', LinearSVC(max_iter=100, dual=False)),
              ('RF', RandomForestClassifier(max_depth=3, random_state=seed)),
              # NOTE(review): XGBClassifier is not imported anywhere in the
              # visible notebook; it must already be in scope (xgboost).
              # The multi-class objective / num_class=3 also look copied from
              # a 3-class variant of this experiment - confirm.
              ('XGB', XGBClassifier(max_depth=5, learning_rate=0.08, objective= 'multi:softmax',n_jobs=-1,num_class=3)),
              # Seeded like the other randomized estimators so runs are repeatable.
              ('Bagging', BaggingClassifier(random_state=seed)),
              ('ExtraTrees', ExtraTreesClassifier(n_estimators=100, random_state=seed)),
              ]
    scalers = [('StandardScaler', StandardScaler()),
               ('MinMaxScaler', MinMaxScaler()),
               ('MaxAbsScaler', MaxAbsScaler()),
               ('RobustScaler', RobustScaler()),
               # QuantileTransformer subsamples large inputs, hence the seed.
               ('QuantileTransformer-Normal',
                QuantileTransformer(output_distribution='normal', random_state=seed)),
               ('QuantileTransformer-Uniform',
                QuantileTransformer(output_distribution='uniform', random_state=seed)),
               ('PowerTransformer-Yeo-Johnson', PowerTransformer(method='yeo-johnson')),
               ('Normalizer', Normalizer())
               ]

    # Create pipelines
    pipelines = []
    for model in models:
        # Model without any scaler: the leading "_" keeps split("_")[1]
        # returning the model name for every pipeline.
        pipelines.append(("_" + model[0], Pipeline([model])))

        # Model preceded by each scaler
        for scaler in scalers:
            pipelines.append((scaler[0] + "_" + model[0], Pipeline([scaler, model])))

    if verbose:
        print("Created these pipelines:")
        for pipe_name, _ in pipelines:
            print(pipe_name)

    return pipelines


def run_cv_and_test(X_train, y_train, X_test, y_test, pipelines, scoring, seed, num_folds,
                    dataset_name, n_jobs):
    """

        Iterate over the pipelines, calculate CV mean and std scores, fit on train and predict on test.
        Return the results in a dataframe

    :param X_train: training features
    :param y_train: training labels
    :param X_test: test features
    :param y_test: test labels
    :param pipelines: non-empty list of (name, pipeline) tuples, names shaped "<scaler>_<model>"
    :param scoring: scoring passed to cross_val_score; the same scorer is used on the test set
    :param seed: random seed for the shuffled K-fold split
    :param num_folds: number of CV folds
    :param dataset_name: value written to the "Dataset" column
    :param n_jobs: number of parallel jobs for cross_val_score
    :return: DataFrame with columns Dataset, Classifier_Name, CV_mean, CV_std, Test_score
    """
    from sklearn.metrics import check_scoring

    # List that contains the rows for a dataframe
    rows_list = []

    # Lists for the pipeline results
    results = []
    names = []
    test_scores = []
    prev_clf_name = pipelines[0][0].split("_")[1]
    print("First name is : ", prev_clf_name)

    # shuffle=True is needed for random_state to take effect; scikit-learn
    # ignores (older releases) or rejects (newer releases) a random_state
    # given without shuffling. Built once: an int seed yields the same folds
    # on every split() call, so all pipelines see identical folds.
    kfold = model_selection.KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    for name, model in pipelines:
        cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold,
                                                     n_jobs=n_jobs, scoring=scoring)
        results.append(cv_results)
        names.append(name)

        # Print CV results of the current pipeline
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

        # Fit on train and score on test with the same scorer used for CV.
        # This works for any `scoring` value and for estimators without
        # predict_proba (e.g. LinearSVC under "roc_auc").
        model.fit(X_train, y_train)
        curr_test_score = check_scoring(model, scoring=scoring)(model, X_test, y_test)

        test_scores.append(curr_test_score)

        # Add separation line if different classifier applied
        # NOTE(review): check_seperation_line is not defined in the visible
        # part of the notebook; it must be provided elsewhere.
        rows_list, prev_clf_name = check_seperation_line(name, prev_clf_name, rows_list)

        # Add for final dataframe
        results_dict = {"Dataset": dataset_name,
                        "Classifier_Name": name,
                        "CV_mean": cv_results.mean(),
                        "CV_std": cv_results.std(),
                        "Test_score": curr_test_score
                        }
        rows_list.append(results_dict)

    print_results(names, results, test_scores)

    df = pd.DataFrame(rows_list)
    return df[["Dataset", "Classifier_Name", "CV_mean", "CV_std", "Test_score"]]



In [None]:
from __future__ import print_function

import itertools
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d
import numpy as np

# Experiment settings shared by this cell and the ones below.
seed = 42
num_folds = 12
n_jobs = -1
hypertuned_experiment = True
is_save_results = True
scoring = "accuracy"

# Build every model / scaler combination and evaluate each with CV + test.
pipelines = create_pipelines(seed)
results_df = run_cv_and_test(
    X_train, y_train, X_test, y_test, pipelines, scoring, seed, num_folds,
    dataset_name="existing user", n_jobs=n_jobs)

# Save cv experiment to csv
if is_save_results:
    dataset_results_name = "existing_user_results-2classes.csv"
    results_path = os.path.join(dataset_results_name)
    results_df.to_csv(results_path, index=False)

In [None]:
import os
import operator
import pandas as pd

# Read back the CV results. The name must match the CSV written by the
# experiment cell of this (2-class) notebook.
results_file = "existing_user_results-2classes.csv"
results_df = pd.read_csv(os.path.join(results_file)).dropna().round(3)

# Split "<scaler>_<model>" into two columns. .copy() makes `temp` an
# independent frame so adding columns never writes into results_df.
temp = results_df.loc[~results_df["Classifier_Name"].str.endswith("PCA")].dropna().copy()
name_parts = temp["Classifier_Name"].str.split("_")
temp["model"] = name_parts.str[1]
temp["scaler"] = name_parts.str[0]


def df_style(val):
    """Return the CSS applied to a highlighted cell (bold text)."""
    return 'font-weight: 800'


# Rows: scalers, columns: models, cells: CV mean.
pivot_t = pd.pivot_table(temp, values='CV_mean', index=["scaler"], columns=['model'], aggfunc="sum")

# Bold the best scaler of every model column. Looping over the actual
# columns avoids a KeyError when a given model (e.g. "CART") is absent.
pivot_t_bold = pivot_t.style
for col in list(pivot_t):
    pivot_t_bold = pivot_t_bold.applymap(df_style,
                      subset=pd.IndexSlice[pivot_t[col].idxmax(), col])
pivot_t_bold

In [None]:
# For every model column of the pivot table: its best CV mean and the
# scaler (row label) that reached it.
cols_max_row_names = {col: pivot_t[col].idxmax() for col in list(pivot_t)}
cols_max_vals = {col: pivot_t[col].max() for col in list(pivot_t)}

# Rank the models by their best score, highest first.
sorted_cols_max_vals = sorted(cols_max_vals.items(), key=lambda item: item[1], reverse=True)

print("Best classifiers sorted:\n")
for counter, (model, score) in enumerate(sorted_cols_max_vals, start=1):
    print("".join([str(counter), ". ", model, " + ", cols_max_row_names[model], " : ", str(score)]))

# Apply Hyperparameter Tuning for performance improvement

In [None]:
def get_hypertune_params(include_lda=False):
    """

        Create a dictionary with classifier name as a key and its hyper parameter options as a value.

        Every grid key is written as "<NAME>__<parameter>", where <NAME> is
        the same classifier name that is used as the dictionary key.

    :param include_lda: when True, also return the LDA grid under the "LDA"
                        key; the default keeps the original RF / SVM / LR set
    :return: dict mapping classifier name -> parameter grid (dict of lists)
    """
    # RF PARAMS
    n_estimators = [int(x) for x in np.linspace(start=3, stop=20, num=3)]
    # For classifiers 'auto' was only an alias of 'sqrt' (so the grid held the
    # same setting twice) and newer scikit-learn releases reject it.
    max_features = ['sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    rf_params = {'RF__n_estimators': n_estimators,
                 'RF__max_features': max_features,
                 'RF__max_depth': max_depth,
                 'RF__min_samples_split': min_samples_split,
                 'RF__min_samples_leaf': min_samples_leaf,
                 }

    # SVM PARAMS
    # Rounded to plain floats so the grid holds 0.3 rather than 0.30000000000000004.
    svm_c = [round(float(x), 1) for x in np.arange(0.1, 2, 0.2)]
    kernel = ["linear", "poly", "rbf", "sigmoid"]
    svm_params = {'SVM__C': svm_c,
                  'SVM__kernel': kernel,
                  }

    # Logistic Regression Params
    lr_c = [round(float(x), 1) for x in np.arange(0.1, 3, 0.2)]
    penalty = ["l2"]
    fit_intercept = [True, False]
    lr_params = {'LR__C': lr_c,
                 'LR__penalty': penalty,
                 'LR__fit_intercept': fit_intercept
                 }

    hypertuned_params = {"RF": rf_params,
                         "SVM": svm_params,
                         "LR": lr_params,
                         }

    if include_lda:
        # LDA PARAMS (opt-in; this grid was previously built but never returned)
        solver = ["lsqr"]
        shrinkage = ["auto", None, 0.1, 0.3, 0.5, 0.7, 0.9]
        hypertuned_params["LDA"] = {'LDA__solver': solver,
                                    'LDA__shrinkage': shrinkage
                                    }

    return hypertuned_params


In [None]:
from sklearn.model_selection import GridSearchCV

def run_cv_and_test_hypertuned_params(X_train, y_train, X_test, y_test, pipelines, scoring, seed, num_folds,
                                      dataset_name, hypertuned_params, n_jobs):
    """

        Iterate over the pipelines that have a hyper-parameter grid, estimate
        their performance with nested CV (grid search inside every outer
        fold), then run the grid search on the whole training set and score
        the tuned model on the test set.
        Return the results in a dataframe

    :param X_train: training features
    :param y_train: training labels
    :param X_test: test features
    :param y_test: test labels
    :param pipelines: non-empty list of (name, pipeline) tuples, names shaped "<scaler>_<model>"
    :param scoring: scoring used by the grid search, the outer CV and the test score
    :param seed: random seed for the inner and outer K-fold splits
    :param num_folds: number of outer CV folds
    :param dataset_name: value written to the "Dataset" column
    :param hypertuned_params: dict mapping model name -> parameter grid;
                              pipelines whose model has no entry are skipped
    :param n_jobs: number of parallel jobs for the grid search and the outer CV
    :return: DataFrame with columns Dataset, Classifier_Name, CV_mean, CV_std, Test_score
    """
    columns = ["Dataset", "Classifier_Name", "CV_mean", "CV_std", "Test_score"]

    # List that contains the rows for a dataframe
    rows_list = []

    # Lists for the pipeline results
    results = []
    names = []
    test_scores = []
    prev_clf_name = pipelines[0][0].split("_")[1]
    print("First name is : ", prev_clf_name)

    # Used inside GridSearchCV to pick the hyper parameters
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    # Used around GridSearchCV to score the whole tuning procedure
    outer_cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for name, model in pipelines:

        # Get model's hyper parameters
        model_name = name.split("_")[1]
        if "-" in model_name:
            model_name = model_name.split("-")[0]

        if model_name not in hypertuned_params:
            continue
        param_grid = hypertuned_params[model_name]

        # Train nested-CV
        clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring=scoring,
                           verbose=2, n_jobs=n_jobs, refit=True)
        cv_results = model_selection.cross_val_score(clf, X_train, y_train, cv=outer_cv, n_jobs=n_jobs, scoring=scoring)
        results.append(cv_results)
        names.append(name)

        # Print CV results of the current pipeline
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

        # Fit the grid search itself (not the untuned pipeline) on the whole
        # training set; refit=True retrains the best configuration, and
        # clf.score applies the scorer selected by `scoring`, so the test
        # score belongs to the tuned model and uses the same metric as the CV.
        # This costs one additional grid search per pipeline.
        clf.fit(X_train, y_train)
        curr_test_score = clf.score(X_test, y_test)

        test_scores.append(curr_test_score)

        # Add separation line if different classifier applied
        # NOTE(review): check_seperation_line is not defined in the visible
        # part of the notebook; it must be provided elsewhere.
        rows_list, prev_clf_name = check_seperation_line(name, prev_clf_name, rows_list)

        # Add for final dataframe
        results_dict = {"Dataset": dataset_name,
                        "Classifier_Name": name,
                        "CV_mean": cv_results.mean(),
                        "CV_std": cv_results.std(),
                        "Test_score": curr_test_score
                        }
        rows_list.append(results_dict)

    # When no pipeline had a grid there is nothing to print, and passing
    # `columns` still yields an (empty) frame with the expected columns.
    if names:
        print_results(names, results, test_scores)

    return pd.DataFrame(rows_list, columns=columns)



In [None]:
is_hyp_save_results = True
# Run same experiment with hypertuned parameters
# The explicit "+" matters: adjacent string literals are joined before "*"
# is applied, so without it the title itself was repeated 30 times.
print("#" * 30 + "Hyper tuning parameters" + "#" * 30)
hypertuned_params = get_hypertune_params()

hypertune_results_df = run_cv_and_test_hypertuned_params(X_train, y_train, X_test, y_test, pipelines, scoring, seed,
                                                         num_folds, dataset_name="existing user", n_jobs=n_jobs,
                                                         hypertuned_params=hypertuned_params)

# Save the hypertuned experiment to csv
if is_hyp_save_results:
    dataset_results_name = "existing_user_results_hypertuned-2classes.csv"
    results_path = os.path.join(dataset_results_name)
    hypertune_results_df.to_csv(results_path, index=False)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection as ms
#classifiers = [['LR',LogisticRegression()],['KNN',KNeighborsClassifier(5)],['SVC', SVC(kernel="linear", C=0.025)], ['RandForest',RandomForestClassifier(max_depth=5)],['Boost',AdaBoostClassifier()], ['Gaussian',GaussianNB()]]
classifiers = [['LR',LogisticRegression(solver='lbfgs', C=0.1)]]
# shuffle=True is needed for random_state to take effect; scikit-learn
# ignores (older releases) or rejects (newer releases) a random_state given
# without shuffling.
kfold = ms.KFold(n_splits=5, shuffle=True, random_state=22)
for name, model in classifiers:
    # Mean score over the 5 folds (the estimator's default scorer).
    result = ms.cross_val_score(model, X_train, y_train, cv=kfold).mean()
    print(name,result)

# Logistic Regression Feature Importance

In [None]:
from sklearn.linear_model import LogisticRegression

from yellowbrick.datasets import load_occupancy
from yellowbrick.model_selection import FeatureImportances

# Plot the feature importances of a logistic regression model.
# No data set is loaded here: X and y come from the earlier cells.

model = LogisticRegression(solver='liblinear', C=0.01)
viz = FeatureImportances(model, size=(1080, 720))
# NOTE(review): this fits on the full X / y (train and test rows together),
# not on X_train / y_train - confirm that is intended.
viz.fit(X, y)
# NOTE(review): newer yellowbrick releases name this method show(); confirm
# against the installed version.
viz.poof()

# Confusion Matrix

In [None]:
from yellowbrick.classifier import ConfusionMatrix


# Confusion matrix of `model` on the test set.
cm = ConfusionMatrix(
    model,
    classes=['<=14 days', '>14 days'],
    label_encoder={0: '<=14 days', 1: '>14 days'},
    size=(1080, 720),
)

cm.fit(X_train, y_train)
cm.score(X_test, y_test)
# NOTE(review): this relies on cm.fit having fitted `model` itself on the
# training data (rather than an internal copy) - confirm for the installed
# yellowbrick version.
y_pred = model.predict(X_test)
cm.poof()
# Separator added so the output reads "Accuracy: 0.73", not "Accuracy0.73".
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))

# Classification Report

In [None]:
from yellowbrick.classifier import ClassificationReport


# Classification report of `model`, including the support column.
report_options = {"classes": [0, 1], "support": True, "size": (1080, 720)}
visualizer = ClassificationReport(model, **report_options)

# Fit the visualizer and the model on the training data
visualizer.fit(X_train, y_train)
# Evaluate the model on the test data
visualizer.score(X_test, y_test)
# Draw/show/poof the data
visualizer.poof()