# Machine learning pipeline

## Load packages

In [1]:
'''
Machine learning HW 3
Spring 2019
pete rodrigue
'''

import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pylab
import scipy.stats as stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from IPython.core.pylabtools import figsize
import random

  from numpy.core.umath_tests import inner1d


In [2]:
# Show the current working directory before changing it in the next cell.
os.getcwd()

'C:\\Users\\edwar.WJM-SONYLAPTOP\\Documents\\GitHub\\ML_spring_2019\\exercise three'

## Set your working directory to the root folder of the repository (ML_spring_2019) on your computer

### You may need to change the path below:

In [3]:
# Machine-specific absolute path: edit it to point at your own copy of the
# repository. The functions defined further down write to relative paths such
# as 'figures/summary.csv', which resolve against this directory.
os.chdir("C:\\Users\\edwar.WJM-SONYLAPTOP\\Documents\\GitHub\\ML_spring_2019")

## Define all our functions

In [4]:
def load_and_peek_at_data(path, summary=False):
    '''
    Loads our data from a csv file and returns a pandas dataframe.

    Prints the head, the tail, the column names and the number of rows of
    the data. When summary is True, this function also prints descriptive
    statistics for our variables and saves them to figures/summary.csv
    (the figures folder is created if it does not exist yet).

    Inputs:
        path (str): location of the csv file to read
        summary (bool): whether to print and save descriptive statistics

    Returns: a pandas dataframe with the contents of the csv file.
    '''
    separator = '************************\n************************\n\n'
    df = pd.read_csv(path)

    sections = [('Head of data:', df.head(5)),
                ('Tail of data:', df.tail(5)),
                ('column names of data:', df.columns),
                ('number of rows of data:', len(df))]
    for title, content in sections:
        print(separator)
        print(title)
        print(content)
    print(separator)

    if summary:
        # Compute the statistics once and reuse them for printing and saving.
        stats_table = df.describe()
        print("\n\n\nSummary of data:")
        print(stats_table)
        # to_csv does not create missing folders, so make sure it is there.
        os.makedirs('figures', exist_ok=True)
        stats_table.to_csv('figures/summary.csv')

    return df


def make_graphs(df, normal_qq_plots=False):
    '''
    Takes our dataframe, fills in missing values in its numeric columns
    with the median, and saves a series of plots to the figures folder:
        - A correlation plot (heatmap) for our variables
        - A distribution plot for each numeric variable (drawn on a log
          x-axis when the absolute skew of the variable is above 10)
        - Optionally, a normal qq plot for each numeric variable
        - A boxplot for each numeric variable
    The skew of every numeric variable is also printed.

    Inputs:
        df (pandas dataframe): our dataframe we want to plot
        normal_qq_plots (bool): whether to also save normal qq plots

    Does not return anything.
    '''
    df_temp = df._get_numeric_data()
    # NOTE(review): whether this also fills the caller's df depends on
    # whether pandas hands back a view or a copy here -- confirm.
    fill_missing(df_temp)
    # The heatmap is built from df itself (not df_temp); the identifier,
    # outcome and listed delinquency-count columns are left out of it.
    sns.heatmap(df[df.columns.difference(
                 ['PersonID',
                  'SeriousDlqin2yrs',
                  'zipcode',
                  'NumberOfTime60-89DaysPastDueNotWorse',
                  'NumberOfTimes90DaysLate'])].corr())
    plt.savefig('figures/correlation_plot')
    plt.close()
    for col in df_temp.columns:
        plt.clf()
        mycol = df_temp[col][df_temp[col].notna()]
        col_skew = mycol.skew()
        print('skew', ' for col ', mycol.name, 'is:', col_skew)
        if abs(col_skew) > 10:
            # Heavily skewed variable: show its distribution on a log scale.
            path = "figures/" + col + "log_transformed"
            g = sns.distplot(mycol)
            g.set_title(col + " dist, log_transformed")
            g.set(xscale='log')
            plt.savefig(path)
            plt.close()
            if normal_qq_plots:
                path = "figures/" + col + \
                       " normal_qq_plot log trans"
                # Use the filled, non-missing values (mycol) rather than the
                # raw df column so missing values cannot reach the qq plot.
                # The small constant avoids taking the log of zero.
                stats.probplot(np.log(mycol + .0001),
                               dist="norm", plot=pylab)
                plt.title(col + " normal_qq log transformed")
                plt.savefig(path)
        else:
            path = "figures/" + col
            g = sns.distplot(mycol)
            g.set_title(col + " distribution")
            plt.savefig(path)
            plt.close()
            if normal_qq_plots:
                path = "figures/" + col + " normal_qq_plot"
                # Same as above: plot the filled, non-missing values.
                stats.probplot(mycol, dist="norm", plot=pylab)
                plt.title(col + " normal_qq")
                plt.savefig(path)
        # Clear whatever the qq plot drew before drawing the boxplot.
        plt.clf()
        path = "figures/" + col + " boxplot"
        sns.boxplot(mycol)
        plt.savefig(path)


def fill_missing(df):
    '''
    Fill missing numeric data in our data frame with the median value of that
    variable. Modifies the dataframe in place. Does not return anything.
    Columns that are not numeric are left untouched.

    Inputs:
        df (pandas dataframe): our dataframe we want to modify
    '''
    # Only numeric columns have a median; skipping the others keeps a text
    # column with missing values from raising an error.
    for col in df.select_dtypes(include='number').columns:
        if df[col].isna().any():
            # Assign the filled column back instead of calling fillna with
            # inplace=True on df[col]: that acts on an intermediate object
            # and newer pandas versions do not write it back to df.
            df[col] = df[col].fillna(df[col].median())


def descretize_var(df, var, num_groups):
    '''
    Bins one of our variables into quantile-based groups and stores the
    result in a new column called <var>_discrete. The groups are labelled
    with the strings '1', '2', ... up to the number of groups.

    Inputs:
        df (pandas dataframe): our dataframe we want to modify
        var (str): the column in our dataframe that we want to bin
        num_groups (int): how many groups to cut the variable into

    Returns: the same dataframe, with the new column added.
    '''
    group_names = [str(i + 1) for i in range(num_groups)]
    df[var + '_discrete'] = pd.qcut(df[var], q=num_groups,
                                    labels=group_names)
    return df


def make_dummies(df, var):
    '''
    Builds one dummy column per value of the given variable and returns a
    new dataframe holding the original columns followed by the dummies.
    The dummy columns are named with the prefix "D_" followed by the
    variable name.

    Inputs:
        df (pandas dataframe): the dataframe holding the variable
        var (str): the column in our dataframe that we want to make dummies of

    Returns: a new dataframe; the dataframe passed in is not modified.
    '''
    dummy_cols = pd.get_dummies(df[var], prefix="D_" + var)
    combined = pd.concat([df, dummy_cols], axis=1)
    return combined

## Models

### Tree

In [5]:
def run_tree_model(x_data, y_data, x_test=None,
                   y_test=None, max_depth=5, outcome_labels=None, threshold=.5, use_test_sets=False):
    '''
    Fits a decision tree on the training data and returns a confusion
    matrix for either the training set or the test set.

    Inputs:
        x_data (pandas dataframe): data frame where each column is a predictor
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set (used when use_test_sets is True)
        y_test: outcomes of the test set (used when use_test_sets is True)
        max_depth (int): the maximum depth of the tree.
        outcome_labels (list of str): accepted but not used by this function.
        threshold (float): predicted probability needed to call a row
                           positive.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.
    '''
    classifier = tree.DecisionTreeClassifier(max_depth=max_depth)
    classifier.fit(X=x_data, y=y_data)

    print("***************Tree model")
    print("*********Threshold:{0}".format(threshold))

    # Pick the data set we evaluate on.
    if use_test_sets:
        print('Returning test set performance:')
        eval_x, eval_y = x_test, y_test
    else:
        print('Returning training set performance:')
        eval_x, eval_y = x_data, y_data

    probs = pd.DataFrame(classifier.predict_proba(eval_x))
    # 1 where the probability in column 1 reaches the threshold, else 0.
    probs['predicted_class'] = (probs[1] >= threshold).astype(int)

    if use_test_sets:
        print('mean of pred class is')
        print(probs['predicted_class'].mean())

    return metrics.confusion_matrix(eval_y, probs['predicted_class'])

### Logit model

In [6]:
def run_logit_model(x_data, y_data, x_test=None, y_test=None, threshold=.5, use_test_sets=False):
    '''
    This function takes our x and y data and a threshold, fits a logistic
    model on the training data and returns a confusion matrix for either
    the training set or the test set.

    Inputs:
        x_data (pandas dataframe): data frame where each column is a predictor
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set; needed when use_test_sets is True
        y_test: outcomes of the test set; needed when use_test_sets is True
        threshold (float): the threshold, between 0 and 1, that we'll use to
                           to decide if a given row is predicted to be a
                           positive in the target class or not.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.

    Raises: ValueError if use_test_sets is True but test data is missing.
    '''
    if use_test_sets and (x_test is None or y_test is None):
        raise ValueError('x_test and y_test are required when '
                         'use_test_sets is True')

    mymodel = LogisticRegression()
    mymodel.fit(x_data, y_data)

    print('***********Logistic regression')
    print("*********Threshold:{0}".format(threshold))

    # Announce which data set is evaluated (the old code always printed
    # "Training set performance:" first, even for the test set).
    if use_test_sets:
        print('Returning test set performance:')
        x_eval, y_eval = x_test, y_test
    else:
        print('Returning training set performance:')
        x_eval, y_eval = x_data, y_data

    predicted_probs = pd.DataFrame(mymodel.predict_proba(x_eval))
    # Rows whose probability in column 1 reaches the threshold are positives.
    predicted_probs['predicted_class'] = 0
    predicted_probs.loc[predicted_probs[1] >= threshold,
                        'predicted_class'] = 1

    return metrics.confusion_matrix(y_eval, predicted_probs['predicted_class'])



### K-Nearest Neighbor

In [7]:
def run_knn_model(x_data, y_data, x_test=None, y_test=None, num_n=2, threshold=.5, use_test_sets=False):
    '''
    This function takes our x and y data and a threshold, fits a knn model
    on the training data, and prints and returns a confusion matrix for
    either the training set or the test set.

    Inputs:
        x_data (pandas dataframe): data frame where each column is a predictor
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set; needed when use_test_sets is True
        y_test: outcomes of the test set; needed when use_test_sets is True
        num_n (int): the number of neighbors
        threshold (float): probability threshold needed to call prediction positive
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.

    Raises: ValueError if use_test_sets is True but test data is missing.
    '''
    if use_test_sets and (x_test is None or y_test is None):
        raise ValueError('x_test and y_test are required when '
                         'use_test_sets is True')

    mymodel = KNeighborsClassifier(n_neighbors=num_n)
    mymodel.fit(x_data, y_data)

    print('************KNN')

    # Announce which data set is evaluated (the old code always printed
    # "Training set performance:" first, even for the test set).
    if use_test_sets:
        print('Returning test set performance:')
        x_eval, y_eval = x_test, y_test
    else:
        print('Returning training set performance:')
        x_eval, y_eval = x_data, y_data

    predicted_probs = pd.DataFrame(mymodel.predict_proba(x_eval))
    # Rows whose probability in column 1 reaches the threshold are positives.
    predicted_probs['predicted_class'] = 0
    predicted_probs.loc[predicted_probs[1] >= threshold,
                        'predicted_class'] = 1
    cm = metrics.confusion_matrix(y_eval, predicted_probs['predicted_class'])

    # Print the matrix right after its legend; the legend used to be
    # printed on its own, with no matrix following it.
    print('confusion matrix')
    print('|T neg, F pos|\n|F neg, T pos|')
    print(cm)

    return cm

   

### SVM

In [8]:
# SVM
def run_svm_model(x_data_scaled, y_data, x_test=None,
                  y_test=None, kernel='linear', threshold=.5, use_test_sets=False):
    '''
    Runs an SVM model on your data and returns a confusion matrix for
    either the training set or the test set.
    Note: this will run much faster if you scale your x data first

    Inputs:
        x_data_scaled (pandas dataframe): predictors used to fit the model
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set; needed when use_test_sets is True
        y_test: outcomes of the test set; needed when use_test_sets is True
        kernel (str): the kernel handed to sklearn's svm.SVC
        threshold (float): predicted probability needed to call a row
                           positive.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.

    Raises: ValueError if use_test_sets is True but test data is missing.
    '''
    if use_test_sets and (x_test is None or y_test is None):
        raise ValueError('x_test and y_test are required when '
                         'use_test_sets is True')

    # Honour the kernel argument (it used to be ignored in favour of a
    # hard-coded 'linear'). probability=True is needed for predict_proba.
    mymodel = svm.SVC(kernel=kernel, probability=True)
    mymodel.fit(x_data_scaled, y_data)

    if use_test_sets:
        print('Returning test set performance:')
        x_eval, y_eval = x_test, y_test
    else:
        print('Returning training set performance:')
        x_eval, y_eval = x_data_scaled, y_data

    predicted_probs = pd.DataFrame(mymodel.predict_proba(x_eval))
    # Rows whose probability in column 1 reaches the threshold are positives.
    predicted_probs['predicted_class'] = 0
    predicted_probs.loc[predicted_probs[1] >= threshold,
                        'predicted_class'] = 1

    return metrics.confusion_matrix(y_eval, predicted_probs['predicted_class'])

### Random Forests

In [9]:
# Random Forest
def run_forest(x_data, y_data, x_test=None, y_test=None,
               my_n_estimators=10, my_max_depth=5, threshold=.5, use_test_sets=False):
    '''
    Fits a random forest on the training data and returns a confusion
    matrix for either the training set or the test set.

    Inputs:
        x_data (pd dataframe) : our predictor data
        y_data (pd dataframe) : our outcome data
        x_test: predictors of the test set (used when use_test_sets is True)
        y_test: outcomes of the test set (used when use_test_sets is True)
        my_n_estimators (int): the number of trees in our forest
        my_max_depth (int): the max number of levels in our trees
        threshold (float): predicted probability needed to call a row
                           positive.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.
    '''
    forest = RandomForestClassifier(n_estimators=my_n_estimators,
                                    max_depth=my_max_depth)
    forest.fit(x_data, y_data)

    # Pick the data set we evaluate on.
    if use_test_sets:
        print('Returning test set performance:')
        eval_x, eval_y = x_test, y_test
    else:
        print('Returning training set performance:')
        eval_x, eval_y = x_data, y_data

    probs = pd.DataFrame(forest.predict_proba(eval_x))
    # 1 where the probability in column 1 reaches the threshold, else 0.
    probs['predicted_class'] = (probs[1] >= threshold).astype(int)

    return metrics.confusion_matrix(eval_y, probs['predicted_class'])

### Boosting

In [10]:
# Boosting
def run_boosted_model(x_data, y_data, x_test=None, y_test=None,
                      my_max_depth=5, my_n_estimators=10, threshold=.5, use_test_sets=False):
    '''
    Run a boosted decision tree model (AdaBoost on decision trees) and
    return a confusion matrix for either the training set or the test set.

    Inputs:
        x_data (pandas dataframe): data frame where each column is a predictor
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set; needed when use_test_sets is True
        y_test: outcomes of the test set; needed when use_test_sets is True
        my_max_depth (int): the maximum depth of each boosted tree
        my_n_estimators (int): the number of boosting rounds
        threshold (float): predicted probability needed to call a row
                           positive.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.

    Raises: ValueError if use_test_sets is True but test data is missing.
    '''
    if use_test_sets and (x_test is None or y_test is None):
        raise ValueError('x_test and y_test are required when '
                         'use_test_sets is True')

    # Use the depth and number of estimators the caller asked for; they used
    # to be ignored in favour of the hard-coded values 5 and 10.
    mymodel = AdaBoostClassifier(
        tree.DecisionTreeClassifier(max_depth=my_max_depth),
        algorithm="SAMME",
        n_estimators=my_n_estimators)
    mymodel.fit(x_data, y_data)

    if use_test_sets:
        print('Returning test set performance:')
        x_eval, y_eval = x_test, y_test
    else:
        print('Returning training set performance:')
        x_eval, y_eval = x_data, y_data

    predicted_probs = pd.DataFrame(mymodel.predict_proba(x_eval))
    # Rows whose probability in column 1 reaches the threshold are positives.
    predicted_probs['predicted_class'] = 0
    predicted_probs.loc[predicted_probs[1] >= threshold,
                        'predicted_class'] = 1

    return metrics.confusion_matrix(y_eval, predicted_probs['predicted_class'])

### Bagging

In [11]:
# Bagging
def run_bagging_model(x_data, y_data, x_test=None, y_test=None, threshold=.5, use_test_sets=False):
    '''
    Runs a bagging model built on k-nearest-neighbor classifiers and
    returns a confusion matrix for either the training set or the test set.

    Each base classifier is trained on up to 100 rows and up to 1000
    columns; both limits are capped at the size of x_data.

    Inputs:
        x_data (pandas dataframe): data frame where each column is a predictor
        y_data (pandas series): series of outcomes
        x_test: predictors of the test set; needed when use_test_sets is True
        y_test: outcomes of the test set; needed when use_test_sets is True
        threshold (float): predicted probability needed to call a row
                           positive.
        use_test_sets (bool): evaluate on the test set instead of the
                              training set.

    Returns: the confusion matrix from sklearn's metrics.confusion_matrix,
             called with the true outcomes first and the predictions second.

    Raises: ValueError if use_test_sets is True but test data is missing.
    '''
    if use_test_sets and (x_test is None or y_test is None):
        raise ValueError('x_test and y_test are required when '
                         'use_test_sets is True')

    # Integer max_samples / max_features larger than the data make sklearn
    # raise a ValueError, so never ask for more than x_data actually has.
    n_samples, n_features = np.shape(x_data)
    mymodel = BaggingClassifier(KNeighborsClassifier(n_neighbors=16),
                                max_samples=min(100, n_samples),
                                max_features=min(1000, n_features))
    mymodel.fit(x_data, y_data)

    if use_test_sets:
        print('Returning test set performance:')
        x_eval, y_eval = x_test, y_test
    else:
        print('Returning training set performance:')
        x_eval, y_eval = x_data, y_data

    predicted_probs = pd.DataFrame(mymodel.predict_proba(x_eval))
    # Rows whose probability in column 1 reaches the threshold are positives.
    predicted_probs['predicted_class'] = 0
    predicted_probs.loc[predicted_probs[1] >= threshold,
                        'predicted_class'] = 1

    return metrics.confusion_matrix(y_eval, predicted_probs['predicted_class'])

### Looping through models to compare performance

In [12]:
def compare_models(x_data=None, x_data_scaled=None, y_data=None,
                   x_test=None, x_test_scaled=None, y_test=None, use_test_data=False,
                   run_bagging=False, run_boosted=False, run_a_forest=False,
                   run_svm=False, run_knn=False, run_logit=False, run_tree=False,
                   mythresholds=None,
                   my_max_depth=5, my_n_estimators=10,
                   num_n=8,
                   outcome_labels=None,
                   mykernel='linear'):
    '''
    Compare all our models.

    Every selected model is fit once per threshold. For each fit we record
    the true positive rate, the false positive rate and the precision taken
    from the confusion matrix the model function returns. The results
    (plus a 'baseline' set of rows where tpr equals fpr) are printed, saved
    to a csv file named after a random number, and drawn as a line plot of
    tpr against fpr, one line per model. An area under the curve is printed
    for each model.

    Inputs:
        x_data, y_data: training predictors and outcomes
        x_data_scaled: scaled training predictors (used by the SVM only)
        x_test, y_test: test predictors and outcomes
        x_test_scaled: scaled test predictors (used by the SVM only)
        use_test_data (bool): evaluate on the test set instead of the
                              training set
        run_bagging, run_boosted, run_a_forest, run_svm, run_knn,
        run_logit, run_tree (bool): which models to run
        mythresholds (list of float): probability thresholds to try;
                                      defaults to [.5]
        my_max_depth (int): tree depth for the boosted, forest and tree models
        my_n_estimators (int): number of estimators for the boosted and
                               forest models
        num_n (int): number of neighbors for the knn model
        outcome_labels: passed on to run_tree_model
        mykernel (str): kernel passed on to run_svm_model

    Does not return anything.
    '''
    # Avoid a mutable default argument; None stands for the old [.5].
    if mythresholds is None:
        mythresholds = [.5]

    # One entry per model, in the order the results are reported:
    # (was it requested?, label in the results table, function that fits
    # the model for one threshold and returns its confusion matrix).
    candidates = [
        (run_bagging, 'bagging',
         lambda t: run_bagging_model(x_data, y_data, threshold=t,
                                     x_test=x_test, y_test=y_test,
                                     use_test_sets=use_test_data)),
        (run_boosted, 'boosted',
         lambda t: run_boosted_model(x_data=x_data, y_data=y_data,
                                     my_max_depth=my_max_depth,
                                     my_n_estimators=my_n_estimators,
                                     threshold=t,
                                     x_test=x_test, y_test=y_test,
                                     use_test_sets=use_test_data)),
        (run_a_forest, 'forest',
         lambda t: run_forest(x_data, y_data, x_test=x_test, y_test=y_test,
                              my_n_estimators=my_n_estimators,
                              my_max_depth=my_max_depth, threshold=t,
                              use_test_sets=use_test_data)),
        (run_svm, 'svm',
         lambda t: run_svm_model(x_data_scaled, y_data, threshold=t,
                                 x_test=x_test_scaled, y_test=y_test,
                                 use_test_sets=use_test_data,
                                 kernel=mykernel)),
        (run_knn, 'knn',
         lambda t: run_knn_model(x_data, y_data, num_n=num_n, threshold=t,
                                 x_test=x_test, y_test=y_test,
                                 use_test_sets=use_test_data)),
        (run_logit, 'logit',
         lambda t: run_logit_model(x_data, y_data=y_data, threshold=t,
                                   x_test=x_test, y_test=y_test,
                                   use_test_sets=use_test_data)),
        (run_tree, 'tree',
         lambda t: run_tree_model(x_data,
                                  y_data=y_data,
                                  max_depth=my_max_depth,
                                  outcome_labels=outcome_labels, threshold=t,
                                  x_test=x_test, y_test=y_test,
                                  use_test_sets=use_test_data)),
    ]

    model = []
    fpr = []
    tpr = []
    precision = []
    current_threshold = []
    for requested, label, fit_and_score in candidates:
        if not requested:
            continue
        for t in mythresholds:
            cm = fit_and_score(t)
            # The row index of cm is the true class and the column index the
            # predicted class (true outcomes are passed first to sklearn).
            tpr.append(cm[1][1] / sum(cm[1]))
            fpr.append(cm[0][1] / sum(cm[0]))
            model.append(label)
            current_threshold.append(t)
            precision.append(cm[1][1] / (cm[1][1] + cm[0][1]))
            print('\n')

    # Baseline rows reuse every recorded fpr as both x and y, i.e. they lie
    # on the diagonal of the plot.
    rows_to_add = len(current_threshold)
    to_plot = pd.DataFrame({'model': model + ['baseline'] * rows_to_add,
                            'tpr': tpr + fpr[0:rows_to_add],
                            'fpr': fpr + fpr[0:rows_to_add],
                            'precision': precision + [None] * rows_to_add,
                            'threshold': current_threshold + [None] * rows_to_add,
                            })
    print('\n\n')
    for m in to_plot['model'].unique():
        is_m = to_plot['model'] == m
        # NOTE(review): the sign flip presumably assumes the thresholds are
        # given in increasing order, so that fpr decreases along each curve
        # -- confirm against callers.
        auc = -1*np.trapz(y=to_plot.loc[is_m, 'tpr'],
                          x=to_plot.loc[is_m, 'fpr'])
        print('AUC for ', m, ' is', auc)

    print('\n\nTable of results:')
    print(to_plot)
    # The results file is named after a random number, which is printed so
    # the file can be found afterwards.
    random_number = random.randint(1, 100000)
    print(random_number)
    to_plot.to_csv(str(random_number) + ".csv")
    plt.clf()
    sns.lineplot(x=to_plot['fpr'],
                 y=to_plot['tpr'],
                 hue=to_plot['model'])
    
    

## Function to split data into test and training data

In [13]:
def split_using_date(data, train_start_date, train_end_date,
                     test_end_date='2014-01-01'):
    '''
    Splits our data into test and training sets using dates the user provides.

    The training set holds the rows whose date_posted lies between
    train_start_date and train_end_date, both ends included. The test set
    holds the rows whose date_posted lies strictly after train_end_date and
    strictly before test_end_date.

    Inputs:
        data (pandas dataframe): dataframe with a date_posted column
        train_start_date: first date of the training period
        train_end_date: last date of the training period
        test_end_date: date at which the test period stops (not included)

    Returns: a list [train, test] of two dataframes.
    '''
    # Work on the dataframe passed in (the old code read a global variable
    # called projects and ignored this argument).
    posted = data['date_posted']
    # Plain comparisons instead of between(..., inclusive=True/False): the
    # boolean form of inclusive is rejected by newer pandas versions.
    in_train = (posted >= train_start_date) & (posted <= train_end_date)
    in_test = (posted > train_end_date) & (posted < test_end_date)

    train = data.loc[in_train, :]
    test = data.loc[in_test, :]

    return [train, test]