## To do:
 * Build custom transformers to push the data through the preprocessing pipeline
 * Split the data into training and test sets 
 * Perform a grid search and choose best models using CV
 * Compare models

In [4]:
import pandas as pd
import numpy as np
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from matplotlib import pyplot as plt

%matplotlib inline

In [145]:
# load the data
data = pd.read_csv('C:/Users/mateu/PYTHON/Datasets/UCI_Credit_Card.csv')

data.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [146]:
data['y'] = data['default.payment.next.month']
data.drop(['ID', 'default.payment.next.month'], axis=1, inplace=True)
# First we have to divide the feature set into categorical and numerical

In [147]:
data.describe()
X = data.drop('y', axis=1)
y = data['y']

In [148]:
predictors = X.columns.tolist()

In [181]:
num_predictors = []

for var in range(len(predictors)):
    current_predictor = predictors[var]
    if current_predictor.find('AMT') >= 0 or current_predictor.find('PAY') >= 0:
        num_predictors.append(predictors[var])

if not 'AGE' in num_predictors and 'LIMIT_BAL' in num_predictors:
    num_predictors.extend(['AGE', 'LIMIT_BAL'])
else:
    print('Predictors already in the list')
          
cat_predictors = ['SEX', 'MARRIAGE', 'EDUCATION']

Predictors already in the list


In [151]:
# Create a feature selector transformer
class FeatureSelector(BaseEstimator, TransformerMixin):
    #Class construtror
    def __init__(self, feature_names):
        self._feature_names = feature_names
    
    #Noting to do here
    def fit(self, X, y=None):
        return self
        
    # Method that describes what the transformer needs to do 
    def transform(self, X, y=None):
        return X[self._feature_names]

In [182]:
# Transformer that handles categorical predictors

#class NumericalTransformer(BaseEstimator, TransformerMixin):


num_pipeline = Pipeline(steps=[
    ('num_selector', FeatureSelector(num_predictors)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())])

categorical_pipeline = Pipeline(steps=[
    ('cat_selector', FeatureSelector(cat_predictors)),
    ('one_hot_encoder', OneHotEncoder( sparse = False ))])

# Combine both pipelines

full_pipeline = FeatureUnion(transformer_list = [
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', num_pipeline)
])

In [137]:
# Combine the pipeline with an estimator

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=73089)

In [184]:
full_pipeline_m = Pipeline(steps=[
    ('full_pipeline', full_pipeline),
    ('model', DecisionTreeClassifier())
])

In [185]:
full_pipeline_m.fit(X_train, y_train)

y_pred = full_pipeline_m.predict(X_test)

In [186]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.723

# Select estimators and the list of hyperparameters and perform a grid search
 * Logistic regression
 * Decision tree
 * K-nearest neighbors

In [82]:
# load additional libraries
from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from numpy import bincount, linspace, mean, std, arange, squeeze
import itertools, time, datetime

In [None]:
# As data have been already split into training and test sets, we will use
# the training set to perform a grid search

In [78]:
from numpy import linspace, mean, std

def plotLearningCurve(_x_train, _y_train, learning_model_pipeline,  k_fold = 10, training_sample_sizes = linspace(0.1,1.0,10), jobsInParallel = 1):
    
    training_size, training_score, testing_score = learning_curve(estimator = learning_model_pipeline, \
                                                                  X = _x_train, \
                                                                  y = _y_train, \
                                                                  train_sizes = training_sample_sizes, \
                                                                  cv = k_fold, \
                                                                  n_jobs = jobsInParallel) 

    training_mean = mean(training_score, axis = 1)
    training_std_deviation = std(training_score, axis = 1)
    testing_std_deviation = std(testing_score, axis = 1)
    testing_mean = mean(testing_score, axis = 1 )

    plt.plot(training_size, training_mean, label= "Training Data", marker= '+', color = 'blue', markersize = 8)
    plt.fill_between(training_size, training_mean+ training_std_deviation, training_mean-training_std_deviation, color='blue', alpha =0.12 )

    plt.plot(training_size, testing_mean, label= "Testing/Validation Data", marker= '*', color = 'green', markersize = 8)
    plt.fill_between(training_size, testing_mean + testing_std_deviation, testing_mean - testing_std_deviation, 
                      color='green', alpha =0.14 )

    plt.title("Scoring of our training and testing data vs sample sizes")
    plt.xlabel("Number of Samples")
    plt.ylabel("Accuracy")
    plt.legend(loc= 'best')
    plt.show()

In [79]:
# Run grid search and predict

def runGridSearchAndPredict(pipeline, x_train, y_train, x_test, y_test, param_grid, n_jobs = 1, cv = 10, score = 'accuracy'):
    
    response = {}
    #training_timer       = CodeTimer('training')
    #testing_timer        = CodeTimer('testing')
    #learning_curve_timer = CodeTimer('learning_curve')
    #predict_proba_timer  = CodeTimer('predict_proba')
    
    #with training_timer:
    # pipeline is used as an estimator here
    gridsearch = GridSearchCV(estimator = pipeline, param_grid = param_grid, cv = cv, n_jobs = n_jobs, scoring = score)

    search = gridsearch.fit(x_train,y_train)

    print("Grid Search Best parameters ", search.best_params_)
    print("Grid Search Best score ", search.best_score_)
            
    #with testing_timer:
    y_prediction = gridsearch.predict(x_test)
            
    print("Accuracy score %s" %accuracy_score(y_test,y_prediction))
    print("F1 score %s" %f1_score(y_test,y_prediction))
    print("Classification report  \n %s" %(classification_report(y_test, y_prediction)))
    
    #with learning_curve_timer:
     #   plotLearningCurve(_x_train, _y_train, search.best_estimator_)
    plotLearningCurve(x_train, y_train, search.best_estimator_)
    
    #with predict_proba_timer:
    if hasattr(gridsearch.best_estimator_, 'predict_proba'):
            
        y_probability = gridsearch.predict_proba(x_test)
        false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_probability[:,1])
        response['roc_auc_score'] = roc_auc_score(y_test, y_probability[:,1])
        response['roc_curve'] = (false_positive_rate, true_positive_rate)
    
    else: ## eg SVM, Perceptron doesnt have predict_proba method
            
        response['roc_auc_score'] = 0
        response['roc_curve'] = None
    
    #response['learning_curve_time'] = learning_curve_timer.took
    #response['testing_time'] = testing_timer.took
    #response['training_time'] = training_timer.took
    response['_y_prediction'] = y_prediction
    response['accuracy_score'] = accuracy_score(y_test, y_prediction)
    response['f1_score']  = f1_score(y_test, y_prediction)

    return response
    

In [105]:
# create a list of classifiers and a dictionary of classifiers and parameters to evaluate
classifiers = [
    Perceptron(random_state = 1),
    LogisticRegression(random_state = 1),
    DecisionTreeClassifier(random_state = 1, criterion = 'gini'),
    KNeighborsClassifier(metric = 'minkowski'),
  #  RandomForestClassifier(random_state = 1, criterion = 'gini'),
  #  SVC(random_state = 1, kernel = 'rbf'),    
]


classifier_names = [
            'perceptron',
            'logisticregression',
            'decisiontreeclassifier',
            'kneighborsclassifier',
         #   'randomforestclassifier',
         #   'svc',               
]


classifier_param_grid = [
            
            {'perceptron__max_iter': [1,5,8,10], 'perceptron__eta0': [0.5,.4, .2, .1]},
            {'logisticregression__C':[100,200,300,50,20,600]},
            {'decisiontreeclassifier__max_depth':[6,7,8,9,10,11]},
            {'kneighborsclassifier__n_neighbors':[4,6,7,8]},
      #     {'randomforestclassifier__n_estimators':[1,2,3,5,6]},
      #     {'svc__C':[1], 'svc__gamma':[0.01]}
    
]

In [211]:
#or model, model_name, param_grid in zip(classifiers, classifier_names, classifier_param_grid):
# Example for Decision tree

full_pipeline.fit(X_test, y_test)
full_pipeline.fit(X_train, y_train)
_X_train = full_pipeline.transform(X_train)
_X_test = full_pipeline.transform(X_test)

# we want these actions to be performed by the runGridSearch function

array([[ 1.        ,  0.        ,  0.        , ..., -0.30826249,
        -0.30798627, -0.29748661],
       [ 1.        ,  0.        ,  0.        , ..., -0.18103483,
        -0.19968876, -0.26132497],
       [ 1.        ,  0.        ,  0.        , ..., -0.30826249,
        -0.24487584, -0.2686955 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.17467345,
        -0.27832437, -0.27733283],
       [ 0.        ,  1.        ,  0.        , ..., -0.117421  ,
        -0.11865497, -0.12473992],
       [ 0.        ,  1.        ,  0.        , ..., -0.18103483,
        -0.1817654 , -0.18232215]])

In [213]:
#pipeline.fit(X_train, y_train)
#pipeline.predict(X_test)
#pipeline.get_params().keys()
# if the estimator is not a pipeline than you can't supply any prefixes to the parameters 
params = {'n_neighbors':[4,6,7,8]}
gridsearch = GridSearchCV(estimator=KNeighborsClassifier(metric = 'minkowski'), cv=10, n_jobs=1, param_grid=params, scoring='accuracy')
gridsearch.fit(_X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(), n_jobs=1,
             param_grid={'n_neighbors': [4, 6, 7, 8]}, scoring='accuracy')

In [215]:
gridsearch.best_params_

{'n_neighbors': 8}

In [206]:
pipe = Pipeline(steps=[('StandardScaler', StandardScaler()),
                ('decisiontreeclassifier', DecisionTreeClassifier(random_state = 1, criterion = 'gini'))])

In [207]:
params =  {'decisiontreeclassifier__max_depth':[6,7,8,9,10,11]}

In [208]:
#pipe.get_params().keys()
full_pipeline.get_params().keys()

dict_keys(['n_jobs', 'transformer_list', 'transformer_weights', 'verbose', 'categorical_pipeline', 'numerical_pipeline', 'categorical_pipeline__memory', 'categorical_pipeline__steps', 'categorical_pipeline__verbose', 'categorical_pipeline__cat_selector', 'categorical_pipeline__one_hot_encoder', 'categorical_pipeline__cat_selector__feature_names', 'categorical_pipeline__one_hot_encoder__categories', 'categorical_pipeline__one_hot_encoder__drop', 'categorical_pipeline__one_hot_encoder__dtype', 'categorical_pipeline__one_hot_encoder__handle_unknown', 'categorical_pipeline__one_hot_encoder__sparse', 'numerical_pipeline__memory', 'numerical_pipeline__steps', 'numerical_pipeline__verbose', 'numerical_pipeline__num_selector', 'numerical_pipeline__imputer', 'numerical_pipeline__std_scaler', 'numerical_pipeline__num_selector__feature_names', 'numerical_pipeline__imputer__add_indicator', 'numerical_pipeline__imputer__copy', 'numerical_pipeline__imputer__fill_value', 'numerical_pipeline__imputer_

In [209]:
gridsearch = GridSearchCV(estimator=pipe, cv=10, n_jobs=1, param_grid=params, scoring='accuracy')
gridsearch.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('full_pipeline', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier(random_state=1))]),
             n_jobs=1,
             param_grid={'decisiontreeclassifier__max_depth': [6, 7, 8, 9, 10,
                                                               11]},
             scoring='accuracy')

In [1]:
print("Added a new line")

Added a new line
