In [34]:
# Import necessary packages
import pandas as pd
import numpy as np
import multiprocessing as mp
cores_to_use = mp.cpu_count() - 1 # Use one less core

In [35]:
# Load specific packages
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, make_scorer
from sklearn.model_selection import GridSearchCV

# Create a Dataset object
# File path is path to data
# init_splitter is a CV splitter
class CCDataset():
    def __init__(self, file_path, init_splitter): 
        self.file_path = file_path # Initial file load
        self.X, self.y = self.load_dataset() # Data object
        self.splitter = init_splitter
        
    def load_dataset(self):
        data_raw = pd.read_csv(self.file_path) # Read in data
        # If there were other necessary preprocessing steps, would put them here
        # In this case there isn't
        # Also Note: We would want Scaling steps to be separate
            # And Normalize on Train Data, apply same scaler to test data (not a new one)
        data_np = data_raw.values # Return only the data object
        X, y = data_np[:, :-1], data_np[:, -1] # Data and labels
        return X, y
    
    # reccomended accuracy function for heavily imbalanced data
    def pr_auc(self, y_true, preds):
        p, r, _ = precision_recall_curve(y_true, preds) # Returns a curve set
        return auc(r, p) # returns area under the curve
    
    # Function for updating CV Splitter to something new
    def make_splitter(self, cv_splitter):
        return cv_splitter
            
    # Grid search for optimization
    def run_grid_search(self, model, parameters):
        # Build scorer
        scorer = make_scorer(self.pr_auc, needs_proba=True)
        # Generate Grid Search Object
        grid = GridSearchCV(estimator=model, param_grid=parameters,
                           n_jobs=cores_to_use, cv=self.splitter,
                           scoring=scorer)
        # Fit Data
        grid.fit(self.X, self.y)
        return grid
        
    # Function for evaluating a model using pr_auc
    def evaluate_model(self, model, X, y):
        # The data splits we want to use
        # Generates a callable scoring function from custom loss func
        # Thus you can pass it to a grid search or something of that nature to compare different runs/models
        # NOTE: if I wanted to do a grid/random search, I could similarly pass the scorer/cv to those intializers!
        scorer = make_scorer(self.pr_auc, needs_proba=True)
        model_scores = cross_val_score(model, X, y, 
                                       scoring=scorer, # What method you are using for calculating cores
                                       cv=self.splitter, # How you are going to be generating your splits
                                       n_jobs = cores_to_use) # Number of processes
        
        return model_scores

In [36]:
# Pick Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

models = {'Decision Tree':DecisionTreeClassifier(),
         'Random Forest':RandomForestClassifier(),
         'Extra Trees':ExtraTreesClassifier(),
         'Bagging':BaggingClassifier()}

In [37]:
# Create dataset object and execute
import time

# Create Dataset Object
splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cc_dat = CCDataset('./creditcard.csv', splitter)

# Takes in models to consider, as well as a CCDataset object
def run_models(models, cc_dataset):
    results = []
    for key, model in models.items():
        start = time.time()
        scores = cc_dataset.evaluate_model(model, cc_dataset.X, cc_dataset.y)
        results.append(scores)
        end = time.time()
        run_time = str(round(end - start, 3))
        m = str(round(np.mean(scores), 3))
        s = str(round(np.std(scores), 3))
        print('Model = ' + key + ': ' + m + ', ' + s + \
              ', run_time = ' + run_time + 's')
        
run_models(models, cc_dat) # Run set of models

### Results
* Extra trees performed the best
* Can we get better performance that the default values
    * Let's do a random search
    * Might make the most sense to make a class function for this

In [38]:
# Grid Search on extra trees classifier
# This is a test - not intended to tune parameters
param_grid = {
    'n_estimators':[50, 100, 200],
    'bootstrap':[True, False]
}

grid_results = cc_dat.run_grid_search(models['Extra Trees'], param_grid)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
             estimator=ExtraTreesClassifier(), n_jobs=11,
             param_grid={'bootstrap': [True, False],
                         'n_estimators': [50, 100, 200]},
             scoring=make_scorer(pr_auc, needs_proba=True))

In [39]:
# Print Grid Results
grid_results.__dict__

{'scoring': make_scorer(pr_auc, needs_proba=True),
 'estimator': ExtraTreesClassifier(),
 'n_jobs': 11,
 'refit': True,
 'cv': RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
 'verbose': 0,
 'pre_dispatch': '2*n_jobs',
 'error_score': nan,
 'return_train_score': False,
 'param_grid': {'n_estimators': [50, 100, 200], 'bootstrap': [True, False]},
 'multimetric_': False,
 'best_index_': 5,
 'best_score_': 0.8654374524055811,
 'best_params_': {'bootstrap': False, 'n_estimators': 200},
 'best_estimator_': ExtraTreesClassifier(n_estimators=200),
 'refit_time_': 44.34875154495239,
 'scorer_': make_scorer(pr_auc, needs_proba=True),
 'cv_results_': {'mean_fit_time': array([ 40.11501992,  84.0108492 , 166.00723124,  63.03104612,
         124.25726318, 229.02655768]),
  'std_fit_time': array([ 1.23394263,  2.35385305,  5.54593163,  2.67439392,  3.18350877,
         29.34928   ]),
  'mean_score_time': array([0.18196044, 0.36727539, 0.7243084 , 0.20792242, 0.40488521,
         0.