In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import multiprocessing as mp
# Leave one core free for the rest of the system, but never go below 1:
# on a single-core machine cpu_count() - 1 would be 0, and joblib (used by
# scikit-learn for n_jobs) rejects n_jobs=0.
cores_to_use = max(1, mp.cpu_count() - 1)

In [2]:
# Load specific packages
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve, auc, make_scorer
from sklearn.model_selection import GridSearchCV

# Create a Dataset object
# file_path is the path to the CSV data
# The CV splitter is not passed in; evaluate_model builds its own
class CCDataset():
    """Tabular dataset wrapper that loads a CSV and cross-validates models on it.

    The CSV at ``file_path`` is read once, at construction time. Every column
    except the last is treated as a feature; the last column is the label.

    Attributes:
        file_path: Path of the CSV file the data was loaded from.
        X: NumPy array holding all columns but the last.
        y: NumPy array holding the last column.
    """

    def __init__(self, file_path):
        """Store ``file_path`` and immediately load ``X`` and ``y`` from it."""
        self.file_path = file_path # Initial file load
        self.X, self.y = self.load_dataset() # Data object

    def load_dataset(self):
        """Read ``self.file_path`` with pandas and split it into features/labels.

        Returns:
            A ``(X, y)`` tuple of NumPy arrays: ``X`` is every column but the
            last, ``y`` is the last column.
        """
        data_raw = pd.read_csv(self.file_path) # Read in data
        # If there were other necessary preprocessing steps, would put them here
        # In this case there aren't any
        # Also note: scaling steps should stay separate from loading:
            # fit the scaler on train data, then apply that same scaler to test data
        data_np = data_raw.values # Return only the data object
        X, y = data_np[:, :-1], data_np[:, -1] # Data and labels
        return X, y

    # Recommended accuracy function for heavily imbalanced data
    def pr_auc(self, y_true, preds):
        """Return the area under the precision-recall curve.

        Args:
            y_true: True labels, passed straight to ``precision_recall_curve``.
            preds: Predicted scores/probabilities for the positive class.

        Returns:
            The area computed by ``auc`` with recall on the x axis and
            precision on the y axis.
        """
        p, r, _ = precision_recall_curve(y_true, preds) # Returns a curve set
        return auc(r, p) # returns area under the curve

    # Function for evaluating a model using pr_auc
    def evaluate_model(self, model, X, y):
        """Cross-validate ``model`` on ``(X, y)``, scoring each fold with ``pr_auc``.

        Uses a ``RepeatedStratifiedKFold`` with 10 splits and 3 repeats
        (``random_state=1``), i.e. 30 fits per call. The number of parallel
        workers comes from the notebook-level ``cores_to_use`` variable.

        Args:
            model: Estimator to evaluate; it must expose ``predict_proba``
                because the scorer is fed predicted probabilities.
            X: Feature matrix.
            y: Labels.

        Returns:
            The array of per-fold scores returned by ``cross_val_score``.
        """
        # Local import: only needed here, to probe make_scorer's signature.
        import inspect

        # The data splits we want to use
        data_splits = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        # Generates a callable scoring function from the custom metric, so it
        # can also be passed to a grid/random search to compare runs/models.
        # scikit-learn 1.4 deprecated ``needs_proba`` in favour of
        # ``response_method``. On releases where ``needs_proba`` is no longer
        # a recognised argument, the scorer stops using predict_proba.
        # Use the new keyword when it exists, else keep the legacy flag.
        if "response_method" in inspect.signature(make_scorer).parameters:
            scorer = make_scorer(self.pr_auc, response_method="predict_proba")
        else:
            scorer = make_scorer(self.pr_auc, needs_proba=True)
        model_scores = cross_val_score(model, X, y,
                                       scoring=scorer, # What method you are using for calculating scores
                                       cv=data_splits, # How you are going to be generating your splits
                                       n_jobs=cores_to_use) # Number of processes

        return model_scores

In [3]:
# Pick Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

# Candidate models, keyed by the display name used when reporting results.
# The full comparison set is kept commented out for reference; only Extra Trees
# is active.
# models = {'Decision Tree':DecisionTreeClassifier(),
#          'Random Forest':RandomForestClassifier(),
#          'Extra Trees':ExtraTreesClassifier(),
#          'Bagging':BaggingClassifier()}
# Seed the estimator so repeated runs reproduce the same scores; the CV
# splitter in CCDataset.evaluate_model is already seeded with random_state=1.
models = {'Extra Trees': ExtraTreesClassifier(random_state=1)}

In [None]:
# Create dataset object and execute
import time

# Create Dataset Object
# NOTE: the constructor reads the CSV immediately (via load_dataset), so this
# line fails if './creditcard.csv' is not present in the working directory.
cc_dat = CCDataset('./creditcard.csv')

# Takes in models to consider, as well as a CCDataset object
def run_models(models, cc_dataset):
    """Evaluate every model on the dataset, print a summary line, return scores.

    Args:
        models: Mapping of display name -> estimator.
        cc_dataset: Object exposing ``X``, ``y`` and
            ``evaluate_model(model, X, y)`` (e.g. a ``CCDataset``).

    Returns:
        A list with one entry per model, in the iteration order of ``models``;
        each entry is whatever ``cc_dataset.evaluate_model`` returned for that
        model. Previously this list was built but never returned.
    """
    results = []
    for key, model in models.items():
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long cross-validation run.
        start = time.perf_counter()
        scores = cc_dataset.evaluate_model(model, cc_dataset.X, cc_dataset.y)
        elapsed = time.perf_counter() - start
        results.append(scores)

        run_time = str(round(elapsed, 3))
        m = str(round(np.mean(scores), 3))
        s = str(round(np.std(scores), 3))
        # Output format: "Model = <name>: <mean>, <std>, run_time = <seconds>s"
        print('Model = ' + key + ': ' + m + ', ' + s +
              ', run_time = ' + run_time + 's')

    return results
        
# Run set of models: cross-validates every entry in `models` on cc_dat and
# prints one summary line (mean, std, run time) per model.
run_models(models, cc_dat) # Run set of models

### Results
* Extra trees performed the best
* Can we get better performance than the default values?
    * Let's do a random search
    * Might make the most sense to make a class function for this

In [None]:
# Random Search on extra trees classifier
