In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint as sp_randint
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global random state (legacy np.random API) with a fixed value.
np.random.seed(0)

In [2]:
# Load one prepared dataset per moving-average window.
# The window size is listed explicitly and used both to build the path and as
# the dict key, instead of slicing characters out of the path by position
# (which silently breaks if the directory prefix or file naming changes).
# Keys stay strings ('15', '30', '60', '90') because later cells index with them.
dfs = {}
for window in ('15', '30', '60', '90'):
    dfs[window] = pd.read_csv('../data/MovAve{}_Data.csv'.format(window))

In [3]:
class CustomKFold:
    '''
    K-fold splitter that only uses rows with ``year < 2019`` and stratifies
    the folds on the ``year`` column.

    It exposes ``split`` and ``get_n_splits``, the two methods scikit-learn
    expects from a custom ``cv`` object.
    NOTE(review): the call site is not visible in this part of the notebook;
    presumably it is passed as ``cv=`` to a search such as RandomizedSearchCV.
    '''

    def __init__(self, n_splits=5):
        '''
        Parameters
        ----------
        n_splits : int, default 5
            Number of folds handed to StratifiedKFold.
        '''
        self.n_splits = n_splits

    def split(self, df, y, groups=None):
        '''
        Yield ``(train_idx, valid_idx)`` pairs of positional row indices.

        Parameters
        ----------
        df : DataFrame-like with a ``'year'`` column
            Data being split. Only rows with ``year < 2019`` can appear in
            any fold.
        y : ignored
            Accepted for API compatibility; stratification uses ``df['year']``.
        groups : ignored

        Yields
        ------
        (ndarray, ndarray)
            Positions *into ``df``* of the training and validation rows.

        Raises
        ------
        ValueError
            Propagated from StratifiedKFold, e.g. when fewer eligible rows
            than ``n_splits`` are available.
        '''
        from sklearn.model_selection import StratifiedKFold

        # Fixed seed so the shuffled folds are the same on every call.
        sk = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=0)

        years = np.asarray(df['year'])
        # Positions (in df) of the rows that are allowed into the folds.
        eligible = np.flatnonzero(years < 2019)

        for fold_train, fold_valid in sk.split(np.zeros(eligible.size), years[eligible]):
            # StratifiedKFold indexes the filtered subset; translate back to
            # positions in df so the indices stay valid even when df still
            # contains rows from 2019 onwards. When df is already filtered
            # this mapping is the identity.
            yield eligible[fold_train], eligible[fold_valid]

    def get_n_splits(self, X=None, y=None, groups=None):
        '''Return the configured number of folds; all arguments are ignored.'''
        return self.n_splits

In [9]:
# Score the persisted model of every dataset on its 2019 rows and keep, per
# dataset key, column 1 of predict_proba and the matching true labels.
preds, tests = {}, {}

for name, df in dfs.items():

    # Year-based split: rows before 2019 vs. rows from 2019.
    train, test = df[df['year'] < 2019], df[df['year'] == 2019]

    # Separate the label column from the remaining columns.
    y_train = train['Target_Var']
    x_train = train.drop(columns=['Target_Var'])

    y_test = test['Target_Var']
    x_test = test.drop(columns=['Target_Var'])

    # Restore the artefacts saved for this dataset.
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load files from a trusted source.
    with open('../results/best_gb_model_{}.pckl'.format(name), 'rb') as f:
        gb = pickle.load(f)
    # NOTE(review): gb_cv is loaded but not used anywhere in this cell.
    with open('../results/best_gb_model_cv_{}.pckl'.format(name), 'rb') as f:
        gb_cv = pickle.load(f)
    with open('../results/cols_{}.pckl'.format(name), 'rb') as f:
        cols = pickle.load(f)

    # Only the persisted column selection is fed to the model.
    accuracy = gb.score(x_test[cols], y_test)
    print('best estimator accuracy for {}: {:.4f}'.format(name, accuracy))
    preds[name] = gb.predict_proba(x_test[cols])[:, 1]
    tests[name] = y_test


best estimator accuracy for 15: 0.4934
best estimator accuracy for 30: 0.4959
best estimator accuracy for 60: 0.5036
best estimator accuracy for 90: 0.5154


In [10]:

def check_accuracy_data(threshold, prediction, actual):
    '''
    Compute accuracy using only the predictions that are confident enough.

    A probability ``p`` is treated as a confident prediction of class 1 when
    ``p >= threshold`` and of class 0 when ``p <= 1 - threshold``; anything
    in between is skipped.

    Parameters
    ----------
    threshold : float
        Confidence cut-off applied symmetrically around 0.5.
    prediction : sized iterable of float
        Predicted probabilities; must support ``len()``.
    actual : iterable
        True labels (0/1), paired element-wise with ``prediction``. If the
        two differ in length, pairing stops at the shorter one.

    Returns
    -------
    list
        ``[threshold, accuracy, subsample_fraction]`` where ``accuracy`` is
        the share of confident predictions that were correct (``None`` when
        no prediction met the threshold) and ``subsample_fraction`` is the
        share of ``prediction`` that met it, as a value in [0, 1] (``0`` when
        none did).
    '''
    n_confident = 0
    n_correct = 0
    lower_cutoff = 1 - threshold

    for prob, truth in zip(prediction, actual):
        if prob >= threshold:
            predicted_label = 1
        elif prob <= lower_cutoff:
            predicted_label = 0
        else:
            # Not confident either way: excluded from the accuracy sample.
            continue
        n_confident += 1
        n_correct += bool(predicted_label == truth)

    if n_confident == 0:
        return [threshold, None, 0]
    return [threshold, n_correct / n_confident, n_confident / len(prediction)]


# Build the thresholded-accuracy table for this model and save it as CSV.
#
# For the below, `predictions` must hold your predict_proba outputs in
# 15, 30, 60, 90 order, and `actuals` the matching actual outcomes.
# Change the model tag (two separate places: 'GB' in the rows and 'gb' in the
# output filename) to a 2-4 character string that describes your model
# (RF, NN, GB, LR, etc.).
# Please push the CSV to git once complete.
thresholds = [.52, .54, .55, .56]
mov_avs = [15, 30, 60, 90]
predictions = [preds[str(m)] for m in mov_avs]
# Bound to a new name on purpose: rebinding `tests` from a dict to a list made
# this cell raise TypeError as soon as it was run a second time.
actuals = [tests[str(m)] for m in mov_avs]

lists_ = []
for p, a, m in zip(predictions, actuals, mov_avs):
    for t in thresholds:
        # Each row: [threshold, accuracy, fraction of samples, dataset, model].
        lists_.append(check_accuracy_data(t, p, a) + [m, 'GB'])

df = pd.DataFrame(lists_)
df.columns = ['Threshold', 'Accuracy', 'Percent_of_Samples', 'Dataset', 'Model']

# Save file with your model name
df.to_csv('../data/accuracy_dataframe_gb.csv')


In [13]:
!head ../data/accuracy_dataframe_gb.csv -n 20

,Threshold,Accuracy,Percent_of_Samples,Dataset,Model
0,0.52,0.4935064935064935,0.8450395083406497,15,GB
1,0.54,0.5003317850033179,0.6615452151009658,15,GB
2,0.55,0.5007564296520424,0.5803336259877085,15,GB
3,0.56,0.5056374674761491,0.5061457418788411,15,GB
4,0.52,0.4925925925925926,0.7303877366997295,30,GB
5,0.54,0.5064695009242144,0.4878268710550045,30,GB
6,0.55,0.49655963302752293,0.3931469792605951,30,GB
7,0.56,0.49850746268656715,0.30207394048692515,30,GB
8,0.52,0.49468462083628634,0.6786916786916787,60,GB
9,0.54,0.4874715261958998,0.4223184223184223,60,GB
10,0.55,0.49544072948328266,0.3164983164983165,60,GB
11,0.56,0.4939759036144578,0.23953823953823955,60,GB
12,0.52,0.5218487394957984,0.6201146430432517,90,GB
13,0.54,0.4966887417218543,0.31474726420010424,90,GB
14,0.55,0.5097560975609756,0.2136529442417926,90,GB
15,0.56,0.49825783972125437,0.1495570609692548,90,GB
