In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, Lasso
from scipy.stats import randint as sp_randint
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from scipy.stats import expon
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# Seed NumPy's global random number generator so that any code drawing from
# np.random in this notebook behaves the same way on every run.
np.random.seed(0)

In [2]:
# Load the four moving-average datasets, keyed by the window length as a
# string ('15', '30', '60', '90'). The key is used to build the path rather
# than being sliced back out of it, so it no longer depends on the length of
# the directory prefix.
dfs = {}
for window in ('15', '30', '60', '90'):
    dfs[window] = pd.read_csv('../data/MovAve{}_Data.csv'.format(window))

In [3]:
class CustomKFold:
    """Year-stratified K-fold splitter restricted to rows with ``year < 2019``.

    Exposes the ``split`` / ``get_n_splits`` pair that scikit-learn search
    objects call on a custom ``cv`` argument.

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds produced by ``split``.
    """

    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def split(self, df, y, groups=None):
        """Yield ``(train_idx, valid_idx)`` pairs of positional index arrays.

        Only rows whose ``year`` is below 2019 take part in the folds, and the
        folds are stratified on ``year``. The yielded indices are positions in
        ``df`` itself, so they stay correct even when ``df`` still contains
        rows with ``year >= 2019`` that precede some of the training rows.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature frame containing a ``year`` column.
        y, groups
            Accepted for interface compatibility; not used.
        """
        # Kept as a function-scope import, as in the original cell.
        from sklearn.model_selection import StratifiedKFold

        sk = StratifiedKFold(random_state=0, n_splits=self.n_splits, shuffle=True)

        # Positions (not index labels) of the rows allowed into the folds.
        train_pos = np.flatnonzero((df['year'] < 2019).values)
        train_years = df['year'].values[train_pos]

        for train_idx, valid_idx in sk.split(np.zeros(train_pos.shape[0]), train_years):
            # StratifiedKFold indexes into the filtered subset; translate back
            # to positions in the frame that was passed in.
            yield (train_pos[train_idx], train_pos[valid_idx])

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds; all arguments are ignored."""
        return self.n_splits

In [14]:
class KFold_Scaled:
    """Year-stratified K-fold splitter for frames whose ``year`` is rescaled.

    Rows with ``year < 1`` are treated as training rows. Elsewhere in this
    notebook non-binary columns are min-max scaled to ``[-1, 1]``, so this
    threshold presumably excludes the most recent year (confirm against the
    frame actually passed in).

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds produced by ``split``.
    """

    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def split(self, df, y, groups=None):
        """Yield ``(train_idx, valid_idx)`` pairs of positional index arrays.

        The stratification labels are built from the *filtered* years, so the
        label vector always has the same length as the set of rows being
        split, and the yielded indices are positions in ``df`` itself.

        Parameters
        ----------
        df : pandas.DataFrame
            Feature frame containing a (scaled) ``year`` column.
        y, groups
            Accepted for interface compatibility; not used.
        """
        # Kept as function-scope imports, as in the original cell.
        from sklearn.model_selection import StratifiedKFold
        from sklearn.preprocessing import LabelEncoder

        sk = StratifiedKFold(random_state=0, n_splits=self.n_splits, shuffle=True)

        # Positions (not index labels) of the rows allowed into the folds.
        train_pos = np.flatnonzero((df['year'] < 1).values)

        # Scaled years are floats; encode them as integer class labels before
        # handing them to the stratified splitter.
        year_labels = LabelEncoder().fit_transform(df['year'].values[train_pos])

        for train_idx, valid_idx in sk.split(np.zeros(train_pos.shape[0]), year_labels):
            # Translate subset positions back to positions in ``df``.
            yield (train_pos[train_idx], train_pos[valid_idx])

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds; all arguments are ignored."""
        return self.n_splits

In [17]:
# Per-dataset results, keyed by moving-average window ('15', '30', '60', '90'):
#   preds[name] -> predicted probability of the positive class on the hold-out rows
#   tests[name] -> actual Target_Var values for the same rows
preds = {}
tests = {}

for name, df in dfs.items():

    # Rescale every column with more than two distinct values to [-1, 1];
    # columns with at most two distinct values are left as they are.
    data_scaled = df.copy()
    scalers = {}

    for col in df:
        if df[col].nunique() > 2:
            scalers[col] = MinMaxScaler((-1, 1))
            data_scaled[col] = scalers[col].fit_transform(
                df[col].values.reshape(-1, 1)).ravel()

    # After scaling, the largest year sits at the top of the range (1) and is
    # used as the hold-out set. Compare with a tolerance: the scaled maximum is
    # the result of floating-point arithmetic and is not guaranteed to be
    # exactly 1.0, in which case `== 1` would select no rows at all.
    is_test = np.isclose(data_scaled['year'].values, 1)
    train = data_scaled[(data_scaled['year'] < 1) & ~is_test]
    test = data_scaled[is_test]

    x_train = train.drop(['Target_Var'], axis=1)
    y_train = train['Target_Var']

    x_test = test.drop(['Target_Var'], axis=1)
    y_test = test['Target_Var']

    # NOTE: pickle.load can execute arbitrary code. Only load model files that
    # were produced by this project; never point this at an untrusted file.
    with open('best_lr_model_{}.pckl'.format(name), 'rb') as f:
        lr = pickle.load(f)

    accuracy = lr.score(x_test, y_test)
    print('best estimator accuracy for {}: {:.4f}'.format(name, accuracy))
    preds[name] = lr.predict_proba(x_test)[:,1]
    tests[name] = y_test




best estimator accuracy for 15: 0.5136




best estimator accuracy for 30: 0.5153




best estimator accuracy for 60: 0.5171
best estimator accuracy for 90: 0.4966




In [18]:

def check_accuracy_data(threshold, prediction, actual):
    '''
    Score only the confident predictions of a binary classifier.

    A probability at or above `threshold` counts as a prediction of class 1,
    a probability at or below `1 - threshold` counts as a prediction of
    class 0, and everything in between is left out of the calculation.

    Returns a list `[threshold, accuracy, subsample_fraction]`, where
    `accuracy` is the share of scored predictions that match `actual` and
    `subsample_fraction` is the share of all predictions that were scored.
    When nothing meets the threshold the result is `[threshold, None, 0]`.
    '''
    lower_cutoff = 1 - threshold
    hits = []
    for prob, label in zip(prediction, actual):
        if prob >= threshold:
            hits.append(1 == label)
        elif prob <= lower_cutoff:
            hits.append(0 == label)

    n_scored = len(hits)
    if n_scored == 0:
        return [threshold, None, 0]
    return [threshold, hits.count(True) / n_scored, n_scored / len(prediction)]


'''
For the code below, set the predictions list to your predicted probabilities
in 15, 30, 60, 90 order, and likewise set the list of actual outcomes.
Change 'your_model_type' (2 separate inputs) below to a 2-4 character string
that describes your model (RF, NN, GB, LR, etc.).
Please push the csv to git once complete.
'''
# Confidence thresholds to evaluate, and the moving-average windows to report.
thresholds = [.52, .54, .55, .56]
mov_avs = [15,30,60,90]

# Pull the per-window results out of the `preds` / `tests` dicts by key.
# The actual outcomes go into their own list instead of rebinding `tests`,
# so this cell can be re-run without `tests['15']` failing on a list.
predictions = [preds[str(m)] for m in mov_avs]
actuals = [tests[str(m)] for m in mov_avs]

# One row per (dataset, threshold): [threshold, accuracy, fraction scored, window, model].
lists_ = [
    check_accuracy_data(t, p, a) + [m, 'LR']
    for p, a, m in zip(predictions, actuals, mov_avs)
    for t in thresholds
]

df = pd.DataFrame(
    lists_,
    columns=['Threshold', 'Accuracy', 'Percent_of_Samples', 'Dataset', 'Model'],
)

# Save file with your model name
df.to_csv('../data/accuracy_dataframe_lr.csv')
