
## Model Exploration

Active questions:
            # For linear model, allow for intercept
            # lm_features += t.columns.tolist()[1:]

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.metrics import roc_auc_score
import os
import argparse
import yaml
from model_utils import format_crash_data
from model_classes import Indata, Tuner, Tester
import json
import sklearn.ensemble as ske
import sklearn.svm as svm
import sklearn.linear_model as skl
from sklearn.model_selection import RandomizedSearchCV, KFold, GroupShuffleSplit
import xgboost as xgb
from sklearn import metrics

config_file = "../config/config_boston.yml"
datadir = "../../data/boston"
dataDir = datadir


In [2]:
with open(config_file) as f:
    config = yaml.safe_load(f)

## Function that defines our default configs

In [3]:
def set_defaults(config=None):
    """
    Sets defaults if not given in the config file.
    Default is just to use the open street map features and crash file.

    The dict is updated in place and nothing is returned; keys that are
    already present are never overwritten.

    args:
        config - dict of settings. When omitted, a fresh dict is filled in
            (and discarded), so no state is shared between calls.
    """
    # A `{}` default would be created once and shared by every call.
    if config is None:
        config = {}

    # seg_data is handled separately because falling back to it is announced.
    if 'seg_data' not in config:
        print('\n\n\n NOTE: WE ARE SETTING DEFAULT AS vz_predict_dataset.csv.gz WITHIN TRAIN_MODEL.py set_defaults \n\n\n')
        config['seg_data'] = 'vz_predict_dataset.csv.gz'

    # Built on every call so the list-valued defaults are never shared.
    defaults = {
        'concern': '',
        'atr': '',
        'tmc': '',
        'f_cont': ['width'],
        'process': True,
        'time_target': [15, 2017],
        'weeks_back': 1,
        'name': 'boston',
        'level': 'week',
    }
    for key, value in defaults.items():
        config.setdefault(key, value)

In [4]:
set_defaults(config)




 NOTE: WE ARE SETTING DEFAULT AS vz_predict_dataset.csv.gz WITHIN TRAIN_MODEL.py set_defaults 





In [5]:
a = os.path.abspath('train_model.py')
b = os.path.dirname(a)
c = os.path.dirname(b)
d = os.path.dirname(c)
print(a)
print(d)

C:\Users\Daniel\Documents\ML\Transurban\crash-model-master\src\models\train_model.py
C:\Users\Daniel\Documents\ML\Transurban\crash-model-master


In [6]:
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath('train_model.py'))))
DATA_FP = os.path.join(BASE_DIR, 'data', config['name'])
PROCESSED_DATA_FP = os.path.join(BASE_DIR, 'data', config['name'], 'processed/')
seg_data = os.path.join(PROCESSED_DATA_FP, config['seg_data'])

print(BASE_DIR)
print(DATA_FP)
print(PROCESSED_DATA_FP)
print(seg_data)

C:\Users\Daniel\Documents\ML\Transurban\crash-model-master
C:\Users\Daniel\Documents\ML\Transurban\crash-model-master\data\boston
C:\Users\Daniel\Documents\ML\Transurban\crash-model-master\data\boston\processed/
C:\Users\Daniel\Documents\ML\Transurban\crash-model-master\data\boston\processed/vz_predict_dataset.csv.gz


In [7]:
data = pd.read_csv(seg_data, dtype={'segment_id': 'str'})
data.head()

Unnamed: 0,segment_id,year,week,crash,width,SPEEDLIMIT,Struct_Cnd,Surface_Tp,F_F_Class,lanes,hwy_type,osm_speed,signal,oneway,width_per_lane,AADT,visionzero,seeclickfix
0,0,2015,1,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
1,0,2015,2,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
2,0,2015,3,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
3,0,2015,4,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
4,0,2015,5,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0


In [14]:
data.sort_values(['segment_id', 'year', 'week'], inplace=True)

In [15]:
config['level']

'segment'

## We define get_features, which builds the lists of categorical and continuous features to use

In [16]:
def get_features(config, data, datadir):
    """
    Get features from the feature list created during data generation

    Reads `features.yml` from `datadir`, keeps only the continuous features
    that are actually columns of `data`, and then appends the optional
    concern / ATR / TMC columns named in `config`.

    Args:
        config - dict; reads 'concern', 'atr', 'tmc' and, only when the
            matching path is set, 'atr_cols' / 'tmc_cols'
        data - DataFrame whose columns are checked for the continuous features
        datadir - directory containing features.yml
    Returns:
        (f_cat, f_cont, features) - the categorical list, the filtered
        continuous list, and the combined feature list
    """

    with open(os.path.join(datadir, 'features.yml')) as feature_file:
        feature_spec = yaml.safe_load(feature_file)

    print('\n Our features are:\n', json.dumps(feature_spec, indent=4, sort_keys=True))
    print('\n Data columns are:\n', data.columns.values)

    # segment chars
    # Dropping continuous features that don't exist
    f_cont = []
    for feat in feature_spec['f_cont']:
        if feat not in data.columns.values:
            print("Feature " + feat + " not found, skipping")
        else:
            f_cont.append(feat)
    f_cat = feature_spec['f_cat']

    # create featureset holder
    features = f_cont + f_cat

    # The *_cols keys are optional, so they are read with .get() here:
    # a config with no ATR/TMC data must not fail just for this logging.
    print('\nConfig[concern]', config['concern'])
    print('\nConfig[atr]', config.get('atr_cols'))
    print('\nConfig[tmc]', config.get('tmc_cols'))
    print(('\nSegment features included: {}'.format(features)))

    if config['concern'] != '':
        features.append(config['concern'])
    # When a path is configured the column list is required, so a missing
    # key still raises KeyError here rather than being silently ignored.
    if config['atr'] != '':
        features += config['atr_cols']
    if config['tmc'] != '':
        features += config['tmc_cols']

    print('\nf_cat', f_cat)
    print('\nf_cont', f_cont)
    print('\nfeatures', features)

    return f_cat, f_cont, features

In [18]:
f_cat, f_cont, features = get_features(config, data, PROCESSED_DATA_FP)


 Our features are:
 {
    "f_cat": [
        "width",
        "SPEEDLIMIT",
        "Struct_Cnd",
        "Surface_Tp",
        "F_F_Class"
    ],
    "f_cont": [
        "lanes",
        "hwy_type",
        "osm_speed",
        "signal",
        "oneway",
        "width_per_lane",
        "AADT"
    ]
}

 Data columns are:
 ['segment_id' 'year' 'week' 'crash' 'width' 'SPEEDLIMIT' 'Struct_Cnd'
 'Surface_Tp' 'F_F_Class' 'lanes' 'hwy_type' 'osm_speed' 'signal' 'oneway'
 'width_per_lane' 'AADT' 'visionzero' 'seeclickfix']

Config[concern] visionzero

Config[atr] ['speed_coalesced', 'volume_coalesced']

Config[tmc] ['Conflict']

Segment features included: ['lanes', 'hwy_type', 'osm_speed', 'signal', 'oneway', 'width_per_lane', 'AADT', 'width', 'SPEEDLIMIT', 'Struct_Cnd', 'Surface_Tp', 'F_F_Class']

f_cat ['width', 'SPEEDLIMIT', 'Struct_Cnd', 'Surface_Tp', 'F_F_Class']

f_cont ['lanes', 'hwy_type', 'osm_speed', 'signal', 'oneway', 'width_per_lane', 'AADT']

features ['lanes', 'hwy_type', '

In [19]:
data_segs = data.groupby('segment_id')[f_cont + f_cat].max()
print(len(data_segs))
print(len(data))
data_segs

22500
4612500


Unnamed: 0_level_0,lanes,hwy_type,osm_speed,signal,oneway,width_per_lane,AADT,width,SPEEDLIMIT,Struct_Cnd,Surface_Tp,F_F_Class
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2,0,0,0,0,15,18330,30,20,2,6,7
000,2,0,25,0,0,8,0,15,15,1,6,7
001,2,1,0,0,1,15,22222,30,20,2,6,3
0010,3,1,0,0,1,8,8974,24,35,1,6,3
00100,2,2,25,0,1,8,0,17,35,2,6,7
001000,2,0,25,0,1,6,0,12,20,2,6,7
0010000,2,0,0,0,0,6,0,12,20,2,6,7
0010001,2,0,0,0,0,6,0,13,20,0,0,7
0010002,2,0,0,0,0,6,0,13,20,0,0,7
0010003,2,0,0,0,0,4,0,9,20,1,6,7


In [20]:
data_segs.loc[['0']]

Unnamed: 0_level_0,lanes,hwy_type,osm_speed,signal,oneway,width_per_lane,AADT,width,SPEEDLIMIT,Struct_Cnd,Surface_Tp,F_F_Class
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,2,0,0,0,0,15,18330,30,20,2,6,7


In [21]:
data.loc[data['segment_id'] == '0']

Unnamed: 0,segment_id,year,week,crash,width,SPEEDLIMIT,Struct_Cnd,Surface_Tp,F_F_Class,lanes,hwy_type,osm_speed,signal,oneway,width_per_lane,AADT,visionzero,seeclickfix
0,0,2015,1,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
1,0,2015,2,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
2,0,2015,3,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
3,0,2015,4,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
4,0,2015,5,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
5,0,2015,6,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
6,0,2015,7,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
7,0,2015,8,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
8,0,2015,9,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0
9,0,2015,10,0,30,20,2,6,7,2,0,0,0,0,15,18330,0,0


In [22]:
data_segs.reset_index(inplace=True)

## We just finished getting the maximum of a number of features for each road segment. Now to add some extra features. We refer back to data, and use a similar .groupby call to get features by segment_id

In [23]:
def add_extra_features(data, data_segs, config, datadir, concern_year=2016):
    """
    Add concerns, atrs and tmcs
    Args:
        data - DataFrame with 'segment_id', 'year' and, when a concern is
            configured, the concern column
        data_segs - one-row-per-segment DataFrame with a 'segment_id' column
        config - dict; reads 'concern', 'atr', 'tmc' and, when the matching
            path is set, 'atr_cols' / 'tmc_cols'
        datadir - directory that the 'atr' / 'tmc' file names are relative to
        concern_year - year of `data` whose concerns are aggregated
            (defaults to 2016, the value that used to be hard-coded)
    Returns:
        updated data_segs
    """

    # add concern
    if config['concern'] != '':
        print('Adding concerns')
        concern_observed = data[data.year == concern_year].groupby(
            'segment_id')[config['concern']].max()
        # Default (inner) merge: segments with no rows in concern_year drop out.
        data_segs = data_segs.merge(concern_observed.reset_index(), on='segment_id')
    else:
        print('We didnt add concerns')

    # add in atrs if filepath present
    if config['atr'] != '':
        # os.path.join gives the same path as plain concatenation when
        # datadir ends with a separator, and a correct one when it does not.
        filePath = os.path.join(datadir, config['atr'])
        atrs = pd.read_csv(filePath, dtype={'id': 'str'})
        print('Reading in our Atrs from', filePath)
        print('Atrs looks like', atrs.head())
        # for some reason pandas reads the id as float before str conversions
        atrs['id'] = atrs.id.apply(lambda x: x.split('.')[0])
        data_segs = data_segs.merge(atrs[['id'] + config['atr_cols']],
                                    left_on='segment_id', right_on='id')
    else:
        print('We didnt add atrs')

    # add in tmcs if filepath present
    if config['tmc'] != '':
        filePath = os.path.join(datadir, config['tmc'])
        tmcs = pd.read_json(filePath, dtype={'near_id': str})[
            ['near_id'] + config['tmc_cols']]
        print('Reading in our tmcs from:', filePath)
        print('Tmcs looks like:', tmcs.head())
        print('Tmcs have the following headers:', list(tmcs))
        # Left merge keeps every segment; segments without a TMC row get 0.
        data_segs = data_segs.merge(
            tmcs, left_on='segment_id', right_on='near_id', how='left')
        data_segs[config['tmc_cols']] = data_segs[config['tmc_cols']].fillna(0)
    else:
        print('We didnt add tmcs')

    return data_segs

In [24]:
data_segs = add_extra_features(data, data_segs, config, PROCESSED_DATA_FP)

Adding concerns
Reading in our Atrs from C:\Users\Daniel\Documents\ML\Transurban\crash-model-master\data\boston\processed/atrs_predicted.csv
Atrs looks like   id                                           geometry            px  \
0  0  MULTILINESTRING ((-7918115.056242444 5195887.0... -7.918112e+06   
1  1  MULTILINESTRING ((-7918221.546464928 5195926.0... -7.918215e+06   
2  2  MULTILINESTRING ((-7918331.617485714 5195964.1... -7.918325e+06   
3  3  MULTILINESTRING ((-7918550.226762316 5196038.5... -7.918544e+06   
4  4  MULTILINESTRING ((-7918639.174632536 5196092.3... -7.918636e+06   

             py  heavy  light  bikes  speed  volume  heavy_coalesced  \
0  5.195907e+06    NaN    NaN    NaN    NaN     NaN       252.369816   
1  5.195945e+06    NaN    NaN    NaN    NaN     NaN       252.421894   
2  5.195983e+06    NaN    NaN    NaN    NaN     NaN       252.475778   
3  5.196057e+06    NaN    NaN    NaN    NaN     NaN       252.579594   
4  5.196091e+06    NaN    NaN    NaN    NaN 

In [25]:
data_segs.max()

segment_id             999
lanes                    6
hwy_type                23
osm_speed               55
signal                   6
oneway                   1
width_per_lane          47
AADT                209021
width                   76
SPEEDLIMIT              65
Struct_Cnd               4
Surface_Tp               8
F_F_Class                7
visionzero              35
id                     999
speed_coalesced         39
volume_coalesced     25796
Conflict             13420
dtype: object

In [28]:
data.max()

segment_id           999.0
year                2018.0
week                  52.0
crash                  5.0
width                 76.0
SPEEDLIMIT            65.0
Struct_Cnd             4.0
Surface_Tp             8.0
F_F_Class              7.0
lanes                  6.0
hwy_type              23.0
osm_speed             55.0
signal                 6.0
oneway                 1.0
width_per_lane        47.0
AADT              209021.0
visionzero            35.0
seeclickfix           43.0
dtype: float64

## We have now added all the features we might want. Time to process them

In [None]:
f = 'width'
data_segs.head()
b = pd.DataFrame.copy(data_segs)

a = pd.get_dummies([1,2,3,1,6,'hi'])
a.columns = ['weight' + str(c) for c in a.columns]
features_fake = list(np.copy(features))
a.columns.tolist()[1:]
features_fake
data_segs.segment_id.map(lambda x: x[:2] != '00').astype(int)

set(features) - set(f_cat + f_cont)

In [None]:
def process_features(features, config, f_cat, f_cont, data_segs):
    """
    One-hot encode categorical features and log-transform continuous ones.

    When config['process'] is true, every feature in f_cat is replaced by
    dummy columns named '<feature><value>', every feature in f_cont by a
    'log_<feature>' column holding log(x + 1), and an 'intersection' flag is
    added that is 1 unless the segment_id starts with '00'. The raw f_cat and
    f_cont names are then removed from both feature lists.

    Neither the `features` list nor the `data_segs` frame passed in is
    modified; updated copies are returned instead.

    Args:
        features - list of feature names to start from
        config - dict; only 'process' is read
        f_cat - list of categorical column names in data_segs
        f_cont - list of continuous column names in data_segs
        data_segs - DataFrame with a string 'segment_id' column
    Returns:
        (data_segs, features, lm_features) - the frame with the derived
        columns, and two independent lists holding the full-model and
        linear-model feature names (duplicates removed, order preserved)
    """
    # Two independent copies: `lm_features = features` would bind both names
    # to one list, so every column would be appended twice and the caller's
    # list would be changed as a side effect.
    features = list(features)
    lm_features = list(features)

    if config['process']:
        # Work on a copy so the log/intersection columns never land on the
        # caller's frame (that used to happen whenever f_cat was empty).
        data_segs = data_segs.copy()

        # Discretise all the categorical data and one-hot encode it. e.g. 'Width1', 'Width2'
        print(('Processing categorical: {}'.format(f_cat)))
        print('NOTE: Have commented lm_features += t.columns.tolist()[1:] out within process_features')
        for f in f_cat:
            # get_dummies builds one indicator column per distinct value of
            # the feature; each row has a single 1, in the column that
            # matches its value.
            t = pd.get_dummies(data_segs[f])
            t.columns = [f + str(c) for c in t.columns]
            data_segs = pd.concat([data_segs, t], axis=1)
            features += t.columns.tolist()
            # All dummy levels are kept for the linear model too (no
            # reference level is dropped), as the NOTE above says.
            lm_features += t.columns.tolist()

        # Take log(data + 1) for all the continuous data sources and append it
        print(('Processing continuous: {}'.format(f_cont)))
        for f in f_cont:
            log_name = 'log_%s' % f
            data_segs[log_name] = np.log(data_segs[f] + 1)
            features.append(log_name)
            lm_features.append(log_name)

        # add segment type
        # 'intersection' is 1 when the segment_id does not start with '00'
        data_segs['intersection'] = data_segs.segment_id.map(
            lambda x: x[:2] != '00').astype(int)
        features.append('intersection')
        lm_features.append('intersection')

        # Remove the raw columns and any duplicates. dict.fromkeys keeps
        # first-seen order, so the result is deterministic (a set is not).
        raw_columns = set(f_cat + f_cont)
        features = [name for name in dict.fromkeys(features)
                    if name not in raw_columns]
        lm_features = [name for name in dict.fromkeys(lm_features)
                       if name not in raw_columns]

    return data_segs, features, lm_features

In [None]:
data_segs, features, lm_features = process_features(features, config, f_cat, f_cont, data_segs)

In [None]:
features.sort()
lm_features.sort()
print('\n\n', len(features), '\n\n', features)
print('\n\n', len(lm_features), '\n\n', lm_features)

In [None]:
# Create lagged crash values if we are looking at weekly data and add to features.
if config['level'] == 'week':
    print('Creating lagged crash values')
    # `week` and `year` were never defined in this notebook. They are taken
    # from config['time_target'], which set_defaults fills as [15, 2017];
    # presumably that is [week, year] - confirm against format_crash_data.
    week, year = config['time_target']
    crash_lags = format_crash_data(data, 'crash', week, year)
    crash_cols = ['pre_week', 'pre_month', 'pre_quarter', 'avg_week']
    features += crash_cols
    data_model = crash_lags.merge(data_segs, left_on='segment_id', right_on='segment_id')

# If not, then get the maximum number of crashes on the road segment and add that.
# We then add a target depending on whether a crash has ever occurred on that road segment. 1 if yes, 0 if no.
else:
    print('Aggregating crashes by segment_id')
    any_crash = data.groupby('segment_id')['crash'].max()
    any_crash = (any_crash > 0).astype(int)
    any_crash.name = 'target'
    data_model = data_segs.set_index('segment_id').join(any_crash).reset_index()

In [None]:
print("full features:{}".format(features))
print('\n\n lm_features:', lm_features)
print('\n\n Data_model: \n\n', data_model.head())
print('\n\n config[level]:', config['level'])
print('\n\n Process_data_fp:', PROCESSED_DATA_FP)

In [None]:
data_segs.head()

## We move to Run & Init

In [None]:
# First we get all our parameters

# cv parameters
cvp = {
    'pmetric': 'roc_auc',
    'iter': 5,  # number of iterations
    'folds': 5,  # folds for cv (default)
    'shuffle': True,
}

# Search spaces, keyed by model class name
mp = {}

# LR parameters
mp['LogisticRegression'] = {
    'penalty': ['l1', 'l2'],
    # beta distribution for selecting reg strength
    'C': ss.beta(a=5, b=2),
    'class_weight': ['balanced'],
    # 'l1' is only supported by the liblinear and saga solvers, so the
    # solver is pinned; otherwise every 'l1' candidate fails to fit on
    # scikit-learn versions whose default solver is lbfgs.
    'solver': ['liblinear'],
}

# xgBoost model parameters
mp['XGBClassifier'] = {
    'max_depth': list(range(3, 7)),
    'min_child_weight': list(range(1, 5)),
    'learning_rate': ss.beta(a=2, b=15),
}

# cut-off for model performance
# generally, if the model isn't better than chance, it's not worth reporting
perf_cutoff = 0.5

In [None]:
# Next we are going to make a class to hold our data

class Indata():
    """
    Holds a modelling DataFrame, its target column name and a train/test split.

    Attributes set by __init__: data, target and (optionally) scoring.
    Attributes set by tr_te_split: train_x, train_y, test_x, test_y,
    is_split (1 once a split exists) and, for grouped splits, group_col.
    """
    scoring = None
    data = None
    train_x, train_y, test_x, test_y = None, None, None, None
    is_split = 0

    def __init__(self, data, target, scoring=None):
        """
        data : pandas DataFrame that contains the target column
        target : name of the target column
        scoring : optional boolean mask over `data`; rows where it is True
            are stored under self.scoring and excluded from self.data
        """
        # If scoring observations, store under scoring attribute
        if scoring is not None:
            print('During initialisation of Indata, scoring was given.')
            self.data = data[~(scoring)]
            self.scoring = data[scoring]
        else:
            print('No scoring provided, setting self.data = data')
            self.data = data
        self.target = target
        # Check to see that target has more than one value
        print('The target stored in self.data is:', self.data[self.target].head())
        assert self.data[self.target].nunique() > 1

    def tr_te_split(self, pct, datesort=None, group_col=None, seed=None):
        """
        Split into train/test
        pct : percent training observations
        datesort : specify date column for sorting values
            If this is not None, split will be non-random (i.e. split on sorted obs)
        group_col : group column name for groupkfold split
            Will also be passed to tuner
        seed : seed for numpy's global random state; any integer,
            including 0, is honoured

        Note that train_x / test_x keep every column, the target included.
        """
        # `if seed:` would silently skip seeding for seed=0.
        if seed is not None:
            np.random.seed(seed)
        if group_col:
            self.group_col = group_col
            grouper = GroupShuffleSplit(n_splits=1, train_size=pct)
            # n_splits=1, so the generator yields one (train, test) pair of
            # positional indexes.
            train_pos, _ = next(
                grouper.split(self.data, groups=self.data[group_col]))
            # translate the positions into a boolean mask over the index
            inds = self.data.index.isin(self.data.index[train_pos])
        elif datesort:
            # Sort into a new frame instead of in place, so the DataFrame
            # handed to __init__ is not reordered behind the caller's back.
            self.data = self.data.sort_values(datesort).reset_index(drop=True)
            # Generates [true, true, true, true, false, false]; the leading
            # trues index the training examples
            inds = np.arange(0.0, len(self.data)) / len(self.data) < pct
        else:
            inds = np.random.rand(len(self.data)) < pct

        self.train_x = self.data[inds]
        self.train_y = self.data[self.target][inds]
        print('Train obs:', len(self.train_x))

        self.test_x = self.data[~inds]
        self.test_y = self.data[self.target][~inds]
        print('Test obs:', len(self.test_x))

        self.is_split = 1

In [None]:
# Initialize data
df = Indata(data_model, 'target')

print('\n\n cvp:', cvp, '\n\n mp', mp, '\n\n perf_cutoff', perf_cutoff)
print('\n\n df', df)

# Create train/test split
df.tr_te_split(.7, seed=1)

print('\n\n df after split', df)

In [None]:
# Weight the target data inversely with frequency of class
# `a` holds the share of each target value (normalize=True makes them sum to 1).
a = data_model['target'].value_counts(normalize=True)
# Inverse of the share of rows whose target equals 1; a[1] looks the value up
# by label and raises KeyError if there are no positive rows.
w = 1 / a[1]
# Wrapped in a list so it sits in the XGBClassifier search space as a single
# fixed candidate.
mp['XGBClassifier']['scale_pos_weight'] = [w]

In [None]:
class Tuner():
    """
    Initiates with indata class, will tune series of models according to parameters.
    Outputs RandomizedGridCV results and parameterized model in dictionary

    Attributes:
        data, train_x, train_y - taken from the Indata object
        group_col - the Indata group column, when a grouped split was made
        best_models - dict: run name -> best params, CV score, unfitted
            model and feature list
        grid_results - DataFrame accumulating the CV results of every run
    """
    data = None
    train_x, train_y = None, None
    group_col = None

    def __init__(self, indata, best_models=None, grid_results=None):
        """
        indata : Indata object that has already been split
        best_models : optional dict of earlier results to extend
        grid_results : optional DataFrame of earlier CV results to extend
        Raises ValueError if indata has not been split.
        """
        print('Initiating Tuner instance')

        # Ensure data has already been split
        if indata.is_split == 0:
            raise ValueError('Data is not split, cannot be tested')

        # Check if grouped by some column
        if hasattr(indata, 'group_col'):
            print('Data has previously been grouped.')
            self.group_col = indata.group_col

        # Initialise data attributes
        self.data = indata.data
        self.train_x = indata.train_x
        self.train_y = indata.train_y
        # Values passed in are now stored; previously the attributes were
        # only created when the arguments were None.
        self.best_models = {} if best_models is None else best_models
        self.grid_results = pd.DataFrame() if grid_results is None else grid_results

    def make_grid(self, model, cvparams, mparams):
        """
        Build an unfitted RandomizedSearchCV.
        model : estimator class (it is instantiated here with no arguments)
        cvparams : dict with 'pmetric', 'folds', 'shuffle' and 'iter'
        mparams : parameter distributions for the search
        """
        sc = cvparams['pmetric']
        # Keywords rather than positions: recent scikit-learn releases only
        # accept `shuffle` as a keyword argument.
        cv = KFold(n_splits=cvparams['folds'], shuffle=cvparams['shuffle'])
        n_iter = cvparams['iter']

        print('Making RandomCVGrid with the following parameters:')
        print('\n Scoring:', sc, '\n cv:', cv, '\n n_iter', n_iter)
        # Makes CV grid
        # to implement, no capability for GroupKFold for randomizedsearch
        # if self.group_col:
        # cv = GroupKFold(cvparams['folds'])
        grid = RandomizedSearchCV(
                    model(),
                    scoring=sc,
                    cv=cv,
                    refit=False,
                    n_iter=n_iter,
                    param_distributions=mparams,
                    # run_grid reads 'mean_train_score', which is only
                    # present in cv_results_ when this is enabled
                    return_train_score=True,
                    verbose=1)
        return grid

    def run_grid(self, grid, train_x, train_y):
        """
        Fit the search and return (best, results).
        best : dict with 'bp' (best params) and the best CV score stored
            under the scoring name
        results : DataFrame of mean test/train score and params per candidate
        """
        print('Running the grid.')
        grid.fit(train_x, train_y)
        results = pd.DataFrame(grid.cv_results_)[['mean_test_score', 'mean_train_score', 'params']]
        best = {}
        best['bp'] = grid.best_params_
        best[grid.scoring] = grid.best_score_

        print('Attained results:', results)
        print('Got best as:', best)
        print('Best has type:', type(best))
        return best, results

    def tune(self, name, m_name, features, cvparams, mparams):
        """
        Tune one model and record the outcome.
        name : key the run is stored under in best_models
        m_name : estimator class name, looked up in sklearn.ensemble,
            sklearn.linear_model, xgboost and sklearn.svm, in that order
        features : columns of train_x to fit on
        cvparams, mparams : passed through to make_grid
        Raises ValueError if m_name is found in none of the modules.
        """
        print('hello')
        # Same lookup order and messages as the original if/elif chain.
        model = None
        for label, module in (('ske', ske), ('skl', skl), ('xgb', xgb), ('svm', svm)):
            if hasattr(module, m_name):
                model = getattr(module, m_name)
                print('{} has our m_name attribute with model:'.format(label), model)
                break
        if model is None:
            raise ValueError('Model name is invalid.')

        print('About to make search grid.')
        grid = self.make_grid(model, cvparams, mparams)

        print(self.train_x)
        print(self.train_x[features])

        best, results = self.run_grid(grid, self.train_x[features], self.train_y)

        print('Have finished run_grid, results prior to modification:', results)
        results['name'] = name
        results['m_name'] = m_name
        print('Results after modification:', results)

        print('self.grid_results before appending:', self.grid_results)
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        self.grid_results = pd.concat([self.grid_results, results])
        print('self.grid_results after appending:', self.grid_results)

        print('best before modification:', best)
        # Unfitted estimator configured with the winning parameters
        best['model'] = model(**best['bp'])
        best['features'] = list(features)
        print('best after modification:', best)

        print('self.best_models before modification:', self.best_models)
        self.best_models.update({name: best})
        print('self.best_models after modification:', self.best_models)

In [None]:
# Initialize tuner
tune = Tuner(df)
print('Tuner', tune)

try:
    # Base XG model
    tune.tune(name = 'XG_base', m_name = 'XGBClassifier', features = features, cvparams = cvp, mparams = mp['XGBClassifier'])
    # Base LR model
    tune.tune(name = 'LR_base', m_name = 'LogisticRegression', features = lm_features, 
              cvparams = cvp, mparams = mp['LogisticRegression'])

except ValueError:
    print('CV fails, likely very few of target available, try rerunning at segment-level')
    raise

In [None]:
class Tester():
    """
    Initiates with indata class, receives parameterized sklearn models, prints and stores results

    Attributes:
        data - the (already split) Indata object
        rundict - dict: run name -> model, features, fitted model and metrics

    NOTE(review): the plotting methods use `sns` and `plt`, which are not
    imported in the visible part of this file - confirm that seaborn and
    matplotlib.pyplot are imported before calling them.
    """

    def __init__(self, data, rundict=None):
        """
        data : Indata object that has already been split
        rundict : optional dict of earlier runs to extend
        Raises ValueError if data has not been split.
        """
        print('Initiating Tester')
        if data.is_split == 0:
            raise ValueError('Data is not split, cannot be tested')
        self.data = data
        # A rundict that is passed in is now stored; previously the attribute
        # was only created when the argument was None.
        self.rundict = {} if rundict is None else rundict

    def init_tuned(self, tuned):
        """ pass Tuner object, populate with names, models, features """
        if not tuned.best_models:
            raise ValueError('No tuned models found')
        self.rundict.update(tuned.best_models)

    def predsprobs(self, model, test_x):
        """ Produce predicted class and probabilities """
        # if the model doesn't have predict proba, will be treated as GLM
        if hasattr(model, 'predict_proba'):
            print('Model has a predict_proba method, so we shall use that for predicting probabilities.')
            preds = model.predict(test_x)
            probs = model.predict_proba(test_x)[:, 1]
        else:
            print('Model does not have a predict_proba method, so we take probs >= 0.5 as a 1 prediction.')
            probs = model.predict(test_x)
            preds = (probs >= .5).astype(int)
        return preds, probs

    def get_metrics(self, preds, probs, test_y):
        """ Produce metrics (f1 score, AUC, brier) """
        # if test is not binary, just run brier
        if len(np.unique(test_y)) == 2:
            print('Only two unique y values, indicated test is binary. Returning f1_s, roc and brier [within get_metrics]')
            f1_s = metrics.f1_score(test_y, preds)
            roc = metrics.roc_auc_score(test_y, probs)
        else:
            print('Test is not binary. Only returning brier [within get_metrics]')
            f1_s, roc = None, None
        brier = metrics.brier_score_loss(test_y, probs)
        return f1_s, roc, brier

    def make_result(self, model, test_x, test_y):
        """ gets predictions and runs metrics """
        print('Within make result')
        preds, probs = self.predsprobs(model, test_x)
        print('Predictions:', preds)
        print('Probabilities:', probs)
        f1_s, roc, brier = self.get_metrics(preds, probs, test_y)
        print("f1_score: ", f1_s)
        print("roc auc: ", roc)
        print("brier_score: ", brier)
        return {'f1_s': f1_s, 'roc': roc, 'brier': brier}

    def run_model(self, name, model, features, cal=True, cal_m='sigmoid'):
        """
        Run a specific model (not from Tuner classs)
        By default, calibrates predictions and produces metrics for them
        Will also store in rundict object

        name : key the results are stored under in rundict
        model : unfitted estimator; it is fitted in place
        features : columns of the train/test frames to use
        cal : when True, half of the training rows are held back and used
            to fit a calibrated classifier
        cal_m : calibration method passed to CalibratedClassifierCV
        """

        results = {}
        results['features'] = list(features)
        results['model'] = model
        print("Fitting {} model with {} features".format(name, len(features)))
        if cal:
            # Need disjoint calibration/training datasets
            # Split 50/50
            rnd_ind = np.random.rand(len(self.data.train_x)) < .5
            train_x = self.data.train_x[features][rnd_ind]
            train_y = self.data.train_y[rnd_ind]
            cal_x = self.data.train_x[features][~rnd_ind]
            cal_y = self.data.train_y[~rnd_ind]
        else:
            train_x = self.data.train_x[features]
            train_y = self.data.train_y

        m_fit = model.fit(train_x, train_y)
        result = self.make_result(
            m_fit,
            self.data.test_x[features],
            self.data.test_y)

        results['raw'] = result
        results['m_fit'] = m_fit
        if cal:
            # Imported here because the file never imports it at the top, so
            # the name was undefined whenever cal=True.
            from sklearn.calibration import CalibratedClassifierCV
            print("calibrated:")
            # NOTE(review): with the default cv this presumably re-fits the
            # estimator on the calibration half rather than calibrating
            # m_fit - confirm whether a prefit calibration was intended.
            m_c = CalibratedClassifierCV(model, method=cal_m)
            m_fit_c = m_c.fit(cal_x, cal_y)
            result_c = self.make_result(m_fit_c, self.data.test_x[features], self.data.test_y)
            results['calibrated'] = result_c
            print("\n")
        if name in self.rundict:
            self.rundict[name].update(results)
        else:
            self.rundict.update({name: results})

    def run_tuned(self, name, cal=True, cal_m='sigmoid'):
        """ Wrapper for run_model when using Tuner object """
        print('Within run_tunes. Feeding variables into run model:')
        print('\nname:', name)
        print('\nself.rundict[name][model]:', self.rundict[name]['model'])
        print('\nself.rundict[name][features]:', self.rundict[name]['features'])
        print('\ncal', cal)
        print('\ncal_m', cal_m)
        self.run_model(name, self.rundict[name]['model'], self.rundict[name]['features'], cal, cal_m)

    def lift_chart(self, x_col, y_col, data, ax=None, pct=True):
        """
        create lift chart
        x_col = pctiles of predictions
        y_col = % positive class
        ax = axes to draw on (a new plot is used when None)
        pct = format both axes as percentages
        """
        # The caller's axes are now passed through; `ax=None` was hard-coded,
        # so the chart never appeared on the requested subplot.
        p = sns.barplot(x=x_col, y=y_col, data=data,
                        palette='Greens', ax=ax, ci=None)
        vals = p.get_yticks()
        # Keep the upper bound of each interval label, e.g. '(0.1, 0.2]' -> '0.2'
        xvals = [x.get_text().split(',')[-1].strip(']') for x in p.get_xticklabels()]
        if pct:
            p.set_yticklabels(['{:3.0f}%'.format(i*100) for i in vals])
            xvals = ['{:2.1f}%'.format(float(x)*100) for x in xvals]
        p.set_xticklabels(xvals, rotation=30)
        p.set_facecolor('white')
        p.set_xlabel('')
        p.set_ylabel('')
        p.set_title('Predicted probability vs actual percent')
        return p

    def density(self, data, score_col, ax=None):
        """ create kdeplot of predictions """
        p = sns.kdeplot(data[score_col], ax=ax)
        p.set_facecolor('white')
        p.legend('')
        p.set_xlabel('Predicted probability')
        p.set_title('KDE plot predictions')
        return p

    def density_and_lift_charts(self, model, features=None, model_params=None, verbose=True, qcut=10):
        """
        produces prediction density and decile lift chart
        currently only works for binary targets (0/1)
        model (str or object with predict) : name in rundict (if used), otherwise model
        features (list) : list of features, if not available in rundict
        model_params : can just pass model params (from rundict)
        verbose : True if you want the prediction deciles to be output
        qcut : can specify percentile cut (default = decile)
        """
        # Fall back to the stored run when only a rundict name was given.
        if not model_params and model in self.rundict:
            model_params = self.rundict[model]

        if model_params:
            # Covers an explicitly passed model_params as well; previously
            # that case left `probs` undefined.
            _, probs = self.predsprobs(
                model_params['m_fit'],
                self.data.test_x[model_params['features']])
        else:
            _, probs = self.predsprobs(model, self.data.test_x[features])

        risk_df = pd.DataFrame(
            {'probs': probs, 'target': self.data.test_y})
        risk_df['categories'] = pd.qcut(risk_df['probs'], qcut)
        risk_mean = risk_df.groupby('categories')['target'].mean().reset_index()
        if verbose:
            print(risk_df.probs.describe())
            print(risk_mean)
        _, axes = plt.subplots(1, 2)
        self.lift_chart('categories', 'target', risk_df,
                        ax=axes[1])
        self.density(risk_df, 'probs', ax=axes[0])
        plt.show()

    def to_csv(self):
        """ outputs rundict to csv """
        if self.rundict == {}:
            raise ValueError('No results found')
        # Nanosecond timestamp keeps successive result files from colliding
        now = pd.to_datetime('today').value
        # Make dataframe, transpose so each row = model
        pd.DataFrame(self.rundict).T.to_csv('results_{}.csv'.format(now))


In [None]:
# Run test
test = Tester(df)
test.init_tuned(tune)
test.run_tuned('LR_base', cal=False)
test.run_tuned('XG_base', cal=False)

In [None]:
print(test.rundict)

In [None]:
# choose best performing model. Note we have already tuned hyperparameters of the models, so we are purely choosing the best model type.
print('Within train_model. Have instantiated tuner object and completed tuning. Will now iterate over test.rundict to check for best performing model. Test.rundict has len:', len(test.rundict), 'and looks like:', test.rundict)
best_perf = 0
best_model = None
# Initialised with the other two so that the name exists even when no run
# scores above 0; it used to be bound only inside the loop.
best_model_features = None
for run in test.rundict.values():
    # 'roc_auc' is the score key each run is expected to carry.
    # NOTE(review): presumably this is the cross-validated score stored by
    # Tuner under its scoring name - confirm it is the intended metric.
    if run['roc_auc'] > best_perf:
        best_perf = run['roc_auc']
        best_model = run['model']
        best_model_features = run['features']


In [None]:
# check for performance above certain level
# This only reports; a model at or below the cutoff is still trained below.
if best_perf <= perf_cutoff:
    print(('Model performs below AUC %s, may not be usable' % perf_cutoff))
else:
    print('Best performance is {}, better than the cutoff at {}'.format(best_perf, perf_cutoff))

print('Best performance was', best_perf, '\n Best model was', best_model, '\nBest model features were', best_model_features)

# train on full data [including our test data]
# best_model is still None here if no run beat the initial best_perf of 0,
# in which case this line fails with AttributeError.
trained_model = best_model.fit(data_model[best_model_features], data_model['target'])

## Having chosen our model, we can now carry out predictions and output feature importance

In [None]:
def predict(trained_model, data_model, best_model_features,
            features, perf_cutoff, config_level, datadir):
    """
    Generates predictions with the trained model and writes them to
    seg_with_predicted.csv and seg_with_predicted.json inside datadir.

    Args:
        trained_model - fitted model; the segment branch calls its
            predict_proba method
        data_model - dataframe holding the model inputs (the week branch
            merges it on 'segment_id')
        best_model_features - columns the model was fitted on; these are the
            columns handed to the model when predicting
        features - kept for backward compatibility, not used for prediction
        perf_cutoff - forwarded to predict_forward in the week branch
        config_level - either week or segment
        datadir - directory the two output files are written to
    Returns
        nothing, writes prediction segments to file

    NOTE(review): the week branch reads the module-level names `data`,
    `data_segs`, `config` and `predict_forward`; they must be defined in the
    notebook before this function is called with config_level == 'week'.
    """
    csv_path = os.path.join(datadir, 'seg_with_predicted.csv')
    json_path = os.path.join(datadir, 'seg_with_predicted.json')

    if config_level == 'week':
        # Predict back number of weeks according to config
        all_weeks = data[['year', 'week']].drop_duplicates().sort_values(['year', 'week']).values
        back_weeks = all_weeks[-config['weeks_back']:]
        # one row per predicted week, one column per segment
        pred_weeks = np.zeros([back_weeks.shape[0], data_segs.shape[0]])
        for i, yw in enumerate(back_weeks):
            # yw is a (year, week) pair; predict_forward is passed week first
            preds = predict_forward(trained_model, best_model_features, perf_cutoff,
                                    yw[1], yw[0], data_segs, data)
            pred_weeks[i] = preds

        # create dataframe with segment-year-week index
        df_pred = pd.DataFrame(pred_weeks.T,
                               index=data_segs.segment_id.values,
                               columns=pd.MultiIndex.from_tuples([tuple(w) for w in back_weeks]))
        # has year-week column index, need to stack for year-week index
        df_pred = df_pred.stack(level=[0, 1])
        df_pred = df_pred.reset_index()
        df_pred.columns = ['segment_id', 'year', 'week', 'prediction']
        df_pred.to_csv(csv_path, index=False)
        data_plus_pred = df_pred.merge(data_model, on=['segment_id'])
        data_plus_pred.to_json(json_path, orient='index')
    else:
        # The model must be fed the same columns it was fitted on, i.e.
        # best_model_features. Using the generic `features` list breaks (or
        # silently mis-predicts) whenever the winning model was trained on a
        # different feature set.
        # Column 1 of predict_proba is kept - presumably the positive class.
        preds = trained_model.predict_proba(data_model[best_model_features])[:, 1]
        # deep copy so the caller's dataframe does not gain a new column
        df_pred = data_model.copy(deep=True)
        df_pred['prediction'] = preds
        df_pred.to_csv(csv_path, index=False)
        df_pred.to_json(json_path, orient='index')

In [None]:
# Select the non-week branch of predict(): one prediction per row of data_model
config_level = 'segment'
predict(
    trained_model=trained_model,
    data_model=data_model,
    best_model_features=best_model_features,
    features=features,
    perf_cutoff=perf_cutoff,
    config_level=config_level,
    datadir=datadir,
)

In [None]:
def output_importance(trained_model, features, datadir):
    """
    Writes a {feature name: weight} mapping for the trained model to
    feature_importances.json inside datadir.

    The weights are taken from the first attribute found on the model, in
    this order: feature_importances_, coef_, coefficients.

    Args:
        trained_model - fitted model
        features - feature names, in the order the model was fitted on
        datadir - directory the json file is written to
    Returns
        a message string when the model exposes none of the attributes
        above (nothing is written in that case), otherwise nothing
    """
    if hasattr(trained_model, 'feature_importances_'):
        print('Have the feature_importances_ attribute. Zipping this and using as weights')
        weights = trained_model.feature_importances_

    elif hasattr(trained_model, 'coef_'):
        # scikit-learn linear models expose coef_ rather than `coefficients`.
        # For a binary classifier it is a single row of shape (1, n_features),
        # so the first row is taken to line up with the feature names.
        print('Do not have feature_importances_. Instead using coef_ attribute. Zipping this and using as weights.')
        weights = np.atleast_2d(trained_model.coef_)[0]

    elif hasattr(trained_model, 'coefficients'):
        # kept for models that expose their weights under this name
        print('Do not have feature_importances_. Instead using coefficients attribute. Ziping this and using as weights.')
        weights = trained_model.coefficients

    else:
        return("No feature importances/coefficients detected")

    # plain python floats keep the mapping json-serialisable
    feature_imp_dict = dict(zip(features, np.asarray(weights, dtype=float).tolist()))

    path = os.path.join(datadir, 'feature_importances.json')
    with open(path, 'w') as f:
        print('Returning results at', path)
        json.dump(feature_imp_dict, f)

In [None]:
# trained_model was fitted on data_model[best_model_features], so its
# importances/coefficients line up with that list rather than with the
# generic `features` list.
output_importance(trained_model, best_model_features, datadir)
