# Imports

In [50]:
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: 11x7 figures rendered at 120 dpi.
plt.rcParams['figure.figsize'] = (11, 7)
plt.rcParams['figure.dpi'] = 120

# Relative directories; datapath is used by the load/save cells.
# NOTE(review): image_dest is not referenced in the cells visible here.
datapath = '../Data/'
image_dest = 'results_images/'

# Wall-clock reference read by the final cell to report elapsed time.
startTime = time.time()

# Load Data

In [51]:
# Load the modelling table from `datapath`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by a trusted step of this project.
filename = 'log_ready.pickle'
with open(datapath + filename, 'rb') as infile:  # handle is closed even if unpickling raises
    df = pickle.load(infile)

# Show the available columns as a plain list.
df.columns.to_list()

['PAST_DUE',
 'TOTAL_60_DAYS_AMT',
 'NUM_PREM_FOR_PER',
 'BREAK_ARRANGEMENT',
 'MULTI_DWELL_SIZE',
 'SNAP_GEO',
 'NUM_PER_FOR_PREM',
 'HAS_COTENANT',
 'SPA_PER_ID',
 'CMIS_MATCH']

# Model Params

In [52]:
import statsmodels.api as sm
from custom_methods import model_eval
#from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import RandomOverSampler

# Modelling configuration passed to model_eval.k_folds in the K-Folds cell.
id_col, event_col = 'SPA_PER_ID', 'CMIS_MATCH'  # identifier column / outcome column
model = 'logit'                                  # model key forwarded to k_folds

# Both preprocessing hooks are left unset for this run.
scaler = sampler = None
# Optional alternatives (require the commented-out imports in this cell):
#scaler = StandardScaler()
#sampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# K-Folds

In [53]:
# Run the project's 4-fold evaluation helper with the configuration above.
# NOTE(review): presumably returns one prediction per input row — confirm in
# custom_methods.model_eval; the displayed output has id, prediction and event columns.
predictions = model_eval.k_folds(
    df=df, event_col=event_col, id_col=id_col,
    k=4,
    model=model, scaler=scaler, sampler=sampler,
)

# Preview the first rows of the result.
predictions.head()

Unnamed: 0,SPA_PER_ID,prediction,CMIS_MATCH
0,211062,0.003102,False
1,223268,0.004566,False
2,223268,0.005347,False
3,223268,0.005773,False
4,303740,0.007969,False


Take the maximum risk prediction for each person, so that each `SPA_PER_ID` appears in exactly one row.

In [54]:
# Collapse row-level scores to one row per person.
# Column names come from the config (id_col / event_col) instead of being
# hard-coded, so this cell stays in sync with the Model Params cell.

# Highest predicted risk observed for each person.
max_risk = predictions.groupby(id_col)['prediction'].max()

# A person counts as an event if any of their rows in df is flagged.
ever_event = df.groupby(id_col)[event_col].any()

# Inner join on the person id index, then restore the id as a regular column.
predictions = pd.concat([max_risk, ever_event], axis=1, join='inner').reset_index()
predictions.head()

Unnamed: 0,SPA_PER_ID,prediction,CMIS_MATCH
0,3,0.00462,False
1,4,0.002062,False
2,6,0.004978,False
3,12,0.002964,False
4,14,0.004142,False


In [55]:
# Count missing values per column of the per-person table.
predictions.isna().sum()

SPA_PER_ID    0
prediction    0
CMIS_MATCH    0
dtype: int64

In [58]:
# Total number of people, then the number whose CMIS_MATCH flag is True.
n_people = len(predictions)
n_matched = len(predictions.loc[predictions.CMIS_MATCH])
print(n_people)
print(n_matched)

84345
302


# Save and Time

In [56]:
# Persist the per-person prediction table under `datapath`.
filename = 'log_predictions.pickle'
with open(datapath + filename, 'wb') as outfile:  # file is flushed and closed even if dump raises
    pickle.dump(predictions, outfile)

In [57]:
from custom_methods.calc_time import calc_time_from_sec

# Seconds elapsed since startTime was set in the imports cell, handed to the project's formatter.
elapsed_sec = time.time() - startTime
calc_time_from_sec(elapsed_sec)

hours:minutes:seconds = 0:0:5.609459161758423
