# Imports

In [1]:
import pandas as pd
import pickle
import time

# Directory (relative to this notebook) that holds the input and output pickles.
datapath = '../Data/'

# Wall-clock start; read again in the final cell to report the total run time.
startTime = time.time()

# Load Data

In [2]:
# Load the prepared modelling table.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
filename = 'log_ready.pickle'
# The context manager closes the file even if unpickling raises.
with open(datapath + filename, 'rb') as infile:
    df = pickle.load(infile)

df.columns.to_list()

['PAST_DUE',
 'TOTAL_60_DAYS_AMT',
 'NUM_PREM_FOR_PER',
 'BREAK_ARRANGEMENT',
 'MULTI_DWELL_SIZE',
 'SNAP_GEO',
 'NUM_PER_FOR_PREM',
 'HAS_COTENANT',
 'PER-PREM-MONTH_ID',
 'CMIS_MATCH']

# K-Folds

## Helper Functions

In [3]:
import numpy as np

def get_folds_ids(id_col, k, random_state=None):
    '''
    04/24/21
    Randomly partition the unique values of 'id_col' into folds.

    The unique ids are shuffled once and cut into min(k, number of unique ids)
    contiguous chunks whose sizes differ by at most one, so exactly k folds
    are produced whenever there are at least k unique ids.

    Parameters:
        id_col (pandas Series): ids to partition; duplicates are collapsed.
        k (int): requested number of folds, must be >= 1.
        random_state: forwarded to pandas' Series.sample; None (the default)
            keeps the shuffle unseeded.

    Returns:
        numpy object array with one entry per fold, each holding that fold's
        ids. Indexing with folds[i] yields the ids of fold i. The array is
        empty when 'id_col' has no values.

    Raises:
        ValueError: if k is smaller than 1.
    '''
    if k < 1:
        raise ValueError("k must be a positive integer.")

    unique_ids = pd.Series(id_col.unique().copy(), dtype='object')
    if unique_ids.empty:
        return np.array([], dtype='object')

    # One shuffle up front replaces repeated sample-and-drop passes.
    shuffled = unique_ids.sample(frac=1, random_state=random_state).to_list()

    # Never create more folds than there are ids (that would leave empty folds).
    num_folds = min(int(k), len(shuffled))
    base_size, remainder = divmod(len(shuffled), num_folds)

    folds = []
    start = 0
    for fold_number in range(num_folds):
        # The first 'remainder' folds take one extra id to absorb the leftover.
        stop = start + base_size + (1 if fold_number < remainder else 0)
        folds.append(shuffled[start:stop])
        start = stop
    return np.array(folds, dtype='object')


def log_k_folds(df: pd.DataFrame, event_col: str, id_col: str, k: int, model: object, sampler: object=None, scaler: object=None, inverted:bool=False) -> tuple:
    '''
    10/26/21
    Fit and evaluate a model with k-fold cross-validation, where the folds are
    chosen from the unique values of 'id_col' (all rows sharing an id land in
    the same fold).

    Supported use of the following models:
        statsmodels.discrete.discrete_model.Logit
            set model='logit'

    Parameters:
        df: table holding 'id_col', 'event_col' and the feature columns. Every
            column other than those two is used as a feature; no constant
            column is added here.
        event_col: name of the outcome column (cast to bool for fitting).
        id_col: name of the id column used to build the folds.
        k: requested number of folds.
        model: model selector; only the string 'logit' is supported.
        sampler: optional resampler exposing fit_resample (or the older
            fit_sample); a fresh copy is fitted on each fold's training rows.
        scaler: optional transformer exposing fit_transform / transform; a
            fresh copy is fitted on each fold's training features and then
            applied to that fold's test features.
        inverted: if True, train on the single fold and test on all the
            remaining rows instead of the other way round.

    Returns:
        predictions (pandas DataFrame), models (list of fitted statsmodels
        Logit results, one per fold that was actually created)

    Raises:
        ValueError: if 'model' is not a supported model.
    '''
    import statsmodels.api as sm
    import copy

    # Fail before any work is done instead of crashing later on missing predictions.
    if model != "logit":
        raise ValueError("Model not supported.")

    feature_cols = df.columns.drop([id_col, event_col]).to_list()

    # Create Folds
    folds = get_folds_ids(id_col=df[id_col], k=k)

    models = []
    fold_frames = []
    # Loop through k-folds
    for fold_ids in folds:
        in_fold = df[id_col].isin(fold_ids)
        # Find test, training sets for fold
        if inverted:
            # Training set is smaller than test set
            df_test = df[~in_fold]
            df_train = df[in_fold]
        else:
            df_test = df[in_fold]
            df_train = df[~in_fold]

        # Scale
        if scaler is not None:
            fold_scaler = copy.deepcopy(scaler)
            # Work on copies so the caller's frame is never modified, and assign
            # by column name: a bare array carries no labels to align on.
            df_train = df_train.copy()
            df_test = df_test.copy()
            df_train[feature_cols] = np.asarray(fold_scaler.fit_transform(df_train[feature_cols]))
            df_test[feature_cols] = np.asarray(fold_scaler.transform(df_test[feature_cols]))

        # Undersample, oversample, etc.
        if sampler is not None:
            fold_sampler = copy.deepcopy(sampler)
            # Newer imbalanced-learn only offers fit_resample; older samplers
            # may only offer fit_sample.
            resample = getattr(fold_sampler, 'fit_resample', None)
            if resample is None:
                resample = fold_sampler.fit_sample
            X_res, y_res = resample(X=df_train.drop(event_col, axis=1), y=df_train[event_col])
            df_train = pd.concat([X_res, y_res], axis=1)

        # Get predictions
        exog = np.asarray(df_train[feature_cols], dtype='float')
        endog = np.asarray(df_train[event_col], dtype='bool')
        model_local = sm.Logit(endog=endog, exog=exog).fit(disp=False)
        models.append(model_local)
        exog_test = np.asarray(df_test[feature_cols], dtype='float')

        # Collect this fold's ids, predictions and actuals; all folds are
        # concatenated once after the loop.
        fold_frames.append(pd.DataFrame({
            id_col: df_test[id_col].to_numpy(),
            'prediction': np.asarray(model_local.predict(exog=exog_test)),
            event_col: df_test[event_col].to_numpy(),
        }))

    if fold_frames:
        id_prediction_actual = pd.concat(fold_frames, ignore_index=True)
    else:
        id_prediction_actual = pd.DataFrame(columns=[id_col, 'prediction', event_col])
    id_prediction_actual[event_col] = id_prediction_actual[event_col].astype('bool')
    return id_prediction_actual, models

## Run

In [4]:
#from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import RandomOverSampler

# Model Parameters
event_col = 'CMIS_MATCH'
id_col = 'PER-PREM-MONTH_ID'
model = 'logit'
scaler = None
sampler = None
k = 4
seed = 42
#scaler = StandardScaler()
#sampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Fold membership is drawn with pandas' sample(), which falls back to NumPy's
# global random state when no random_state is given. Seeding it here makes
# "Restart & Run All" reproduce the same folds and predictions.
np.random.seed(seed)

predictions, models = log_k_folds(
    df = df, 
    event_col = event_col,
    id_col = id_col, 
    k = k, 
    model = model, 
    scaler = scaler,
    sampler = sampler,
)

## Print Predictions Head

In [5]:
# Preview the first rows of the combined per-fold test-set predictions.
predictions.head()

Unnamed: 0,PER-PREM-MONTH_ID,prediction,CMIS_MATCH
0,3-98612-1,0.001765,False
1,3-98612-4,0.001765,False
2,3-98612-6,0.001765,False
3,3-98612-10,0.001765,False
4,3-98612-13,0.001765,False


## Print Sample Model Fitting

In [6]:
# Show the fit summary of the model trained for the first fold.
models[0].summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,2423288.0
Model:,Logit,Df Residuals:,2423280.0
Method:,MLE,Df Model:,7.0
Date:,"Mon, 01 Nov 2021",Pseudo R-squ.:,0.06058
Time:,13:35:46,Log-Likelihood:,-41116.0
converged:,True,LL-Null:,-43768.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,0.2668,0.005,57.958,0.000,0.258,0.276
x2,0.0018,6.5e-05,27.494,0.000,0.002,0.002
x3,-6.1689,0.052,-118.181,0.000,-6.271,-6.067
x4,0.4878,0.050,9.751,0.000,0.390,0.586
x5,-2.591e-07,6.96e-07,-0.372,0.710,-1.62e-06,1.1e-06
x6,0.3965,0.078,5.073,0.000,0.243,0.550
x7,-0.1555,0.027,-5.662,0.000,-0.209,-0.102
x8,-0.3043,0.029,-10.500,0.000,-0.361,-0.248


# Save and Time

In [7]:
# Save predictions
# The context manager closes the file even if pickling raises.
filename = 'log_output.pickle'
with open(datapath + filename, 'wb') as outfile:
    pickle.dump(predictions, outfile)

# Save models
filename = 'log_models.pickle'
with open(datapath + filename, 'wb') as outfile:
    pickle.dump(models, outfile)

In [8]:
from custom_methods.calc_time import calc_time_from_sec

# Report the seconds elapsed since startTime was recorded in the first cell.
# NOTE(review): calc_time_from_sec is a project helper; it presumably formats
# the value as hours:minutes:seconds - confirm against its source.
calc_time_from_sec(time.time() - startTime)

hours:minutes:seconds = 0:1:18.716119050979614
