# Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
from matplotlib import pyplot as plt

# Default size and resolution for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (11, 7)
plt.rcParams['figure.dpi'] = 120

# Relative directories: pickled data lives under `datapath`; `image_dest` names
# the folder for result images.
datapath = '../Data/'
image_dest = 'results_images/'

# Wall-clock start; the last cell subtracts this from the current time.
startTime = time.time()

# Load Data

In [2]:
filename = 'log_output.pickle'
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project's own pipeline.
# The context manager guarantees the handle is closed even if loading fails.
with open(datapath + filename, 'rb') as infile:
    df = pickle.load(infile)

df.columns.to_list()

['PER-PREM-MONTH_ID', 'prediction', 'CMIS_MATCH']

## Extract IDs

In [3]:
# The composite key is three '-'-separated fields; split it into one column per
# field, then discard the original string column.
id_columns = ['SPA_PER_ID', 'SPA_PREM_ID', 'MONTH']
df[id_columns] = df['PER-PREM-MONTH_ID'].str.split(pat='-', expand=True)

# The split yields strings; convert all three ID columns to integers in one pass.
df = df.drop(columns='PER-PREM-MONTH_ID').astype({name: 'int' for name in id_columns})

df.head()

Unnamed: 0,prediction,CMIS_MATCH,SPA_PER_ID,SPA_PREM_ID,MONTH
0,0.001765,False,3,98612,1
1,0.001765,False,3,98612,4
2,0.001765,False,3,98612,6
3,0.001765,False,3,98612,10
4,0.001765,False,3,98612,13


# Take maximum risk prediction for each person

## Take Max Likelihood For Each Person

In [5]:
# One row per person: the highest prediction across all of that person's rows,
# and whether any of those rows has CMIS_MATCH set.
per_person = df.groupby('SPA_PER_ID')
predictions = (
    per_person
    .agg(prediction=('prediction', 'max'), CMIS_MATCH=('CMIS_MATCH', 'any'))
    .reset_index()
)
predictions.head()

84345
84345


Unnamed: 0,SPA_PER_ID,prediction,CMIS_MATCH
0,3,0.002089,False
1,4,0.001236,False
2,6,0.002334,False
3,12,0.001311,False
4,14,0.001866,False


## Find Most Indicative Time to Analyze Risk
Note: A person's maximum predicted risk can occur in more than one month, so several rows for the same person may be flagged as the maximum.

In [5]:
# One row per person holding that person's maximum prediction, flagged True.
# `.assign` builds a new frame instead of writing a column into a slice of
# `predictions`, which would raise SettingWithCopyWarning.
matches = predictions[['SPA_PER_ID', 'prediction']].assign(max_likelihood=True)

# Rows whose prediction equals the person's maximum receive True; every other
# row is left as NaN by the left merge.
# Any flag from a previous execution is dropped first so that re-running this
# cell does not produce `max_likelihood_x` / `max_likelihood_y` columns.
df = df.drop(columns='max_likelihood', errors='ignore').merge(
    matches, how='left', on=['SPA_PER_ID', 'prediction'])
df.head()

Unnamed: 0,prediction,CMIS_MATCH,SPA_PER_ID,SPA_PREM_ID,MONTH,max_likelihood
0,0.001765,False,3,98612,1,
1,0.001765,False,3,98612,4,
2,0.001765,False,3,98612,6,
3,0.001765,False,3,98612,10,
4,0.001765,False,3,98612,13,


### Create Normalized Backward Time Column

In [6]:
def backward_time_from_last(df: pd.DataFrame) -> pd.DataFrame:
    """Add a BACKWARD_MONTH column counting months relative to each person's last row.

    For every row, BACKWARD_MONTH is ``MONTH`` minus the largest ``MONTH`` seen
    for the same ``SPA_PER_ID``. The person's latest row therefore gets 0 and
    all earlier rows get negative integers.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``SPA_PER_ID`` and ``MONTH``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the extra integer column ``BACKWARD_MONTH``.
        The input frame is not modified.
    """
    new_df = df.copy()
    # Broadcast each person's maximum month onto that person's rows. This is
    # vectorized, replacing a per-row Python lookup in a dict.
    last_month = new_df.groupby('SPA_PER_ID')['MONTH'].transform('max')
    new_df['BACKWARD_MONTH'] = (new_df['MONTH'] - last_month).astype('int')
    return new_df

In [7]:
df = backward_time_from_last(df)

df.head()

Unnamed: 0,prediction,CMIS_MATCH,SPA_PER_ID,SPA_PREM_ID,MONTH,max_likelihood,BACKWARD_MONTH
0,0.001765,False,3,98612,1,,-57
1,0.001765,False,3,98612,4,,-54
2,0.001765,False,3,98612,6,,-52
3,0.001765,False,3,98612,10,,-48
4,0.001765,False,3,98612,13,,-45


# Alternate: Weighted Average

In [8]:
def weighted_avg(df: pd.DataFrame, weighting: str, center: int, values: str = 'prediction',
                 backward_month: str = 'BACKWARD_MONTH') -> float:
    """Average ``df[values]`` with weights that decay with distance from ``center``.

    The distance of a row is ``abs(df[backward_month] - center)``.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to average (the notebook passes one group per person).
    weighting : str
        ``'linear'`` uses ``1 / distance``; ``'exponential'`` uses
        ``1 / exp(distance)``.
    center : int
        Value of ``backward_month`` that receives the largest weight.
    values : str
        Name of the column being averaged.
    backward_month : str
        Name of the column the distance is measured on.

    Returns
    -------
    float
        ``sum(values * weights) / sum(weights)``.

    Raises
    ------
    ValueError
        If ``weighting`` is neither ``'linear'`` nor ``'exponential'``.
    """
    distance = (df[backward_month] - center).abs()

    if weighting == 'linear':
        weights = 1 / distance
    elif weighting == 'exponential':
        weights = 1 / np.exp(distance)
    else:
        raise ValueError(
            f"weighting must be 'linear' or 'exponential', got {weighting!r}")

    # A row exactly at `center` has distance 0, so the linear weight is infinite;
    # cap it at 1.5 so that row dominates without swamping the average.
    weights = weights.replace(to_replace=np.inf, value=1.5)

    return (df[values] * weights).sum() / weights.sum()

In [9]:
# BACKWARD_MONTH value that receives the largest weight in both averages.
center = -40

per_person = df.groupby('SPA_PER_ID')

# One weighted average per person for each weighting scheme.
linear_avg = per_person.apply(weighted_avg, 'linear', center).rename('linear')
exp_avg = per_person.apply(weighted_avg, 'exponential', center).rename('exp')

# NOTE(review): this takes each person's last CMIS_MATCH value, whereas the
# max-likelihood table aggregates the same column with `.any()` — confirm the
# difference is intended.
last_match = per_person['CMIS_MATCH'].last()

# All three pieces are indexed by SPA_PER_ID, so they align on the index.
weighted_predictions = (
    linear_avg.to_frame()
    .join(exp_avg, how='left')
    .join(last_match, how='left')
    .reset_index()
)

weighted_predictions.head()

Unnamed: 0,SPA_PER_ID,linear,exp,CMIS_MATCH
0,3,0.001737,0.001735,False
1,4,0.001183,0.001181,False
2,6,0.001923,0.001916,False
3,12,0.001255,0.001241,False
4,14,0.001821,0.001808,False


# Save and Time

In [10]:
# Output file name -> object to persist under `datapath`.
outputs = {
    'log_predictions.pickle': predictions,
    'log_predictions_time.pickle': df,
    'log_weighted_predictions.pickle': weighted_predictions,
}

for filename, payload in outputs.items():
    # The context manager closes the file even if pickling raises part-way.
    with open(datapath + filename, 'wb') as outfile:
        pickle.dump(payload, outfile)

In [11]:
# Project-local helper. NOTE(review): imported here rather than in the imports
# cell at the top of the notebook.
from custom_methods.calc_time import calc_time_from_sec

# Pass the wall-clock seconds elapsed since `startTime` was recorded.
# Presumably the helper formats them as hours:minutes:seconds, judging by the
# cell output — confirm against its definition.
calc_time_from_sec(time.time() - startTime)

hours:minutes:seconds = 0:2:52.43069386482239
