# Load Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import matplotlib.pyplot as plt
%config IPCompleter.greedy=True

In [2]:
# Wall-clock start of the run; the final cell subtracts this from time.time()
# to report the notebook's total runtime.
start_time = time.time()

In [3]:
# Load the combined data set and the list of attributes selected upstream.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this project's own pipeline.
filename = '../../002_Data/Release_4/data_combined.pickle'
with open(filename, 'rb') as infile:  # handle is closed even if unpickling raises
    df = pickle.load(infile)

filename = '../../002_Data/Release_4/attribute_list.pickle'
with open(filename, 'rb') as infile:
    keep = pickle.load(infile)

# Display the attribute list as the cell output.
keep

['PAST_DUE',
 'TOTAL_60_DAYS_AMT',
 'NUM_PREM_FOR_PER',
 'BREAK_ARRANGEMENT',
 'COVID_REMINDER',
 'MULTI_DWELL_SIZE',
 'SNAP_GEO',
 'NUM_PER_FOR_PREM',
 'APARTMENT',
 'HAS_COTENANT']

In [4]:
# Build the final column list. Written so the cell can be re-run safely:
# `keep.remove(...)` raises ValueError once the entry is gone, and plain
# concatenation would append the extra columns a second time.

# APARTMENT was causing convergence issues, so it is excluded.
keep = [col for col in keep if col != 'APARTMENT']
# Add other to keep
# (earlier variant) keep = keep + ['SPA_PER_ID', 'ARREARSMONTH', 'CMIS_MATCH']
keep = keep + [
    col
    for col in ['SPA_PER_ID', 'ARREARSMONTH', 'LAST_MO_W_DATA']
    if col not in keep
]

In [5]:
# Restrict the frame to the selected columns, then show the null count per column.
df = df.loc[:, keep]
df.isna().sum()

PAST_DUE             0
TOTAL_60_DAYS_AMT    0
NUM_PREM_FOR_PER     0
BREAK_ARRANGEMENT    0
COVID_REMINDER       0
MULTI_DWELL_SIZE     0
SNAP_GEO             0
NUM_PER_FOR_PREM     0
HAS_COTENANT         0
SPA_PER_ID           0
ARREARSMONTH         0
LAST_MO_W_DATA       0
dtype: int64

# Require One Row per (Person, Month)

In [6]:
# Distribution of row counts per (person, month) key; any count above 1 marks a duplicate.
(
    df.groupby(['SPA_PER_ID', 'ARREARSMONTH'])
    .size()
    .value_counts()
)

1    3331612
2       7332
dtype: int64

## Keep only the last row per (person, month) - the one most related to homelessness

In [7]:
# Row count before and after collapsing duplicate (person, month) keys.
# keep='last' retains whichever duplicate appears last in the current row order.
print(df.shape[0])
df = df.drop_duplicates(
    subset=['SPA_PER_ID', 'ARREARSMONTH'],
    keep='last',
    ignore_index=True,
)
print(df.shape[0])

# Re-run the key-count distribution; only a count of 1 should remain.
(
    df.groupby(['SPA_PER_ID', 'ARREARSMONTH'])
    .size()
    .value_counts()
)

3346276
3338944


1    3338944
dtype: int64

# Take Difference (Inactive)

In [8]:
def take_diff(df, save_out_cols):
    """
    04/23/21
    Replace every column not listed in ``save_out_cols`` with its row-to-row
    difference, leaving the ``save_out_cols`` columns untouched.

    The columns being differenced must be numerical. Rows containing any
    missing value after differencing (at minimum the first row, which has no
    predecessor) are dropped from the result.

    NOTE(review): the difference is taken between consecutive rows of the whole
    frame, with no grouping, so it also spans the boundary between one person's
    rows and the next person's -- confirm this is intended before re-enabling.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    untouched = df[save_out_cols].copy()
    deltas = df.drop(columns=save_out_cols).diff()
    return untouched.join(deltas).dropna()

In [9]:
# Disabled step: the differencing call is wrapped in a string literal so that it
# does not execute; the cell only echoes the string back as its output.
'''
save_out = [
    'SPA_PER_ID',
    'ARREARSMONTH',
    'CMIS_MATCH',
    'CMIS_DB_ENTRY',
    'HAS_COTENANT'
]
df = take_diff(df, save_out_cols=save_out)

df.isnull().sum()
'''

"\nsave_out = [\n    'SPA_PER_ID',\n    'ARREARSMONTH',\n    'CMIS_MATCH',\n    'CMIS_DB_ENTRY',\n    'HAS_COTENANT'\n]\ndf = take_diff(df, save_out_cols=save_out)\n\ndf.isnull().sum()\n"

# Reconfigure for Cox Time Varying
The time-varying Cox fitter expects the data in long format with:
* a `start` column
* a `stop` column
* one row per person per unique set of time-varying covariates

## Custom Methods

In [10]:
def transform_to_cox_time_varying(df: pd.DataFrame, month_col: str) -> pd.DataFrame:
    '''
    05/11/21
    Transform data to format required by lifelines CoxTimeVaryingFitter.
    'start' is exclusive and 'stop' is inclusive

    Rows are grouped on every column except ``month_col`` and each group is
    collapsed to a single row carrying:

    * ``start`` -- the smallest ``month_col`` value observed in the group
    * ``stop``  -- the largest ``month_col`` value observed in the group,
      increased by 1 when it equals ``start`` so that no zero-length interval
      is produced.

    NOTE(review): a group is defined purely by identical values in the
    non-month columns, so two non-adjacent stretches of months with the same
    covariates are merged into a single interval that also spans the months in
    between -- confirm this matches the intended "one row per unique set of
    covariates" design.

    Parameters
    ----------
    df : pd.DataFrame
        One row per observation month. Not modified.
    month_col : str
        Name of the column holding the numeric month.

    Returns
    -------
    pd.DataFrame
        New frame without ``month_col`` and with ``start`` and ``stop``
        appended as the last two columns.

    Raises
    ------
    ValueError
        If ``month_col`` is not a column of ``df``.
    '''
    if month_col not in df.columns:
        raise ValueError(f'{month_col!r} is not a column of df')
    key_cols = [col for col in df.columns if col != month_col]

    # One pass over the groups yields both interval bounds. 'start' is constant
    # within a group, so a second grouping that includes it adds nothing.
    bounds = df.groupby(key_cols)[month_col].agg(start='min', stop='max')
    new_df = df.join(bounds, how='left', on=key_cols)

    # Widen single-month intervals. A masked .loc assignment writes into the
    # frame itself; calling .update() on the selected column modifies only a
    # temporary when pandas Copy-on-Write is active, which would leave
    # start == stop rows behind.
    single_month = new_df['start'] == new_df['stop']
    new_df.loc[single_month, 'stop'] += 1

    return new_df.drop(month_col, axis=1).drop_duplicates()

## Reconfigure

In [11]:
# NOTE: this cell is not re-runnable on its own -- the call removes ARREARSMONTH
# from df, so a second execution fails because the month column is missing.
df = transform_to_cox_time_varying(
    df,
    month_col='ARREARSMONTH',
)

# Clean Up

In [12]:
# Null count per column after the reshape, including the new start/stop columns.
df.isna().sum()

PAST_DUE             0
TOTAL_60_DAYS_AMT    0
NUM_PREM_FOR_PER     0
BREAK_ARRANGEMENT    0
COVID_REMINDER       0
MULTI_DWELL_SIZE     0
SNAP_GEO             0
NUM_PER_FOR_PREM     0
HAS_COTENANT         0
SPA_PER_ID           0
LAST_MO_W_DATA       0
start                0
stop                 0
dtype: int64

In [13]:
# Number of zero-length intervals (start equal to stop); expected to be 0.
int((df['start'] == df['stop']).sum())

0

In [14]:
# Inspect column dtypes; the object-typed flag columns are cast to bool in the next cell.
df.dtypes

PAST_DUE               int64
TOTAL_60_DAYS_AMT    float64
NUM_PREM_FOR_PER       int64
BREAK_ARRANGEMENT      int64
COVID_REMINDER         int64
MULTI_DWELL_SIZE       int64
SNAP_GEO             float64
NUM_PER_FOR_PREM       int64
HAS_COTENANT          object
SPA_PER_ID             int64
LAST_MO_W_DATA        object
start                  int64
stop                   int64
dtype: object

In [15]:
# Cast the two object-typed flag columns to bool in a single pass.
# NOTE(review): astype('bool') turns every non-empty string into True, so this is
# only correct if the columns hold real booleans / 0-1 values -- confirm upstream.
df = df.astype({'HAS_COTENANT': 'bool', 'LAST_MO_W_DATA': 'bool'})
# df['APARTMENT'] = df['APARTMENT'].astype('bool')
# df['CMIS_MATCH'] = df['CMIS_MATCH'].astype('bool')

# Save Output

In [16]:

# Persist the Cox-ready frame for the modelling notebooks.
filename = '../../002_Data/Release_4/cox_time_ready_last_data.pickle'
with open(filename, 'wb') as outfile:  # flushed and closed even if dump raises
    pickle.dump(df, outfile)


# Calculate Time

In [17]:
from custom_methods.calc_time import calc_time_from_sec

# Seconds elapsed since start_time was recorded near the top of the notebook.
elapsed_seconds = time.time() - start_time
calc_time_from_sec(elapsed_seconds)

hours:minutes:seconds = 0:0:34.51171040534973
