# Imports

In [5]:
import pandas as pd
import numpy as np
import pickle
import time
from custom_methods.preprocessing import date_map

## Start Timer

In [6]:
startTime = time.time()

# Gather Service Agreements Data
Track 'MAIN' account holder accross accounts and premises

In [7]:
service_agreements = pd.read_csv('../../002_Data/Release_4/ServiceAgreements_Anon.csv').rename({'spa_prem_id':'SPA_PREM_ID', 'spa_acct_id':'SPA_ACCT_ID', 'spa_per_id':'SPA_PER_ID', 'homelessMatch':'CMIS_MATCH', 'EnrollDate':'ENROLL_DATE', 'apartment':'APARTMENT'}, axis=1)

rows = len(service_agreements)
ppl = service_agreements.SPA_PER_ID.nunique()
accts = service_agreements.SPA_ACCT_ID.nunique()
pos_ppl = service_agreements[service_agreements.CMIS_MATCH == True].SPA_PER_ID.nunique()

print(f'Rows: {rows}')
print(f'People: {ppl}')
print(f'Accounts: {accts}')
print(f'pos cases: {pos_ppl}')
service_agreements.head()

Rows: 709068
People: 305480
Accounts: 270990
pos cases: 2386


Unnamed: 0,SPA_PREM_ID,SPA_ACCT_ID,spa_sa_id,SPA_PER_ID,ACCT_REL_TYPE_CD,CMIS_MATCH,START_DT,END_DT,SA_TYPE_DESCR,Class,APARTMENT,ENROLL_DATE
0,115011.0,197077.0,394613.0,81568.0,MAIN,,2018-07-28,,Residential Electric WA,RESIDENTIAL,False,
1,115011.0,197077.0,394613.0,226934.0,COTENANT,,2018-07-28,,Residential Electric WA,RESIDENTIAL,False,
2,144240.0,103592.0,207601.0,39347.0,COTENANT,,2018-07-25,2019-05-31,Residential Electric WA,RESIDENTIAL,False,
3,144240.0,103592.0,207601.0,57810.0,MAIN,,2018-07-25,2019-05-31,Residential Electric WA,RESIDENTIAL,False,
4,83426.0,74182.0,148569.0,272447.0,MAIN,,2018-07-22,2019-12-02,Residential Electric WA,RESIDENTIAL,True,


# Transform

In [8]:
# Convert Dates to months since December, 2015
#service_agreements.START_DT = service_agreements.START_DT.map(string_date_map)
#service_agreements.END_DT = service_agreements.END_DT.map(string_date_map)
service_agreements.ENROLL_DATE = service_agreements.ENROLL_DATE.apply(lambda x: date_map(date=x, relative_to='2015-01-01', format='yyyy-mm-dd'))

# Replace NaN with False in CMIS_MATCH
service_agreements.CMIS_MATCH = service_agreements.CMIS_MATCH.replace(to_replace=np.nan, value=False).astype('bool')

# Any null enroll dates for cmis_match? No - good
print(f'Null Enroll Dates for P Cases: {service_agreements[service_agreements.CMIS_MATCH]["ENROLL_DATE"].isnull().sum()}')

# Retain only columns we want to add to billing - note: all CMIS_MATCHes have ENROLL_DATEs
service_agreements.drop(['spa_sa_id', 'START_DT', 'END_DT', 'SA_TYPE_DESCR', 'Class'], axis=1, inplace=True)

# Create list of accounts that have a cotenant
cotenant_accounts = service_agreements[service_agreements['ACCT_REL_TYPE_CD'] == 'COTENANT']['SPA_ACCT_ID'].values
# Only keep info regarding the 'MAIN' account holder
service_agreements = service_agreements[service_agreements['ACCT_REL_TYPE_CD'] == 'MAIN'].drop('ACCT_REL_TYPE_CD', axis=1)
# Add boolean column for cotenant
service_agreements['HAS_COTENANT'] = service_agreements['SPA_ACCT_ID'].isin(cotenant_accounts).astype('bool')
del cotenant_accounts
service_agreements.drop_duplicates(inplace=True)

# Group Enroll Dates into list
enroll_dates = service_agreements[~service_agreements["ENROLL_DATE"].isnull()].groupby(["SPA_ACCT_ID", "SPA_PREM_ID"])["ENROLL_DATE"].unique()
service_agreements = service_agreements.set_index(['SPA_ACCT_ID','SPA_PREM_ID']).sort_index()
service_agreements.update(enroll_dates)
del enroll_dates
service_agreements["ENROLL_DATE"] = service_agreements["ENROLL_DATE"].apply(lambda x: tuple([]) if np.isnan(x).all() else tuple(x))

# If any CMIS_MATCH for person, then CMIS_MATCH for all instances of person
service_agreements.update(service_agreements.groupby(['SPA_ACCT_ID', 'SPA_PREM_ID'])["CMIS_MATCH"].any())
service_agreements.drop_duplicates(inplace=True)
service_agreements.reset_index(inplace=True)
pos_rows = len(service_agreements[service_agreements.CMIS_MATCH])
pos = service_agreements[service_agreements.CMIS_MATCH].SPA_PER_ID.nunique()
print(f'Positive Rows: {pos_rows}')
print(f'Positive Cases: {pos}')

# Check all CMIS_MATCHes have ENROLL_DATEs
print(f'Null Enroll Dates for P Cases: {service_agreements[service_agreements.CMIS_MATCH]["ENROLL_DATE"].isnull().sum()}\n')

# Check Matching
print('Grouping:')
print(service_agreements.groupby(['SPA_ACCT_ID', 'SPA_PREM_ID']).size().value_counts())

Null Enroll Dates for P Cases: 0
Positive Rows: 2072
Positive Cases: 1935
Null Enroll Dates for P Cases: 0

Grouping:
1    270858
dtype: int64


Unique matching on each ('SPA_ACCT_ID', 'SPA_PREM_ID')

# Enforce Attribute Types

## Check and Replace Missing Values

In [9]:
# Find how many NaN's in each column
service_agreements.isnull().sum()

SPA_ACCT_ID     0
SPA_PREM_ID     0
SPA_PER_ID      0
CMIS_MATCH      0
APARTMENT       0
ENROLL_DATE     0
HAS_COTENANT    0
dtype: int64

## Update Attribute Types

In [10]:
service_agreements.dtypes

SPA_ACCT_ID     float64
SPA_PREM_ID     float64
SPA_PER_ID      float64
CMIS_MATCH       object
APARTMENT          bool
ENROLL_DATE      object
HAS_COTENANT       bool
dtype: object

In [11]:
# Enforce Boolean Types
boolean_cols = [
    'CMIS_MATCH',
    'APARTMENT',
    'HAS_COTENANT'
]
for col in boolean_cols:
    service_agreements[col] = service_agreements[col].astype('bool')

# Enforce int types for categorical attributes
categorical_cols = [
    'SPA_PER_ID',
    'SPA_PREM_ID',
    'SPA_ACCT_ID'
]
for col in categorical_cols:
    service_agreements[col] = service_agreements[col].astype('int64')

In [12]:
service_agreements.dtypes

SPA_ACCT_ID      int64
SPA_PREM_ID      int64
SPA_PER_ID       int64
CMIS_MATCH        bool
APARTMENT         bool
ENROLL_DATE     object
HAS_COTENANT      bool
dtype: object

# Get Processing Stats and Save

## Save Pickle

In [13]:
#filename = '../../002_Data/Release_4/processed_sa.pickle'
#outfile = open(filename, 'wb')
#pickle.dump(service_agreements, outfile)
#outfile.close()

## Check Numbers Retained

In [17]:
1935/2386

0.8109807208717519

In [14]:
retained_p = service_agreements[service_agreements.CMIS_MATCH]['SPA_PER_ID'].nunique()
retained_people = service_agreements.SPA_PER_ID.nunique()
retained_p_rows = len(service_agreements[service_agreements.CMIS_MATCH])
retained_accts = service_agreements.SPA_ACCT_ID.nunique()

print(f'Retained {retained_people} = {100*retained_people/ppl}% of people.')
print(f'Retained {retained_p} = {100*retained_p/pos_ppl}% of positive cases.')
print(f'Retained {retained_p_rows} = {100*retained_p_rows/pos_rows}% of positive rows.')
print(f'Retained {retained_accts} = {100*retained_accts/accts}% of accounts.')

Retained 243360 = 79.664789838942% of people.
Retained 1935 = 81.09807208717518% of positive cases.
Retained 2072 = 100.0% of positive rows.
Retained 253638 = 93.59681169046829% of accounts.


## Total Time

In [15]:
print(f'Total Time in Seconds: {time.time()-startTime}')

Total Time in Seconds: 7.346033573150635
