# Data Importing

In [256]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib

from pandas_profiling import ProfileReport
from category_encoders import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

ImportError: cannot import name 'joblib' from 'sklearn.externals' (/home/tyler/.local/share/virtualenvs/family-promise-spokane-ds-a-lmm-C0CO/lib/python3.8/site-packages/sklearn/externals/__init__.py)

In [None]:
# Show every column when a frame is displayed (no truncation of wide tables)
pd.set_option('display.max_columns', None)

# Load the raw export and hold it as a DataFrame for the cleaning pipeline
data = pd.read_csv('../All_data_with_exits.csv')
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the raw, uncleaned data
df.head()

# Target Recategorization


Because the target is initially recorded in a very granular manner, the target labels need to be recategorized into the five categories provided by the stakeholder:

- Permanent Exit
- Temporary Exit
- Emergency Shelter
- Transitional Housing
- Unknown/Other

**Permanent Exit**

- Staying or living with family, permanent tenure
- Staying or living with friends, permanent tenure
- Permanent housing (other than RRH) for formerly homeless persons
- Rental by client with RRH or equivalent subsidy
- Rental by client, no ongoing housing subsidy
- Rental by client, other ongoing housing subsidy
- Owned by client, no ongoing housing subsidy

**Temporary Exit**  

- Place not meant for habitation (e.g., a vehicle, an abandoned building, bus/train/subway station/airport or anywhere outside)
- Staying or living with family, temporary tenure (e.g., room, apartment or house)
- Staying or living with friends, temporary tenure (e.g., room, apartment or house)
- Hotel or Motel paid for without Emergency Shelter Voucher

**Emergency Shelter**  

- Emergency shelter, including hotel or motel paid for with emergency shelter voucher, or RHY-funded Host Home shelter 

**Transitional Housing**  

- Transitional Housing for homeless persons (including homeless youth)
- Safe Haven
- Substance Abuse Treatment or Detox Center
- Foster Care Home or Foster Care Group Home
- Psychiatric Hospital or Other Psychiatric Facility

**Unknown/Other**

- No exit interview completed
- Client refused
- Other
- Client doesn't know

Because pandas' built-in value-mapping function (`Series.map`) works directly with a flat `{original_value: category}` dictionary, and is more performant and consistent that way, we use that structure rather than a more DRY dictionary that has each category as the key and a list of the original values as its value.  
e.g. `values_dict = {'Permanent Exit' : [some_value, some_value2]}`

In [None]:
# Lookup table passed to Series.map: each granular exit destination (key)
# is assigned to one of the five target categories (value).
values_dict = {
    # Permanent Exits
    'Staying or living with family, permanent tenure' : 'Permanent Exit',
    'Staying or living with friends, permanent tenure' : 'Permanent Exit',
    'Permanent housing (other than RRH) for formerly homeless persons' : 'Permanent Exit',
    'Rental by client with RRH or equivalent subsidy' : 'Permanent Exit',
    'Rental by client, no ongoing housing subsidy' : 'Permanent Exit',
    'Rental by client, other ongoing housing subsidy' : 'Permanent Exit',
    'Owned by client, no ongoing housing subsidy' : 'Permanent Exit',
    # Temporary Exits
    'Staying or living with family, temporary tenure (e.g., room, apartment or house)' : 'Temporary Exit',
    'Staying or living with friends, temporary tenure (e.g., room, apartment or house)' : 'Temporary Exit',
    'Hotel or Motel paid for without Emergency Shelter Voucher' : 'Temporary Exit',
    # Emergency Shelter
    'Emergency shelter, including hotel or motel paid for with emergency shelter voucher, or RHY-funded Host Home shelter' : 'Emergency Shelter',
    # Transitional Housing
    'Transitional Housing for homeless persons (including homeless youth)' : 'Transitional Housing',
    'Safe Haven' : 'Transitional Housing',
    'Substance Abuse Treatment or Detox Center' : 'Transitional Housing',
    'Foster Care Home or Foster Care Group Home' : 'Transitional Housing',
    'Psychiatric Hospital or Other Psychiatric Facility' : 'Transitional Housing',
    # Unknown/Other
    # NOTE(review): the category write-up in this notebook lists "Place not
    # meant for habitation" under Temporary Exit, but it is mapped to
    # Unknown/Other here -- confirm with the stakeholder which one is intended.
    'Place not meant for habitation (e.g., a vehicle, an abandoned building, bus/train/subway station/airport or anywhere outside)' : 'Unknown/Other',
    'No exit interview completed' : 'Unknown/Other',
    'Client refused' : 'Unknown/Other',
    'Other' : 'Unknown/Other',
    'Client doesn\'t know' : 'Unknown/Other',
    # Missing destinations; np.nan is used because the np.NaN alias was
    # removed in NumPy 2.0.
    np.nan : 'Unknown/Other'
}

In [None]:
# Columns holding dates; the cleaning pipeline converts these to datetime
date_features = [
    'Enroll Date',
    'Exit Date',
    'CurrentDate',
    'Date of First Contact (Beta)',
    'Date of First ES Stay (Beta)',
    'Date of Last Contact (Beta)',
    'Date of Last ES Stay (Beta)',
    'Engagement Date',
    'Homeless Start Date',
]

In [None]:
# Column names that still carry a stray leading letter after the
# text filter has been applied to the headers
text_artifacts = [
    'RReferral Source', 'RDate Status Determined', 'REnroll Status',
    'RRunaway Youth', 'RReason Why No Services Funded',
    'RSexual Orientation', 'RLast Grade Completed', 'RSchool Status',
    'REmployed Status', 'RWhy Not Employed', 'RType of Employment',
    'RLooking for Work', 'RGeneral Health Status', 'RDental Health Status',
    'RMental Health Status', 'RPregnancy Status', 'RPregnancy Due Date',
    'VLast Permanent Address', 'VState', 'VZip',
]

# Map every artifact name to the same name without its first character
rename_dict = {artifact: artifact[1:] for artifact in text_artifacts}

In [None]:
# Show the artifact-name -> fixed-name mapping for a visual sanity check
print(rename_dict)

In [None]:
# Inputs for the imputation step of the cleaning pipeline.
# column_impute_list: columns whose unknown-type answers are consolidated.
# replace_list: the values (including missing) that are replaced by "Unknown".
column_impute_list = ['Race' , 'Ethnicity' , 'Length of Stay']
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
replace_list = ['Client refused','Client doesn\'t know', 'Data not collected', np.nan]

## Cleaning Pipeline

Start and create pipeline

In [None]:
def start_pipeline(dataf):
    '''Entry point of the cleaning pipeline.

    Returns a deep copy of ``dataf`` so that the later steps, which modify
    the frame they receive, never touch the original dataframe.
    '''
    working_copy = dataf.copy(deep=True)
    return working_copy

def column_cleaner(dataf):
    '''Removes numeric prefixes from the column names.

    Every run of digits together with the single character that follows it
    (for example the "3." and the "2 " in "3.2 Race") is deleted from each
    column name. The names are changed in place and the same dataframe is
    returned so the function can be chained with ``DataFrame.pipe``.
    '''
    # regex=True is passed explicitly: since pandas 2.0 str.replace treats
    # the pattern as a literal string by default, which would silently turn
    # this step into a no-op.
    # The "." is kept unescaped (any character) to preserve the existing
    # behavior: it also consumes the separator that follows the digits,
    # whether that is a dot or a space.
    dataf.columns = dataf.columns.str.replace(r'\d+.', '', regex=True)
    return dataf

def column_rename(dataf):
    '''Renames the columns listed in ``rename_dict`` to their fixed names.

    Returns the renamed dataframe produced by ``DataFrame.rename``; columns
    that are not keys of ``rename_dict`` keep their names.
    '''
    return dataf.rename(columns=rename_dict)

def column_strip(dataf):
    '''Removes leading spaces left on column names by the regex cleanup.

    Only space characters at the start of a name are stripped; the names
    are updated in place and the same dataframe is returned.
    '''
    stripped_names = dataf.columns.str.lstrip(' ')
    dataf.columns = stripped_names
    return dataf

def set_dtypes(dataf):
    '''Converts every column named in ``date_features`` to datetime.

    The conversion is applied column by column with ``pd.to_datetime`` and
    written back into ``dataf``, which is then returned.
    '''
    as_datetimes = dataf[date_features].apply(pd.to_datetime, infer_datetime_format=True)
    dataf[date_features] = as_datetimes
    return dataf

def add_categories(dataf):
    '''Adds the ``Recategorized`` target column.

    Each value of ``Exit Destination`` is looked up in ``values_dict`` and
    the resulting category is stored in a new ``Recategorized`` column of
    ``dataf``, which is returned.
    '''
    recategorized = dataf['Exit Destination'].map(values_dict)
    dataf['Recategorized'] = recategorized
    return dataf

def impute_values(dataf):
    '''Consolidates missing and unknown answers into a single "Unknown" value.

    For every column named in ``column_impute_list``, each value found in
    ``replace_list`` is replaced with the string "Unknown". The columns are
    updated on ``dataf`` itself, which is returned.
    '''
    for column in column_impute_list:
        # Assign the result back rather than calling
        # dataf[column].replace(..., inplace=True): the in-place call acts
        # on an intermediate Series, which triggers a chained-assignment
        # warning in recent pandas and leaves dataf unchanged when
        # copy-on-write is enabled.
        dataf[column] = dataf[column].replace(replace_list, 'Unknown')
    return dataf

Run pipeline

In [None]:
# Cleaning steps, applied in this order; the first one copies the raw frame
cleaning_steps = (
    start_pipeline,
    column_cleaner,
    column_rename,
    column_strip,
    set_dtypes,
    add_categories,
    impute_values,
)

# Feed the output of each step into the next one
cleaned = df
for cleaning_step in cleaning_steps:
    cleaned = cleaned.pipe(cleaning_step)
df2 = cleaned

## Pipeline Results Testing

In [None]:
# Confirm the date columns were converted: print the dtype of each one
for date_column in date_features:
    print(df2[date_column].dtype)

In [None]:
# Count rows per target category; dropna=False also shows destinations
# that did not match any key of the mapping (they appear as NaN)
df2['Recategorized'].value_counts(dropna=False)

In [None]:
# Preview the first rows of the cleaned data
df2.head()

## Initial Visualizations  

Final Visualizations will need to be formatted with proper object usage and syntax

In [None]:
# Bar chart of the number of rows in each recategorized exit class
df2['Recategorized'].value_counts().plot.bar();

In [None]:
# Scatterplot of exit category against income total at entry
income_ax = sns.scatterplot(data=df2, y='Recategorized', x='Income Total at Entry')
income_ax.set_title('Exit Destination vs. Income Total at Entry');

# Feature Engineering

In [None]:
# Enrollment length: time elapsed between the enroll date and the exit date
enrollment_length = df2['Exit Date'].sub(df2['Enroll Date'])
df2['Enrollment Length'] = enrollment_length
print(df2['Enrollment Length'].dtype)

In [None]:
# Preview the data again, now including the engineered 'Enrollment Length' column
df2.head()

# Feature Selection

In [None]:
# Columns used as model inputs
features = [
    'CaseMembers',
    'Race',
    'Ethnicity',
    'Current Age',
    'Gender',
    'Length of Stay',
    'Days Enrolled in Project',
    'Household Type',
    'Barrier Count at Entry',
]

In [None]:
# Column the model predicts: the recategorized exit destination
target = 'Recategorized'

In [None]:
# Feature matrix and target vector taken from the cleaned data
X, y = df2[features], df2[target]

In [None]:
# Inspect the first rows of the feature matrix X
X.head()

In [None]:
# Distribution of barrier counts, with missing values counted as well
X['Barrier Count at Entry'].value_counts(dropna=False)

In [None]:
# Check the dtype of the 'Race' feature
X['Race'].dtypes

In [None]:
# Train / validation / test split

# Step 1: hold out 20% of the rows as the test set
X_remaining, X_test, y_remaining, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Step 2: split the remaining 80% into train and validation;
# 25% of that remainder is 20% of all rows, giving a 60/20/20 split
X_train, X_val, y_train, y_val = train_test_split(
    X_remaining, y_remaining, test_size=0.25, random_state=42)



# Modeling

Modeling Strategy: 
- Implement SKL pipeline to add modularity to workflow
- Begin with random forest implementation
- Update model choices using combinations of cross-validation, loss metrics, hyperparameter tuning

In [None]:
# Random forest pipeline: the 'ord' encoder runs first, then the imputer
# with its default settings, then the classifier
rf_steps = [
    ('ord', OrdinalEncoder()),
    ('imputer', SimpleImputer()),
    ('classifier', RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42, verbose=1)),
]
random_forest_model = Pipeline(steps=rf_steps)



In [None]:
# Fit the whole pipeline (encoder, imputer, classifier) on the training split
random_forest_model.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the validation split
random_forest_model.score(X_val, y_val)

In [None]:
# Setup for classification report metrics
y_true = y_val
y_pred = random_forest_model.predict(X_val)
# classification_report pairs target_names positionally with the sorted
# labels present in y_true / y_pred. A hand-written list in any other order
# attaches the wrong class name to each row of the report, so the names are
# derived here in that same sorted order.
target_names = sorted(set(y_true) | set(y_pred))

In [None]:
# Print the per-class classification report for the validation predictions
print(classification_report(y_true, y_pred, target_names=target_names))

# Model Serialization


In [None]:
# Pull the fitted classifier step out of the pipeline and write it to disk.
# NOTE(review): only the classifier is saved; the encoder and imputer steps
# are not part of this file -- confirm that consumers of the pickle apply
# the same preprocessing before predicting.
joblib_file = "randomforest_modelv1.pkl"
clf = random_forest_model.named_steps['classifier']
joblib.dump(clf, joblib_file)