In [None]:
import pandas as pd
import geopandas as gpd
from blue_conduit_spatial.data import build_datasets, load_datasets

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [2]:
# Shapefiles store truncated column names. This lookup maps each truncated
# name as it appears in the shapefile (key) back to the full column name
# (value). Entries are listed one per line, in their original order.
col_name_dictionary = {
    'pid': 'pid',
    'Property Z': 'Property Zip Code',
    'Owner Type': 'Owner Type',
    'Owner Stat': 'Owner State',
    'Homestead': 'Homestead',
    'Homestea_1': 'Homestead Percent',
    'HomeSEV': 'HomeSEV',
    'Land Value': 'Land Value',
    'Land Impro': 'Land Improvements Value',
    'Residentia': 'Residential Building Value',
    'Resident_1': 'Residential Building Style',
    'Commercial': 'Commercial Building Value',
    'Building S': 'Building Storeys',
    'Parcel Acr': 'Parcel Acres',
    'Rental': 'Rental',
    'Use Type': 'Use Type',
    'Prop Class': 'Prop Class',
    'Old Prop c': 'Old Prop class',
    'Year Built': 'Year Built',
    'USPS Vacan': 'USPS Vacancy',
    'Zoning': 'Zoning',
    'Future Lan': 'Future Landuse',
    'DRAFT Zone': 'DRAFT Zone',
    'Housing Co': 'Housing Condition 2012',
    'Housing _1': 'Housing Condition 2014',
    'Commerci_1': 'Commercial Condition 2013',
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'Hydrant Ty': 'Hydrant Type',
    'Ward': 'Ward',
    'PRECINCT': 'PRECINCT',
    'CENTRACT': 'CENTRACT',
    'CENBLOCK': 'CENBLOCK',
    'SL_Type': 'SL_Type',
    'SL_Type2': 'SL_Type2',
    'SL_Lead': 'SL_Lead',
    'Ed_July': 'Ed_July',
    'Ed_March': 'Ed_March',
    'Last_Test': 'Last_Test',
    'Max_Lead': 'Max_Lead',
    'Med_Lead': 'Med_Lead',
    'Num_Tests': 'Num_Tests',
    'Res_Test': 'Res_Test',
    'Sen_Test': 'Sen_Test',
    'SL_private': 'SL_private_inspection',
    'B_median_a': 'B_median_age_all_women',
    'B_median_1': 'B_median_age_all_men',
    'B_median_2': 'B_median_age_all',
    'B_median_3': 'B_median_age_all_women_white',
    'B_median_4': 'B_median_age_all_men_white',
    'B_median_5': 'B_median_age_all_white',
    'B_median_6': 'B_median_age_all_women_black',
    'B_median_7': 'B_median_age_all_men_black',
    'B_median_8': 'B_median_age_all_black',
    'B_total_bl': 'B_total_black_pop',
    'B_total_wh': 'B_total_white_pop',
    'B_married_': 'B_married_couples',
    'B_single_w': 'B_single_women',
    'B_marrie_1': 'B_married_couples_white',
    'B_single_1': 'B_single_women_white',
    'B_marrie_2': 'B_married_couples_black',
    'B_single_2': 'B_single_women_black',
    'B_marrie_3': 'B_married_couples_w_children',
    'B_single_m': 'B_single_mothers_w_children',
    'B_househol': 'B_households_w_elderly',
    'B_househod': 'B_househod_no_elderly',
    'B_aggregat': 'B_aggregate_income',
    'B_speak_sp': 'B_speak_spanish',
    'B_speak_on': 'B_speak_only_english',
    'B_no_engli': 'B_no_english',
    'B_hispanic': 'B_hispanic_household',
    'B_imputed_': 'B_imputed_rent',
    'B_impute_1': 'B_imputed_value',
    'known_priv': 'known_private_sl',
    'known_publ': 'known_public_sl',
    'hydrovac': 'hydrovac',
    'sl_priva_1': 'sl_private_type',
    'sl_public_': 'sl_public_type',
    'created_at': 'created_at',
    'source': 'source',
    'hv_visit': 'hv_visit',
    'sl_visit': 'sl_visit',
    'replaced': 'replaced',
    'dangerous': 'dangerous',
    'geometry': 'geometry',
}

In [3]:
# Read the service-line shapefile (point this path at the current copy of the
# shapefile) and restore the full column names via col_name_dictionary.
sl_df = gpd.read_file('../data/raw/flint_sl_materials/').rename(columns=col_name_dictionary)

In [4]:
# Columns the classifier does not use; removed before anything else happens.
drop_cols = [
    'known_private_sl', 'known_public_sl', 'hydrovac', 'created_at', 'source',
    'hv_visit', 'sl_visit', 'replaced', 'geometry',
    'Latitude', 'Longitude',
]
data = sl_df.drop(columns=drop_cols)

# Keep labelled rows only. NOTE: reset_index() is called without drop=True, so
# the previous row labels survive as an 'index' column, and that column is
# carried into Xdata below.
data = data[data['dangerous'].notna()].reset_index()

# Features: every remaining column except the parcel id and the target columns.
Xdata = data.drop(columns=['pid', 'sl_private_type', 'sl_public_type', 'dangerous'])

# Targets: 'dangerous' is True when sl_private_type OR sl_public_type contain
# lead; the two type columns are kept alongside it.
Ydata = data[['sl_private_type', 'sl_public_type', 'dangerous']]

# Categorical columns to one-hot encode. 'PRECINCT' is deliberately not in this
# list: it is pulled out below as the grouping key instead of being encoded.
dummy_cols = [
    'Property Zip Code', 'Owner Type', 'Residential Building Style', 'Homestead', 'Building Storeys',
    'Rental', 'Use Type', 'Prop Class', 'Old Prop class', 'USPS Vacancy', 'Housing Condition 2012',
    'Housing Condition 2014', 'Owner State', 'Zoning', 'Future Landuse', 'Commercial Condition 2013',
    'Hydrant Type', 'SL_Type', 'SL_Type2', 'DRAFT Zone', 'Last_Test', 'SL_private_inspection', 'Ward',
    'CENTRACT', 'CENBLOCK',
]

# Missing values become -1 first, then the categorical columns are expanded
# into dummy columns.
Xdata = pd.get_dummies(Xdata.fillna(-1), columns=dummy_cols)

# Precinct is the group label for spatial cross validation; it is taken out of
# the feature matrix once it has been captured.
groups = Xdata['PRECINCT']
Xdata = Xdata.drop(columns='PRECINCT')

In [5]:
# Group Shuffle Split example. train_test_split could go here, but spatial
# cross validation (splitting on the precinct labels in `groups`) is preferred
# over a uniform random sample.
gss = GroupShuffleSplit(n_splits=3, train_size=.75, random_state=42)

# Only the first of the generated splits is used.
train_index, test_index = next(gss.split(Xdata, Ydata, groups))

# split() yields positional row numbers, so rows are selected with .iloc.
# Label-based .loc only gave the same rows because the frames carry a fresh
# 0..n-1 index; .iloc stays correct if that ever changes.
Xtrain = Xdata.iloc[train_index]
Xtest = Xdata.iloc[test_index]
Ytrain = Ydata.iloc[train_index]
Ytest = Ydata.iloc[test_index]

In [6]:
from blue_conduit_spatial.data import build_datasets, load_datasets

def test_build(Xtrain, Xtest, Ytrain, Ytest):
    """Check build_datasets output against a split computed in the notebook.

    Calls build_datasets on the raw shapefile directory, passing
    '../data/test_dir' as save_dir, and compares the four frames it returns
    with the ones given here.

    Args:
        Xtrain, Xtest: expected feature frames; their 'index' column is
            dropped before comparison.
        Ytrain, Ytest: expected target frames, compared as-is.

    Raises:
        AssertionError: if any returned frame differs from its expected one.
    """
    data_dir = '../data'
    built = build_datasets(
        f'{data_dir}/raw/flint_sl_materials/',
        save_dir=f'{data_dir}/test_dir',
    )
    Xtrain_, Xtest_, Ytrain_, Ytest_ = built

    # Feature frames carry an extra 'index' column that build_datasets output
    # is compared without.
    for expected, actual in ((Xtrain, Xtrain_), (Xtest, Xtest_)):
        assert expected.drop(columns='index').equals(actual)

    for expected, actual in ((Ytrain, Ytrain_), (Ytest, Ytest_)):
        assert expected.equals(actual)
    
def test_load(Xtrain, Xtest, Ytrain, Ytest):
    """Check load_datasets output against a split computed in the notebook.

    Loads the four frames from '../data/test_dir' and compares them with the
    given frames after normalising the latter: feature frames lose their
    'index' column, every frame gets a fresh 0..n-1 row index, and the
    'dangerous' target column is cast to int. Loaded frames are cast to the
    expected dtypes before comparison.

    Args:
        Xtrain, Xtest: expected feature frames (must contain an 'index' column).
        Ytrain, Ytest: expected target frames (must contain 'dangerous').

    Raises:
        AssertionError: if any loaded frame differs from its expected one.
    """
    data_dir = '../data'
    save_dir = f'{data_dir}/test_dir'

    def _features(frame):
        # Remove the leftover 'index' column and renumber rows from zero.
        return frame.drop(columns='index').reset_index(drop=True)

    def _targets(frame):
        # Renumber rows from zero and hold the 'dangerous' flag as int.
        renumbered = frame.reset_index(drop=True)
        renumbered['dangerous'] = renumbered['dangerous'].astype(int)
        return renumbered

    Xtrain, Xtest = _features(Xtrain), _features(Xtest)
    Ytrain, Ytest = _targets(Ytrain), _targets(Ytest)

    Xtrain_, Xtest_, Ytrain_, Ytest_ = load_datasets(save_dir)

    comparisons = (
        (Xtrain_, Xtrain),
        (Xtest_, Xtest),
        (Ytrain_, Ytrain),
        (Ytest_, Ytest),
    )
    for loaded, expected in comparisons:
        assert loaded.astype(expected.dtypes).equals(expected)
    
# Run both checks against the split computed above. test_load reads from the
# same '../data/test_dir' directory that test_build hands to build_datasets as
# save_dir, so test_build presumably has to run first -- TODO confirm.
test_build(Xtrain, Xtest, Ytrain, Ytest)
test_load(Xtrain, Xtest, Ytrain, Ytest)

In [6]:
# Build the datasets with build_datasets, passing the 'processed' directory as
# save_dir. No model is trained in this cell: it only rebinds Xtrain, Xtest,
# Ytrain and Ytest to the values build_datasets returns, which the (currently
# commented-out) xgboost cells below would consume.
data_dir = '../data'
data_raw_path = f'{data_dir}/raw/flint_sl_materials/'
save_dir = f'{data_dir}/processed'
Xtrain, Xtest, Ytrain, Ytest = build_datasets(data_raw_path, save_dir=save_dir)

In [None]:
# xgb = xgboost.XGBClassifier()

# xgb.fit(Xtrain, Ytrain['dangerous'])
# yhat = xgb.predict_proba(Xtest)

In [None]:
# # Measure predictive power.  This is the ROC AUC score, but any metric could go here.
# roc_auc_score(Ytest['dangerous'], yhat[:,1])

In [None]:
# ## ADDED BY KEVIN HARE (9/26/2021)
# import numpy as np
# np.savez('../data/predictions/baseline_preds.npz', yhat=yhat, ytrue=Ytest['dangerous'])
# xgb.save_model('../models/baseline_jared_20210921.json')