In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from blue_conduit_spatial.utilities import build_datasets, load_datasets

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [2]:
# Shape files truncate column names (no key below is longer than 10 characters).
# This mapping goes from the truncated name found in the shapefile to the full
# column name used by the rest of the notebook; names that were never truncated
# simply map to themselves.
col_name_dictionary = {
    'pid': 'pid',
    'Property Z': 'Property Zip Code',
    'Owner Type': 'Owner Type',
    'Owner Stat': 'Owner State',
    'Homestead': 'Homestead',
    'Homestea_1': 'Homestead Percent',
    'HomeSEV': 'HomeSEV',
    'Land Value': 'Land Value',
    'Land Impro': 'Land Improvements Value',
    'Residentia': 'Residential Building Value',
    'Resident_1': 'Residential Building Style',
    'Commercial': 'Commercial Building Value',
    'Building S': 'Building Storeys',
    'Parcel Acr': 'Parcel Acres',
    'Rental': 'Rental',
    'Use Type': 'Use Type',
    'Prop Class': 'Prop Class',
    'Old Prop c': 'Old Prop class',
    'Year Built': 'Year Built',
    'USPS Vacan': 'USPS Vacancy',
    'Zoning': 'Zoning',
    'Future Lan': 'Future Landuse',
    'DRAFT Zone': 'DRAFT Zone',
    'Housing Co': 'Housing Condition 2012',
    'Housing _1': 'Housing Condition 2014',
    'Commerci_1': 'Commercial Condition 2013',
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'Hydrant Ty': 'Hydrant Type',
    'Ward': 'Ward',
    'PRECINCT': 'PRECINCT',
    'CENTRACT': 'CENTRACT',
    'CENBLOCK': 'CENBLOCK',
    'SL_Type': 'SL_Type',
    'SL_Type2': 'SL_Type2',
    'SL_Lead': 'SL_Lead',
    'Ed_July': 'Ed_July',
    'Ed_March': 'Ed_March',
    'Last_Test': 'Last_Test',
    'Max_Lead': 'Max_Lead',
    'Med_Lead': 'Med_Lead',
    'Num_Tests': 'Num_Tests',
    'Res_Test': 'Res_Test',
    'Sen_Test': 'Sen_Test',
    'SL_private': 'SL_private_inspection',
    # --- B_* columns ---
    'B_median_a': 'B_median_age_all_women',
    'B_median_1': 'B_median_age_all_men',
    'B_median_2': 'B_median_age_all',
    'B_median_3': 'B_median_age_all_women_white',
    'B_median_4': 'B_median_age_all_men_white',
    'B_median_5': 'B_median_age_all_white',
    'B_median_6': 'B_median_age_all_women_black',
    'B_median_7': 'B_median_age_all_men_black',
    'B_median_8': 'B_median_age_all_black',
    'B_total_bl': 'B_total_black_pop',
    'B_total_wh': 'B_total_white_pop',
    'B_married_': 'B_married_couples',
    'B_single_w': 'B_single_women',
    'B_marrie_1': 'B_married_couples_white',
    'B_single_1': 'B_single_women_white',
    'B_marrie_2': 'B_married_couples_black',
    'B_single_2': 'B_single_women_black',
    'B_marrie_3': 'B_married_couples_w_children',
    'B_single_m': 'B_single_mothers_w_children',
    'B_househol': 'B_households_w_elderly',
    'B_househod': 'B_househod_no_elderly',
    'B_aggregat': 'B_aggregate_income',
    'B_speak_sp': 'B_speak_spanish',
    'B_speak_on': 'B_speak_only_english',
    'B_no_engli': 'B_no_english',
    'B_hispanic': 'B_hispanic_household',
    'B_imputed_': 'B_imputed_rent',
    'B_impute_1': 'B_imputed_value',
    # --- service-line label / inspection columns and geometry ---
    'known_priv': 'known_private_sl',
    'known_publ': 'known_public_sl',
    'hydrovac': 'hydrovac',
    'sl_priva_1': 'sl_private_type',
    'sl_public_': 'sl_public_type',
    'created_at': 'created_at',
    'source': 'source',
    'hv_visit': 'hv_visit',
    'sl_visit': 'sl_visit',
    'replaced': 'replaced',
    'dangerous': 'dangerous',
    'geometry': 'geometry',
}

In [3]:
# Load the service-line shapefile and restore the full (untruncated) column names.
# The path is relative to this notebook; update it to point at the newest
# shapefile when a new one is delivered.
sl_df = gpd.read_file('../data/raw/flint_sl_materials/').rename(columns=col_name_dictionary)

In [4]:
# Columns that are not used by the classifier.
drop_cols = [
    'known_private_sl', 'known_public_sl', 'hydrovac', 'created_at', 'source',
    'hv_visit', 'sl_visit', 'replaced', 'geometry',
    'Latitude', 'Longitude',
]
data = sl_df.drop(columns=drop_cols)

# Keep labelled rows only.
# NOTE: reset_index() is called without drop=True, so the previous row labels
# are kept as a new 'index' column that ends up in Xdata.
data = data.loc[data['dangerous'].notna()].reset_index()

# Targets: 'dangerous' is True when sl_private_type OR sl_public_type contain lead.
Ydata = data[['sl_private_type', 'sl_public_type', 'dangerous']]

# Parcel identifiers, kept aside so rows can be traced back.
pid = data['pid']

# Features: everything except the parcel id and the three target columns.
Xdata = data.drop(columns=['pid', 'sl_private_type', 'sl_public_type', 'dangerous'])

# Categorical columns to one-hot encode. 'PRECINCT' is deliberately left out:
# it is pulled out below as the grouping key instead of being used as a feature.
dummy_cols = [
    'Property Zip Code', 'Owner Type', 'Residential Building Style', 'Homestead', 'Building Storeys',
    'Rental', 'Use Type', 'Prop Class', 'Old Prop class', 'USPS Vacancy', 'Housing Condition 2012',
    'Housing Condition 2014', 'Owner State', 'Zoning', 'Future Landuse', 'Commercial Condition 2013',
    'Hydrant Type', 'SL_Type', 'SL_Type2', 'DRAFT Zone', 'Last_Test', 'SL_private_inspection', 'Ward',
    'CENTRACT', 'CENBLOCK',
]

# Missing values become -1 before encoding, then the categoricals are expanded
# into dummy columns.
Xdata = pd.get_dummies(Xdata.fillna(-1), columns=dummy_cols)

# Groups for spatial cross validation: take PRECINCT out of the feature matrix.
groups = Xdata.pop('PRECINCT')

In [5]:
# Preview the encoded feature matrix. The leading 'index' column is the old row
# label kept by the reset_index() call in the preparation cell.
Xdata.head()

Unnamed: 0,index,Homestead Percent,HomeSEV,Land Value,Land Improvements Value,Residential Building Value,Commercial Building Value,Parcel Acres,Year Built,SL_Lead,...,CENTRACT_3800,CENTRACT_4000,CENTRACT_13500,CENTRACT_13600,CENBLOCK_1,CENBLOCK_2,CENBLOCK_3,CENBLOCK_4,CENBLOCK_5,CENBLOCK_6
0,38,100.0,18400,932,0,35843,0.0,0.14,2,0,...,0,0,0,0,0,1,0,0,0,0
1,47,100.0,11800,420,0,23227,0.0,0.05,2,0,...,0,0,0,0,0,1,0,0,0,0
2,53,0.0,0,602,0,18180,0.0,0.051,1912,0,...,0,0,0,0,0,1,0,0,0,0
3,59,50.0,4550,781,0,17452,0.0,0.086,2,0,...,0,0,0,0,0,1,0,0,0,0
4,70,100.0,12800,510,0,25104,0.0,0.07,1900,0,...,0,0,0,0,0,1,0,0,0,0


In [7]:
def test_build(Xdata, Ydata, pid):

    data_dir = '../data'
    data_raw_path = f'{data_dir}/raw/flint_sl_materials/'
    save_dir = f'{data_dir}/test_dir'
    Xdata_, Ydata_, pid_, _, _ = build_datasets(data_raw_path, save_dir=save_dir)
    
    assert Xdata.drop('index', axis=1).equals(Xdata_)
    assert Ydata.equals(Ydata_)
    assert pid.equals(pid_)
    
def test_load(Xdata, Ydata, pid):

    data_dir = '../data'
    save_dir = f'{data_dir}/test_dir'
    
    Xdata = Xdata.drop('index', axis=1).reset_index(drop=True)
    Ydata = Ydata.reset_index(drop=True)
    Ydata['dangerous'] = Ydata['dangerous'].astype(int)
    
    Xdata_, Ydata_, pid_, _, _ = load_datasets(save_dir)
    
    assert Xdata_.astype(Xdata.dtypes).equals(Xdata)
    assert Ydata_.astype(Ydata.dtypes).equals(Ydata)
    assert pid_['pid'].equals(pid)
    
def test_index():
    
    data_dir = '../data'
    data_raw_path = f'{data_dir}/raw/flint_sl_materials/'
    Xdata_, _, _, train_idx_, test_idx_ = build_datasets(data_raw_path)
    
    N = Xdata_.shape[0]
    
    train_size_list = train_idx_.keys()

    for train_size_ in train_size_list:
        train_size_float = float(train_size_)
        for train_split, test_split in zip(train_idx_[train_size_], test_idx_[train_size_]):
            N_train_split = len(train_split)
            N_test_split = len(test_split)

            # Train and test split sum up to the full dataset size
            N_train_split+N_test_split==N

            # Less than 5% of missmatch between expected split train size and actual split train size
            diff = np.abs(N_train_split-N*train_size_float)/N
            assert diff<0.5
    
# Run the checks against the objects prepared above.
# Order matters: test_build calls build_datasets with save_dir='../data/test_dir',
# and test_load then reads from that same directory.
test_build(Xdata, Ydata, pid)
test_load(Xdata, Ydata, pid)
test_index()

In [8]:
# Build the datasets with the packaged utilities and read them back from disk.
# From here on Xdata, Ydata, pid, train_idx and test_idx are the *loaded*
# versions and replace the objects prepared by hand earlier in the notebook.

data_dir = '../data'
save_dir = f'{data_dir}/test_dir'
data_raw_path = f'{data_dir}/raw/flint_sl_materials/'

# Step 1: build from the raw shapefile and save into save_dir.
Xdata, Ydata, pid, train_idx, test_idx = build_datasets(data_raw_path, save_dir=save_dir)

# Step 2: reload what was just saved.
Xdata, Ydata, pid, train_idx, test_idx = load_datasets(save_dir)

In [9]:
# List the keys available in the loaded train-index container; judging by the
# output they are train-size fractions stored as strings.
train_idx.files

['0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9']

In [10]:
# Inspect the entry stored under the '0.1' key: the output shows an object array
# holding one array of row indices per split.
train_idx['0.1']

array([array([  537,   539,   540, ..., 24621, 24622, 24623]),
       array([   48,    51,    59, ..., 26859, 26860, 26861]),
       array([ 2893,  2912,  2919, ..., 26852, 26857, 26862])],
      dtype=object)