## Lecture 27 Notebook: SVM and Parameter Tuning
Duncan Callaway
November 27 2018

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
pd.options.display.max_columns = 100

Let's import the environmental and demographic datasets from CES:

In [3]:
env = pd.read_csv('ces3results_environment.csv')
demog = pd.read_csv('ces3results_demographics.csv')

print('Enviro cols are ', env.columns)
print('Demographics cols are ',demog.columns)

Enviro cols are  Index(['Census Tract', 'Total Population', 'California County', 'ZIP',
       'Nearby City \n(to help approximate location only)', 'Longitude',
       'Latitude', 'CES 3.0 Score', ' CES 3.0 Percentile',
       'CES 3.0 \nPercentile Range', 'SB 535 Disadvantaged Community', 'Ozone',
       'Ozone Pctl', 'PM2.5', 'PM2.5 Pctl', 'Diesel PM', 'Diesel PM Pctl',
       'Drinking Water', 'Drinking Water Pctl', 'Pesticides',
       'Pesticides Pctl', 'Tox. Release', 'Tox. Release Pctl', 'Traffic',
       'Traffic Pctl', 'Cleanup Sites', 'Cleanup Sites Pctl',
       'Groundwater Threats', 'Groundwater Threats Pctl', 'Haz. Waste',
       'Haz. Waste Pctl', 'Imp. Water Bodies', 'Imp. Water Bodies Pctl',
       'Solid Waste', 'Solid Waste Pctl', 'Pollution Burden',
       'Pollution Burden Score', 'Pollution Burden Pctl', 'Asthma',
       'Asthma Pctl', 'Low Birth Weight', 'Low Birth Weight Pctl',
       'Cardiovascular Disease', 'Cardiovascular Disease Pctl', 'Education',
       '

Now merge them...

In [4]:
all = env.merge(demog, left_on='Census Tract', right_on='Census Tract ')

In [5]:
all.head()
all.shape

(8035, 72)

In [6]:
np.mean(all.loc[:,'Imp. Water Bodies']==0)

0.4370877411325451

In [7]:
all = all.dropna()

In [8]:
X = all.loc[:,'Asthma':]
X = X.drop(['Census Tract ', ' CES 3.0 Score', 'CES 3.0 Percentile', ' CES 3.0 \nPercentile Range', 'California \nCounty'], axis = 1);
X.columns

Index(['Asthma', 'Asthma Pctl', 'Low Birth Weight', 'Low Birth Weight Pctl',
       'Cardiovascular Disease', 'Cardiovascular Disease Pctl', 'Education',
       'Education Pctl', 'Linguistic Isolation', 'Linguistic Isolation Pctl',
       'Poverty', 'Poverty Pctl', 'Unemployment', 'Unemployment Pctl',
       'Housing Burden', 'Housing Burden Pctl', 'Pop. Char. ',
       'Pop. Char. Score', 'Pop. Char. Pctl', 'Total Population_y',
       'Children < 10 (%)', 'Pop 11-64 years (%)', 'Elderly > 65 (%)',
       'Hispanic (%)', 'White (%)', 'African American (%)',
       'Native American (%)', 'Asian American (%)', 'Other (%)'],
      dtype='object')

In [9]:
y = all[['Solid Waste', 'Imp. Water Bodies']] 
y_waste = y[['Solid Waste']]
y_water = y[['Imp. Water Bodies']]!=0 

## Predicting whether water bodies are contaminated

In this section we'll run an SVM -- checking different parameter options by cross validation -- to predict whether or not water bodies near each community are contaminated on the basis of their socio-economic metrics. 

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_water, test_size = 0.98)  

In [12]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [13]:
SV_model = SVC()
param_dist = {'C': randint(1, 100),
                'kernel': ['linear', 'poly', 'rbf']
             }
rnd_search = RandomizedSearchCV(SV_model, param_distributions=param_dist, 
                                cv=3, n_iter=4, n_jobs=4)

rnd_search.fit(X_train, y_train['Imp. Water Bodies'])

print(rnd_search.best_score_)
print(rnd_search.best_params_)

KeyboardInterrupt: 

In [None]:
tuned_train_score = rnd_search.score(X_train, y_train)
tuned_test_score = rnd_search.score(X_test, y_test)

print('Train Score: ', tuned_train_score)
print('Test Score: ', tuned_test_score)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_pred = rnd_search.predict(X_test)
confusion_matrix(y_test, y_pred)

## Let's different classifiers

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNC

In [None]:
KNC_model = KNC()
KNC_model.fit(X_train, y_train['Imp. Water Bodies'])
KNC_train_score = KNC_model.score(X_train, y_train)
KNC_test_score = KNC_model.score(X_test, y_test)

print('Train Score: ', KNC_train_score)
print('Test Score: ', KNC_test_score)

y_pred = KNC_model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
KNC_model = KNC()
param_dist = {'n_neighbors': randint(1, 200)
             }

KNC_search = RandomizedSearchCV(KNC_model, param_distributions=param_dist, 
                                cv=3, n_iter=100, n_jobs=4)

KNC_search.fit(X_train, y_train['Imp. Water Bodies'])

print(KNC_search.best_score_)
print(KNC_search.best_params_)

In [None]:
KNC_train_score = KNC_search.score(X_train, y_train)
KNC_test_score = KNC_search.score(X_test, y_test)

print('Train Score: ', KNC_train_score)
print('Test Score: ', KNC_test_score)

y_pred = KNC_search.predict(X_test)
confusion_matrix(y_test, y_pred)

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [None]:
RFC_model = RFC(n_estimators = 100, criterion = 'gini')

RFC_model.fit(X_train, y_train['Imp. Water Bodies'])

In [None]:
RFC_train_score = RFC_model.score(X_train, y_train)
RFC_test_score = RFC_model.score(X_test, y_test)

print('Train Score: ', RFC_train_score)
print('Test Score: ', RFC_test_score)

y_pred = RFC_model.predict(X_test)
confusion_matrix(y_test, y_pred)