In [2]:
import pandas as pd
import numpy as np
import os

#### Identify Directories

In [3]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))


#### Import Data

In [33]:
#CSVs = [file for file in os.listdir(rawDataDir) if ('.csv' and 'geocode') in file]
CSVs = [file for file in os.listdir(rawDataDir) if ('.csv' and 'geocode' and 'US') in file]
CSV_dict = {int(file.split('.csv')[0].split('_pull')[-1]):file for file in CSVs}
max_key = np.max([k for k in CSV_dict.keys()])

# CSVs.sort()

#filename = f"{rawDataDir}/{CSVs[-1]}"
filename = f"{rawDataDir}//{CSV_dict[max_key]}"
df = pd.read_csv(filename)

print(filename)
print(df.shape)

/Users/Michael/Documents/Projects/MLgeo/data/raw//reverse_US_geocode_results_pull10.csv
(6695, 9)


In [87]:
df['state_flag'].value_counts()[:20]

None              1611
Texas              285
California         263
Florida            177
North Carolina     160
Montana            145
Nevada             122
Colorado           115
New Mexico         115
Idaho              106
Arizona            103
Minnesota          103
Louisiana           97
Michigan            96
Wyoming             96
Oregon              91
North Dakota        90
Massachusetts       89
New York            84
South Dakota        75
Name: state_flag, dtype: int64

#### Random Forest

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [89]:
y = np.array(df['state_flag'])
X = np.array(df[['gps_lat','gps_long']])

train, test, train_labels, test_labels = train_test_split(X , y, test_size = 0.3)

In [90]:
# instantiate model
model = RandomForestClassifier(n_estimators=100, 
                               max_features = 'auto',
                               n_jobs=None
                              )

In [91]:
# train model
model.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [92]:
# predict on  test data
predictions = model.predict(test)

# Probabilities for each class
probs = model.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 93.21%


In [93]:
# classificatyion report
print(metrics.classification_report(test_labels,predictions))

                precision    recall  f1-score   support

       Alabama       0.85      0.79      0.81        14
       Arizona       0.89      0.97      0.93        35
      Arkansas       1.00      0.70      0.82        10
    California       0.94      0.94      0.94        77
      Colorado       0.97      0.89      0.93        37
   Connecticut       0.50      0.50      0.50         2
      Delaware       0.00      0.00      0.00         1
       Florida       0.98      0.93      0.96        58
       Georgia       0.83      0.88      0.86        17
         Idaho       0.83      0.91      0.87        32
      Illinois       0.81      1.00      0.90        13
       Indiana       0.78      0.88      0.82         8
          Iowa       1.00      0.94      0.97        17
        Kansas       0.91      1.00      0.95        20
      Kentucky       0.60      0.60      0.60        10
     Louisiana       0.96      1.00      0.98        27
         Maine       1.00      0.75      0.86  

  'precision', 'predicted', average, warn_for)


#### Random Forest Hyperparameter Tuning

In [77]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [78]:
# current model parameters
model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [79]:
#### Random Hyperparameter Search

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 21)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, None], 'min_samples_split': [2, 5, 10, 15, 20], 'min_samples_leaf': [1, 2, 4, 6, 8, 10], 'bootstrap': [True, False]}


In [80]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across a bunch of different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 250, cv = 5, verbose=2, 
                               random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train, train_labels)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed: 23.1min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [81]:
'''
{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}
'''

rf_random.best_params_



{'n_estimators': 780,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [82]:
# save the best estimator we found
best_random = rf_random.best_estimator_

# predict on  test data
predictions = best_random.predict(test)

# Probabilities for each class
probs = best_random.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 91.56%


In [83]:
# classificatyion report
# print(metrics.classification_report(test_labels,predictions))

In [None]:
#### Grid Search CV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40,50,60,70,80,90,100],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2,3, 4, 5],
    'min_samples_split': [2,4,6,8,10, 12],
    'n_estimators': [100, 200, 400,600,800,1000]
}
# Create a based model
rf = rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train, train_labels)

In [None]:
grid_search.best_params_

In [None]:
# save the best estimator we found
best_grid = grid_search.best_estimator_

# predict on  test data
predictions = best_grid.predict(test)

# Probabilities for each class
probs = best_grid.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

#### Neural Network