In [1]:
import pandas as pd
import numpy as np
import os

#### Identify Directories

In [46]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))


#### Import Data

In [47]:
#CSVs = [file for file in os.listdir(rawDataDir) if ('.csv' and 'geocode') in file]
CSVs = [file for file in os.listdir(rawDataDir) if ('.csv' and 'geocode' and 'US') in file]

CSVs.sort()

filename = f"{rawDataDir}/{CSVs[-1]}"
df = pd.read_csv(filename)

print(filename)
print(df.shape)

C:\Users\MichaelMatosin\Documents\Projects\MLgeo\data\raw/reverse_US_geocode_results_pull3.csv
(2200, 9)


In [48]:
df.head()

Unnamed: 0,gps_lat,gps_long,lat,long,city,state,area,country,state_flag
0,31.78,-94.33,31.90378,-94.3952,Timpson,Texas,Shelby County,US,Texas
1,38.77,-88.01,38.73088,-88.08532,Olney,Illinois,Richland County,US,Illinois
2,28.85,-81.03,28.73972,-81.11506,Geneva,Florida,Seminole County,US,Florida
3,25.86,-96.76,26.10369,-97.16469,South Padre Island,Texas,Cameron County,US,Texas
4,37.23,-97.89,37.28669,-98.02589,Harper,Kansas,Harper County,US,Kansas


#### Random Forest

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [50]:
y = np.array(df['state_flag'])
X = np.array(df[['gps_lat','gps_long']])

train, test, train_labels, test_labels = train_test_split(X , y, test_size = 0.3)

In [51]:
# instantiate model
model = RandomForestClassifier(n_estimators=100, 
                               max_features = 'auto',
                               n_jobs=None
                              )

In [52]:
# train model
model.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [53]:
# predict on  test data
predictions = model.predict(test)

# Probabilities for each class
probs = model.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 93.03%


In [54]:
# classificatyion report
print(metrics.classification_report(test_labels,predictions))

                precision    recall  f1-score   support

       Alabama       1.00      0.88      0.93         8
       Arizona       1.00      0.86      0.92         7
      Arkansas       0.88      0.88      0.88         8
    California       0.97      0.97      0.97        39
      Colorado       0.95      1.00      0.97        18
   Connecticut       0.00      0.00      0.00         0
      Delaware       0.00      0.00      0.00         1
       Florida       0.96      1.00      0.98        22
       Georgia       1.00      1.00      1.00         4
         Idaho       1.00      0.94      0.97        18
      Illinois       0.83      1.00      0.91         5
       Indiana       1.00      0.50      0.67         2
          Iowa       1.00      0.75      0.86         4
        Kansas       0.88      1.00      0.93         7
      Kentucky       0.71      1.00      0.83         5
     Louisiana       0.85      0.85      0.85        13
         Maine       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


#### Random Forest Hyperparameter Tuning

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [15]:
# current model parameters
model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [16]:
#### Random Hyperparameter Search

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 21)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, None], 'min_samples_split': [2, 5, 10, 15, 20], 'min_samples_leaf': [1, 2, 4, 6, 8, 10], 'bootstrap': [True, False]}


In [17]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across a bunch of different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 250, cv = 5, verbose=2, 
                               random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train, train_labels)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed: 33.7min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [18]:
'''
{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}
'''

rf_random.best_params_



{'n_estimators': 120,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [19]:
# save the best estimator we found
best_random = rf_random.best_estimator_

# predict on  test data
predictions = best_random.predict(test)

# Probabilities for each class
probs = best_random.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 74.11%


In [184]:
# classificatyion report
# print(metrics.classification_report(test_labels,predictions))

In [189]:
#### Grid Search CV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40,50,60,70,80,90,100],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2,3, 4, 5],
    'min_samples_split': [2,4,6,8,10, 12],
    'n_estimators': [100, 200, 400,600,800,1000]
}
# Create a based model
rf = rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train, train_labels)

Fitting 3 folds for each of 2880 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.0min


KeyboardInterrupt: 

In [190]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [191]:
# save the best estimator we found
best_grid = grid_search.best_estimator_

# predict on  test data
predictions = best_grid.predict(test)

# Probabilities for each class
probs = best_grid.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

#### Neural Network