In [155]:
import pandas as pd
import numpy as np
import os

#### Identify Directories

In [156]:
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir,'..'))

dataDir = os.path.abspath(os.path.join(rootDir,'data'))
rawDataDir = os.path.abspath(os.path.join(dataDir,'raw'))
interimDataDir = os.path.abspath(os.path.join(dataDir,'interim'))
finalDataDir = os.path.abspath(os.path.join(dataDir,'final'))


#### Import Data

In [157]:
CSVs = [file for file in os.listdir(rawDataDir) if '.csv' in file]
CSVs.sort()

In [158]:
filename = f"{rawDataDir}/coordinates_and_states_pull{len(CSVs)}.csv"

df = pd.read_csv(filename)
print(df.shape)

(6993, 3)


In [159]:
df.head()

Unnamed: 0,Lat,Long,State
0,-42.65,138.44,
1,-37.24,125.4,
2,-32.67,127.16,
3,-41.03,113.45,
4,-10.11,125.35,


In [160]:
df['State'].value_counts()

None                  3583
Western Australia     1023
Queensland             746
Northern Territory     623
South Australia        559
New South Wales        336
Victoria                87
Tasmania                36
Name: State, dtype: int64

#### Random Forest

In [161]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [162]:
df.head(2)

Unnamed: 0,Lat,Long,State
0,-42.65,138.44,
1,-37.24,125.4,


In [163]:
y = np.array(df['State'])
X = np.array(df[['Lat','Long']])

train, test, train_labels, test_labels = train_test_split(X , y, test_size = 0.3)

In [164]:
# instantiate model
model = RandomForestClassifier(n_estimators=100, 
                               max_features = 'auto',
                               n_jobs=None
                              )

In [165]:
# train model
model.fit(train, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [166]:
# predict on  test data
predictions = model.predict(test)

# Probabilities for each class
probs = model.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 77.74%


In [167]:
# classificatyion report
print(metrics.classification_report(test_labels,predictions))

                    precision    recall  f1-score   support

   New South Wales       0.78      0.85      0.81        98
              None       0.83      0.82      0.83      1094
Northern Territory       0.70      0.68      0.69       176
        Queensland       0.75      0.72      0.74       231
   South Australia       0.76      0.62      0.68       172
          Tasmania       0.44      0.78      0.56         9
          Victoria       0.63      0.85      0.72        20
 Western Australia       0.69      0.79      0.74       298

          accuracy                           0.78      2098
         macro avg       0.70      0.76      0.72      2098
      weighted avg       0.78      0.78      0.78      2098



#### Random Forest Hyperparameter Tuning

In [183]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [169]:
# current model parameters
model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [170]:
#### Random Hyperparameter Search

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 21)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, None], 'min_samples_split': [2, 5, 10, 15, 20], 'min_samples_leaf': [1, 2, 4, 6, 8, 10], 'bootstrap': [True, False]}


In [175]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across a bunch of different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 250, cv = 5, verbose=2, 
                               random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train, train_labels)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 1250 out of 1250 | elapsed: 18.6min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [182]:
'''
{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}
'''

rf_random.best_params_



{'n_estimators': 340,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 15,
 'bootstrap': True}

In [180]:
# save the best estimator we found
best_random = rf_random.best_estimator_

# predict on  test data
predictions = best_random.predict(test)

# Probabilities for each class
probs = best_random.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

Model Accuracy: 78.55%


In [184]:
# classificatyion report
# print(metrics.classification_report(test_labels,predictions))

In [189]:
#### Grid Search CV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40,50,60,70,80,90,100],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2,3, 4, 5],
    'min_samples_split': [2,4,6,8,10, 12],
    'n_estimators': [100, 200, 400,600,800,1000]
}
# Create a based model
rf = rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train, train_labels)

Fitting 3 folds for each of 2880 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  4.0min


KeyboardInterrupt: 

In [190]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [191]:
# save the best estimator we found
best_grid = grid_search.best_estimator_

# predict on  test data
predictions = best_grid.predict(test)

# Probabilities for each class
probs = best_grid.predict_proba(test)[:, 1]

# model accuracy 
accuracy = metrics.accuracy_score(test_labels,predictions)

print("Model Accuracy:",f"{np.round(accuracy*100,2)}%")

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

#### Neural Network