In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
df = pd.read_csv('features_for_fit.csv')
df

Unnamed: 0,CBSA,MSA,Employment,GDP,Population,Unemployment,Total
0,10180,"abilene, tx",4.672014,16.444673,1.397860,3.0,670.5
1,10420,"akron, oh",1.819752,11.310636,-0.128197,4.3,1139.5
2,10500,"albany, ga",2.294333,16.066871,-2.434386,4.3,1101.5
3,10540,"albany-lebanon, or",8.949608,25.114448,7.911940,4.3,446.5
4,10580,"albany-schenectady-troy, ny",4.109291,18.285130,0.147426,3.7,825.5
...,...,...,...,...,...,...,...
379,49420,"yakima, wa",6.186625,24.300861,1.240113,7.0,815.0
380,49620,"york-hanover, pa",4.926748,12.935877,1.656400,3.8,856.0
381,49660,"youngstown-warren-boardman, oh-pa",-1.471502,8.218357,-2.317246,5.7,1414.5
382,49700,"yuba city, ca",10.104159,24.611951,3.727698,6.8,614.0


In [3]:
# Target "Emerging" and "Static" based off of top 40 MSA's with best ranking score.
df.loc[df['Total'] < 300, 'Target'] = "Emerging"
df.loc[df['Total'] >300, 'Target'] = "Static"
df

Unnamed: 0,CBSA,MSA,Employment,GDP,Population,Unemployment,Total,Target
0,10180,"abilene, tx",4.672014,16.444673,1.397860,3.0,670.5,Static
1,10420,"akron, oh",1.819752,11.310636,-0.128197,4.3,1139.5,Static
2,10500,"albany, ga",2.294333,16.066871,-2.434386,4.3,1101.5,Static
3,10540,"albany-lebanon, or",8.949608,25.114448,7.911940,4.3,446.5,Static
4,10580,"albany-schenectady-troy, ny",4.109291,18.285130,0.147426,3.7,825.5,Static
...,...,...,...,...,...,...,...,...
379,49420,"yakima, wa",6.186625,24.300861,1.240113,7.0,815.0,Static
380,49620,"york-hanover, pa",4.926748,12.935877,1.656400,3.8,856.0,Static
381,49660,"youngstown-warren-boardman, oh-pa",-1.471502,8.218357,-2.317246,5.7,1414.5,Static
382,49700,"yuba city, ca",10.104159,24.611951,3.727698,6.8,614.0,Static


In [4]:
y = df['Target']
X = df.drop(columns=['CBSA','MSA', 'Total', 'Target'])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 257, 'Emerging': 31})

In [6]:
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=42)

In [7]:
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9791666666666666
[[ 8  1]
 [ 1 86]]
              precision    recall  f1-score   support

    Emerging       0.89      0.89      0.89         9
      Static       0.99      0.99      0.99        87

    accuracy                           0.98        96
   macro avg       0.94      0.94      0.94        96
weighted avg       0.98      0.98      0.98        96



In [8]:
# Create a gradient boosting classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=3,
                                            max_depth=3,
                                            random_state=42)

    # Fit the model
    classifier.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test,
            y_test)))

Learning rate:  0.05
Accuracy score (training): 0.990
Accuracy score (validation): 0.969
Learning rate:  0.1
Accuracy score (training): 1.000
Accuracy score (validation): 0.958
Learning rate:  0.25
Accuracy score (training): 1.000
Accuracy score (validation): 0.969
Learning rate:  0.5
Accuracy score (training): 1.000
Accuracy score (validation): 0.958
Learning rate:  0.75
Accuracy score (training): 1.000
Accuracy score (validation): 0.958
Learning rate:  1
Accuracy score (training): 1.000
Accuracy score (validation): 0.958


In [9]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=3,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X_test)

In [10]:
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

0.96875
[[ 8  1]
 [ 2 85]]
              precision    recall  f1-score   support

    Emerging       0.80      0.89      0.84         9
      Static       0.99      0.98      0.98        87

    accuracy                           0.97        96
   macro avg       0.89      0.93      0.91        96
weighted avg       0.97      0.97      0.97        96



In [11]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [12]:
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X_test)

In [13]:
print(accuracy_score(y_test, forest_predict))
print(confusion_matrix(y_test, forest_predict))
print(classification_report(y_test, forest_predict))

0.96875
[[ 8  1]
 [ 2 85]]
              precision    recall  f1-score   support

    Emerging       0.80      0.89      0.84         9
      Static       0.99      0.98      0.98        87

    accuracy                           0.97        96
   macro avg       0.89      0.93      0.91        96
weighted avg       0.97      0.97      0.97        96



In [14]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3832108657624258, 'Employment'),
 (0.22907188137675114, 'Population'),
 (0.2288012310050099, 'Unemployment'),
 (0.15891602185581316, 'GDP')]

In [15]:
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)

SVC(kernel='linear')

In [16]:
y_pred_svm = model_svm.predict(X_test)
print(accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

0.9583333333333334
[[ 7  2]
 [ 2 85]]
              precision    recall  f1-score   support

    Emerging       0.78      0.78      0.78         9
      Static       0.98      0.98      0.98        87

    accuracy                           0.96        96
   macro avg       0.88      0.88      0.88        96
weighted avg       0.96      0.96      0.96        96



In [17]:
# using SMOTE to over sample the "emerging" cities
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Static': 257, 'Emerging': 257})

In [18]:
model_smote_LR = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)
model_smote_LR.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [19]:
y_pred_smote_LR = model_smote_LR.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred_smote_LR))
print(confusion_matrix(y_test, y_pred_smote_LR))
print(classification_report_imbalanced(y_test, y_pred_smote_LR))

0.9655172413793103
[[ 9  0]
 [ 6 81]]
                   pre       rec       spe        f1       geo       iba       sup

   Emerging       0.60      1.00      0.93      0.75      0.96      0.94         9
     Static       1.00      0.93      1.00      0.96      0.96      0.92        87

avg / total       0.96      0.94      0.99      0.94      0.96      0.93        96



In [20]:
# rerun svm model with smote oversampling
model_svm2 = SVC(kernel='linear')
model_svm2.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [21]:
y_pred_svm2 = model_svm2.predict(X_test)
print(accuracy_score(y_test, y_pred_svm2))
print(confusion_matrix(y_test, y_pred_svm2))
print(classification_report(y_test, y_pred_svm2))

0.9583333333333334
[[ 9  0]
 [ 4 83]]
              precision    recall  f1-score   support

    Emerging       0.69      1.00      0.82         9
      Static       1.00      0.95      0.98        87

    accuracy                           0.96        96
   macro avg       0.85      0.98      0.90        96
weighted avg       0.97      0.96      0.96        96

