In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('features_for_fit5_total.csv')
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Total
0,"Abilene, TX",4.672014,16.444673,1.397860,28.947368,824.0
1,"Akron, OH",1.819752,11.310636,-0.128197,32.015000,899.0
2,"Albany, GA",2.294333,16.066871,-2.434386,25.714286,1151.5
3,"Albany-Lebanon, OR",8.949608,25.114448,7.911940,28.358209,429.0
4,"Albany-Schenectady-Troy, NY",4.109291,18.285130,0.147426,31.111111,719.5
...,...,...,...,...,...,...
378,"Yakima, WA",6.186625,24.300861,1.240113,26.829268,754.0
379,"York-Hanover, PA",4.926748,12.935877,1.656400,29.787234,840.0
380,"Youngstown-Warren-Boardman, OH-PA",-1.471502,8.218357,-2.317246,27.868852,1335.0
381,"Yuba City, CA",10.104159,24.611951,3.727698,29.411765,461.0


In [3]:
# Target "Emerging" and "Static" based off of top 40 MSA's with best ranking score.
df.loc[df['Total'] <= 335, 'Target'] = 'Emerging'
df.loc[df['Total'] >335, 'Target'] = 'Static'

In [4]:
# set x and y data
y = df['Target']
X = df.drop(columns=['MSA', 'Total', 'Target'])

In [5]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 259, 'Emerging': 28})

In [24]:
# instantiate the model a fit using the training sets
model = LogisticRegression(solver='lbfgs', max_iter=50, random_state=42, class_weight={'Emerging':0.50})
model.fit(X_train, y_train)

LogisticRegression(class_weight={'Emerging': 0.5}, max_iter=50, random_state=42)

In [25]:
# test the model accuracy
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9583333333333334
[[10  2]
 [ 2 82]]
              precision    recall  f1-score   support

    Emerging       0.83      0.83      0.83        12
      Static       0.98      0.98      0.98        84

    accuracy                           0.96        96
   macro avg       0.90      0.90      0.90        96
weighted avg       0.96      0.96      0.96        96



In [18]:
# import feature set to run predictions on 2024 data
df_2024 = pd.read_csv('2024_ROC_rank_total2.csv')
df_2024.head()

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.01012,282.5,89.0,317.0,344.0,1032.5


In [19]:
df_2024.columns.tolist()

['CBSA',
 'Metropolitan_Area',
 '2024_Pop_ROC',
 '2024_Unem_ROC',
 '2024_Emp_ROC',
 '2024_GDP_ROC',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total']

In [20]:
# set x2 data for logisticRegression prediction
X2 = df_2024.drop(columns=['CBSA',
 'Metropolitan_Area',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total'])
X2.head()

Unnamed: 0,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC
0,3.9,57.291667,-1.245487,17.059092
1,2.6,61.891892,7.734707,10.671246
2,-0.8,60.731707,0.680513,3.427676
3,-0.7,67.058824,2.565748,0.525464
4,0.3,61.621622,-1.425836,4.01012


In [21]:
# reorder columns to match training data columns
X2 = X2[['2024_Emp_ROC','2024_GDP_ROC','2024_Pop_ROC','2024_Unem_ROC']]
X2.head()

Unnamed: 0,2024_Emp_ROC,2024_GDP_ROC,2024_Pop_ROC,2024_Unem_ROC
0,-1.245487,17.059092,3.9,57.291667
1,7.734707,10.671246,2.6,61.891892
2,0.680513,3.427676,-0.8,60.731707
3,2.565748,0.525464,-0.7,67.058824
4,-1.425836,4.01012,0.3,61.621622


In [26]:
# run the logisticRegression model over our 2024 data set
emerging_pred = model.predict(X2)
# added predicted classifiers back into original dataframe
df_2024['Predicted_Classifier_LR'] = emerging_pred
# dispay MSA that our ML model predicted to be emerging in 2024
df_2024.loc[df_2024['Predicted_Classifier_LR']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_RFC
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5,Emerging,Static
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5,Emerging,Static
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5,Emerging,Static
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0,Emerging,Static
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.010120,282.5,89.0,317.0,344.0,1032.5,Emerging,Static
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,41620,"Salt Lake City, UT",6.4,43.461538,4.557238,18.504085,58.0,372.0,79.0,55.0,564.0,Emerging,Static
379,42700,"Sebring-Avon Park, FL",3.4,51.272727,2.787827,4.657780,163.0,324.5,148.0,340.0,975.5,Emerging,Static
380,43900,"Spartanburg, SC",4.7,56.428571,-1.597985,17.360000,114.0,257.0,323.0,84.0,778.0,Emerging,Static
381,44300,"State College, PA",4.1,59.285714,7.441246,15.787866,134.0,185.5,32.0,118.0,469.5,Emerging,Static


In [13]:
# use svm to predict emerging
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
emerging_pred_svm = model_svm.predict(X2)
df_2024['Predicted_Classifier_svm'] = emerging_pred_svm
df_2024.loc[df_2024['Predicted_Classifier_svm']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5,Emerging,Emerging
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5,Emerging,Emerging
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5,Emerging,Emerging
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0,Emerging,Emerging
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.010120,282.5,89.0,317.0,344.0,1032.5,Emerging,Emerging
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,41620,"Salt Lake City, UT",6.4,43.461538,4.557238,18.504085,58.0,372.0,79.0,55.0,564.0,Emerging,Emerging
379,42700,"Sebring-Avon Park, FL",3.4,51.272727,2.787827,4.657780,163.0,324.5,148.0,340.0,975.5,Emerging,Emerging
380,43900,"Spartanburg, SC",4.7,56.428571,-1.597985,17.360000,114.0,257.0,323.0,84.0,778.0,Emerging,Emerging
381,44300,"State College, PA",4.1,59.285714,7.441246,15.787866,134.0,185.5,32.0,118.0,469.5,Emerging,Emerging


In [23]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X2)
df_2024['Predicted_Classifier_RFC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_RFC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_RFC
29,22020,"Fargo, ND-MN",9.9,44.736842,9.286568,20.468566,14.0,369.0,14.0,29.0,426.0,Emerging,Emerging
84,45540,"The Villages, FL",17.5,52.857143,18.71458,18.666587,1.0,305.0,1.0,52.0,359.0,Emerging,Emerging
105,33260,"Midland, TX",12.3,49.230769,13.470572,17.93853,3.5,344.5,3.0,72.0,423.0,Emerging,Emerging
157,34060,"Morgantown, WV",4.9,67.058824,10.129259,15.591279,104.5,8.5,11.0,126.0,250.0,Emerging,Emerging
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging,Emerging
182,21420,"Enid, OK",1.8,59.62963,10.871624,-3.325161,228.5,171.5,8.0,375.0,783.0,Emerging,Emerging
272,25220,"Hammond, LA",6.1,48.888889,9.153985,-1.094599,65.5,349.5,15.0,369.0,799.0,Emerging,Emerging
276,29700,"Laredo, TX",5.9,62.1875,11.238739,18.670049,70.5,72.5,5.0,51.0,199.0,Emerging,Emerging
299,27340,"Jacksonville, NC",3.1,61.463415,11.049093,3.723449,176.0,98.5,7.0,346.0,627.5,Emerging,Emerging
318,26380,"Houma-Thibodaux, LA",1.2,59.25,11.095704,-11.760221,252.5,191.5,6.0,384.0,834.0,Emerging,Emerging


In [15]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=4,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X2)
df_2024['Predicted_Classifier_GBC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_GBC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
29,22020,"Fargo, ND-MN",9.9,44.736842,9.286568,20.468566,14.0,369.0,14.0,29.0,426.0,Emerging,Emerging,Emerging,Emerging
84,45540,"The Villages, FL",17.5,52.857143,18.71458,18.666587,1.0,305.0,1.0,52.0,359.0,Emerging,Emerging,Emerging,Emerging
105,33260,"Midland, TX",12.3,49.230769,13.470572,17.93853,3.5,344.5,3.0,72.0,423.0,Emerging,Emerging,Emerging,Emerging
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging,Emerging,Emerging,Emerging
182,21420,"Enid, OK",1.8,59.62963,10.871624,-3.325161,228.5,171.5,8.0,375.0,783.0,Emerging,Emerging,Emerging,Emerging
272,25220,"Hammond, LA",6.1,48.888889,9.153985,-1.094599,65.5,349.5,15.0,369.0,799.0,Emerging,Emerging,Emerging,Emerging
276,29700,"Laredo, TX",5.9,62.1875,11.238739,18.670049,70.5,72.5,5.0,51.0,199.0,Emerging,Emerging,Emerging,Emerging
299,27340,"Jacksonville, NC",3.1,61.463415,11.049093,3.723449,176.0,98.5,7.0,346.0,627.5,Emerging,Emerging,Emerging,Emerging
318,26380,"Houma-Thibodaux, LA",1.2,59.25,11.095704,-11.760221,252.5,191.5,6.0,384.0,834.0,Emerging,Emerging,Emerging,Emerging
327,36220,"Odessa, TX",9.6,58.4375,9.785776,-4.373828,17.5,212.0,13.0,377.0,619.5,Emerging,Emerging,Emerging,Emerging
