In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('features_for_fit10_total.csv')
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Total
0,"Abilene, TX",10.945263,32.378757,3.910378,28.947368,899.0
1,"Akron, OH",7.761940,30.070473,0.063724,32.015000,857.0
2,"Albany, GA",7.061047,29.231972,-4.813001,25.714286,1239.5
3,"Albany-Lebanon, OR",15.685181,52.685485,10.999991,28.358209,547.0
4,"Albany-Schenectady-Troy, NY",9.664928,39.598832,1.067523,31.111111,761.5
...,...,...,...,...,...,...
378,"Yakima, WA",14.634453,47.539475,2.709043,26.829268,802.0
379,"York-Hanover, PA",9.534342,24.927250,3.133806,29.787234,973.0
380,"Youngstown-Warren-Boardman, OH-PA",0.967322,18.783134,-5.089178,27.868852,1343.0
381,"Yuba City, CA",20.879314,36.901149,5.110743,29.411765,678.0


In [3]:
print(df['Employment'].mean())
print(df['GDP'].mean())
print(df['Population'].mean())
print(df['Unemployment'].mean())

13.716340750885117
37.50540560933163
5.656211670506528
29.23073892754569


In [4]:
df['Total'].sort_values(ascending=True).head(40)

54      65.0
63      65.0
130    114.0
19     122.0
284    126.5
84     141.5
64     147.0
315    152.0
251    158.0
245    160.5
74     167.0
32     168.5
17     183.0
260    184.0
144    198.5
89     209.5
41     233.0
169    233.0
332    238.0
270    244.0
244    251.0
154    258.0
307    264.5
15     285.5
277    290.0
225    290.5
233    291.0
140    298.0
190    302.0
344    315.0
86     317.0
76     321.5
341    322.0
374    323.5
306    327.5
159    328.5
195    334.0
16     337.0
337    342.0
255    342.5
Name: Total, dtype: float64

In [5]:
# Target "Emerging" and "Static" based off of top 40 MSA's with best ranking score.
df.loc[df['Total'] < 343, 'Target'] = 'Emerging'
df.loc[df['Total'] > 343, 'Target'] = 'Static'
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Total,Target
0,"Abilene, TX",10.945263,32.378757,3.910378,28.947368,899.0,Static
1,"Akron, OH",7.761940,30.070473,0.063724,32.015000,857.0,Static
2,"Albany, GA",7.061047,29.231972,-4.813001,25.714286,1239.5,Static
3,"Albany-Lebanon, OR",15.685181,52.685485,10.999991,28.358209,547.0,Static
4,"Albany-Schenectady-Troy, NY",9.664928,39.598832,1.067523,31.111111,761.5,Static
...,...,...,...,...,...,...,...
378,"Yakima, WA",14.634453,47.539475,2.709043,26.829268,802.0,Static
379,"York-Hanover, PA",9.534342,24.927250,3.133806,29.787234,973.0,Static
380,"Youngstown-Warren-Boardman, OH-PA",0.967322,18.783134,-5.089178,27.868852,1343.0,Static
381,"Yuba City, CA",20.879314,36.901149,5.110743,29.411765,678.0,Static


In [20]:
df.loc[df['Target'] == 'Emerging']

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Total,Target
15,"Asheville, NC",23.089915,49.712107,8.779371,32.608696,285.5,Emerging
16,"Athens-Clarke County, GA",20.115993,48.901375,10.499951,31.578947,337.0,Emerging
17,"Atlanta-Sandy Springs-Alpharetta, GA",29.497727,57.811298,13.536119,31.578947,183.0,Emerging
19,"Auburn-Opelika, AL",31.334402,55.085479,16.867196,32.692308,122.0,Emerging
32,"Bend, OR",41.152886,99.963782,25.326166,30.508475,168.5,Emerging
41,"Boise City, ID",32.991115,62.718127,21.256389,30.24,233.0,Emerging
54,"Cape Coral-Fort Myers, FL",41.614405,56.665186,24.19667,32.692308,65.0,Emerging
63,"Charleston-North Charleston, SC",33.679292,66.307795,20.177092,32.692308,65.0,Emerging
64,"Charlotte-Concord-Gastonia, NC-SC",33.285144,62.447199,17.188498,31.481481,147.0,Emerging
74,"Coeur d'Alene, ID",25.377724,52.48194,19.336973,32.25,167.0,Emerging


In [6]:
# set x and y data
y = df['Target']
X = df.drop(columns=['MSA', 'Total', 'Target'])

In [7]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 259, 'Emerging': 28})

In [8]:
# instantiate the model a fit using the training sets
model = LogisticRegression(solver='lbfgs', max_iter=50, random_state=42, class_weight={'Emerging':.5})
model.fit(X_train, y_train)

LogisticRegression(class_weight={'Emerging': 0.5}, max_iter=50, random_state=42)

In [9]:
# test the model accuracy
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9791666666666666
[[11  1]
 [ 1 83]]
              precision    recall  f1-score   support

    Emerging       0.92      0.92      0.92        12
      Static       0.99      0.99      0.99        84

    accuracy                           0.98        96
   macro avg       0.95      0.95      0.95        96
weighted avg       0.98      0.98      0.98        96



In [10]:
# import feature set to run predictions on 2024 data
df_2024 = pd.read_csv('2024_ROC_rank_total2.csv')
df_2024.head()

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.01012,282.5,89.0,317.0,344.0,1032.5


In [21]:
df_2024['2024_Unem_ROC'].sort_values()

369    26.521739
13     27.826087
169    29.791667
363    30.416667
207    34.545455
         ...    
202    67.096774
347    67.096774
87     67.096774
172    67.971014
156    74.677419
Name: 2024_Unem_ROC, Length: 383, dtype: float64

In [11]:
print(df_2024['2024_Emp_ROC'].mean())
print(df_2024['2024_GDP_ROC'].mean())
print(df_2024['2024_Pop_ROC'].mean())
print(df_2024['2024_Unem_ROC'].mean())

1.8355239515649782
12.498457627383488
2.8237597911227152
57.3105010344605


In [12]:
df_2024.columns.tolist()

['CBSA',
 'Metropolitan_Area',
 '2024_Pop_ROC',
 '2024_Unem_ROC',
 '2024_Emp_ROC',
 '2024_GDP_ROC',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total']

In [13]:
# set x2 data for logisticRegression prediction
X2 = df_2024.drop(columns=['CBSA',
 'Metropolitan_Area',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total'])
X2.head()

Unnamed: 0,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC
0,3.9,57.291667,-1.245487,17.059092
1,2.6,61.891892,7.734707,10.671246
2,-0.8,60.731707,0.680513,3.427676
3,-0.7,67.058824,2.565748,0.525464
4,0.3,61.621622,-1.425836,4.01012


In [14]:
# reorder columns to match training data columns
X2 = X2[['2024_Emp_ROC','2024_GDP_ROC','2024_Pop_ROC','2024_Unem_ROC']]
X2.head()

Unnamed: 0,2024_Emp_ROC,2024_GDP_ROC,2024_Pop_ROC,2024_Unem_ROC
0,-1.245487,17.059092,3.9,57.291667
1,7.734707,10.671246,2.6,61.891892
2,0.680513,3.427676,-0.8,60.731707
3,2.565748,0.525464,-0.7,67.058824
4,-1.425836,4.01012,0.3,61.621622


In [15]:
# run the logisticRegression model over our 2024 data set
emerging_pred = model.predict(X2)
# added predicted classifiers back into original dataframe
df_2024['Predicted_Classifier_LR'] = emerging_pred
# dispay MSA that our ML model predicted to be emerging in 2024
df_2024.loc[df_2024['Predicted_Classifier_LR']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5,Emerging
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5,Emerging
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5,Emerging
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0,Emerging
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.010120,282.5,89.0,317.0,344.0,1032.5,Emerging
...,...,...,...,...,...,...,...,...,...,...,...,...
378,41620,"Salt Lake City, UT",6.4,43.461538,4.557238,18.504085,58.0,372.0,79.0,55.0,564.0,Emerging
379,42700,"Sebring-Avon Park, FL",3.4,51.272727,2.787827,4.657780,163.0,324.5,148.0,340.0,975.5,Emerging
380,43900,"Spartanburg, SC",4.7,56.428571,-1.597985,17.360000,114.0,257.0,323.0,84.0,778.0,Emerging
381,44300,"State College, PA",4.1,59.285714,7.441246,15.787866,134.0,185.5,32.0,118.0,469.5,Emerging


In [16]:
# use svm to predict emerging
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
emerging_pred_svm = model_svm.predict(X2)
df_2024['Predicted_Classifier_svm'] = emerging_pred_svm
df_2024.loc[df_2024['Predicted_Classifier_svm']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5,Emerging,Emerging
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5,Emerging,Emerging
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5,Emerging,Emerging
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0,Emerging,Emerging
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.010120,282.5,89.0,317.0,344.0,1032.5,Emerging,Emerging
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,41620,"Salt Lake City, UT",6.4,43.461538,4.557238,18.504085,58.0,372.0,79.0,55.0,564.0,Emerging,Emerging
379,42700,"Sebring-Avon Park, FL",3.4,51.272727,2.787827,4.657780,163.0,324.5,148.0,340.0,975.5,Emerging,Emerging
380,43900,"Spartanburg, SC",4.7,56.428571,-1.597985,17.360000,114.0,257.0,323.0,84.0,778.0,Emerging,Emerging
381,44300,"State College, PA",4.1,59.285714,7.441246,15.787866,134.0,185.5,32.0,118.0,469.5,Emerging,Emerging


In [19]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X2)
df_2024['Predicted_Classifier_RFC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_RFC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC


In [18]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=4,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X2)
df_2024['Predicted_Classifier_GBC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_GBC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
