In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('features_for_fit5.csv')
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment
0,"Abilene, TX",4.672014,16.444673,1.397860,28.947368
1,"Akron, OH",1.819752,11.310636,-0.128197,32.015000
2,"Albany, GA",2.294333,16.066871,-2.434386,25.714286
3,"Albany-Lebanon, OR",8.949608,25.114448,7.911940,28.358209
4,"Albany-Schenectady-Troy, NY",4.109291,18.285130,0.147426,31.111111
...,...,...,...,...,...
378,"Yakima, WA",6.186625,24.300861,1.240113,26.829268
379,"York-Hanover, PA",4.926748,12.935877,1.656400,29.787234
380,"Youngstown-Warren-Boardman, OH-PA",-1.471502,8.218357,-2.317246,27.868852
381,"Yuba City, CA",10.104159,24.611951,3.727698,29.411765


In [3]:
print(df['Employment'].mean())
print(df['GDP'].mean())
print(df['Unemployment'].mean())

5.569363228477806
15.902238680955618
29.23073892754569


In [4]:
# Target "Emerging" and "Static" based off of top 40 population ROC
df.loc[df['Population'] > 7.25, 'Target'] = "Emerging"
df.loc[df['Population'] < 7.25, 'Target'] = "Static"
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Target
0,"Abilene, TX",4.672014,16.444673,1.397860,28.947368,Static
1,"Akron, OH",1.819752,11.310636,-0.128197,32.015000,Static
2,"Albany, GA",2.294333,16.066871,-2.434386,25.714286,Static
3,"Albany-Lebanon, OR",8.949608,25.114448,7.911940,28.358209,Emerging
4,"Albany-Schenectady-Troy, NY",4.109291,18.285130,0.147426,31.111111,Static
...,...,...,...,...,...,...
378,"Yakima, WA",6.186625,24.300861,1.240113,26.829268,Static
379,"York-Hanover, PA",4.926748,12.935877,1.656400,29.787234,Static
380,"Youngstown-Warren-Boardman, OH-PA",-1.471502,8.218357,-2.317246,27.868852,Static
381,"Yuba City, CA",10.104159,24.611951,3.727698,29.411765,Static


In [5]:
y = df['Target']
X = df.drop(columns=['MSA', 'Population', 'Target'])
X

Unnamed: 0,Employment,GDP,Unemployment
0,4.672014,16.444673,28.947368
1,1.819752,11.310636,32.015000
2,2.294333,16.066871,25.714286
3,8.949608,25.114448,28.358209
4,4.109291,18.285130,31.111111
...,...,...,...
378,6.186625,24.300861,26.829268
379,4.926748,12.935877,29.787234
380,-1.471502,8.218357,27.868852
381,10.104159,24.611951,29.411765


In [6]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 255, 'Emerging': 32})

In [7]:
# instantiate the model a fit using the training sets
model = LogisticRegression(solver='lbfgs', max_iter=50, random_state=42, class_weight={'Emerging':1.75})
model.fit(X_train, y_train)

LogisticRegression(class_weight={'Emerging': 1.75}, max_iter=50,
                   random_state=42)

In [8]:
# test the model accuracy
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9270833333333334
[[ 4  4]
 [ 3 85]]
              precision    recall  f1-score   support

    Emerging       0.57      0.50      0.53         8
      Static       0.96      0.97      0.96        88

    accuracy                           0.93        96
   macro avg       0.76      0.73      0.75        96
weighted avg       0.92      0.93      0.92        96



In [9]:
# import feature set to run predictions on 2024 data
df_2024 = pd.read_csv('2024_ROC_rank_total2.csv')
df_2024.head()

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total
0,10540,"Albany-Lebanon, OR",3.9,57.291667,-1.245487,17.059092,142.0,236.5,312.0,88.0,778.5
1,12940,"Baton Rouge, LA",2.6,61.891892,7.734707,10.671246,194.5,82.0,28.0,255.0,559.5
2,16060,"Carbondale-Marion, IL",-0.8,60.731707,0.680513,3.427676,322.0,118.5,243.0,347.0,1030.5
3,21060,"Elizabethtown-Fort Knox, KY",-0.7,67.058824,2.565748,0.525464,317.5,8.5,159.0,362.0,847.0
4,22900,"Fort Smith, AR-OK",0.3,61.621622,-1.425836,4.01012,282.5,89.0,317.0,344.0,1032.5


In [10]:
print(df_2024['2024_Emp_ROC'].mean())
print(df_2024['2024_GDP_ROC'].mean())
print(df_2024['2024_Unem_ROC'].mean())

1.8355239515649782
12.498457627383488
57.3105010344605


In [11]:
df_2024.columns.tolist()

['CBSA',
 'Metropolitan_Area',
 '2024_Pop_ROC',
 '2024_Unem_ROC',
 '2024_Emp_ROC',
 '2024_GDP_ROC',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total']

In [12]:
# set x2 data for logisticRegression prediction
X2 = df_2024.drop(columns=['CBSA',
 'Metropolitan_Area',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 '2024_Pop_ROC',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total'])
X2.head()

Unnamed: 0,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC
0,57.291667,-1.245487,17.059092
1,61.891892,7.734707,10.671246
2,60.731707,0.680513,3.427676
3,67.058824,2.565748,0.525464
4,61.621622,-1.425836,4.01012


In [13]:
# reorder columns to match training data columns
X2 = X2[['2024_Emp_ROC','2024_GDP_ROC','2024_Unem_ROC']]
X2.head()

Unnamed: 0,2024_Emp_ROC,2024_GDP_ROC,2024_Unem_ROC
0,-1.245487,17.059092,57.291667
1,7.734707,10.671246,61.891892
2,0.680513,3.427676,60.731707
3,2.565748,0.525464,67.058824
4,-1.425836,4.01012,61.621622


In [14]:
# run the logisticRegression model over our 2024 data set
emerging_pred = model.predict(X2)
# added predicted classifiers back into original dataframe
df_2024['Predicted_Classifier_LR'] = emerging_pred
# dispay MSA that our ML model predicted to be emerging in 2024
df_2024.loc[df_2024['Predicted_Classifier_LR']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR
84,45540,"The Villages, FL",17.5,52.857143,18.71458,18.666587,1.0,305.0,1.0,52.0,359.0,Emerging
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging


In [15]:
# use svm to predict emerging
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
emerging_pred_svm = model_svm.predict(X2)
df_2024['Predicted_Classifier_svm'] = emerging_pred_svm
df_2024.loc[df_2024['Predicted_Classifier_svm']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm
84,45540,"The Villages, FL",17.5,52.857143,18.71458,18.666587,1.0,305.0,1.0,52.0,359.0,Emerging,Emerging
105,33260,"Midland, TX",12.3,49.230769,13.470572,17.93853,3.5,344.5,3.0,72.0,423.0,Static,Emerging
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging,Emerging


In [16]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42, class_weight={'Emerging':1.75})
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X2)
df_2024['Predicted_Classifier_RFC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_RFC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging,Emerging,Emerging


In [17]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=3,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X2)
df_2024['Predicted_Classifier_GBC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_GBC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
173,13900,"Bismarck, ND",10.5,36.190476,13.609255,22.560386,9.0,378.0,2.0,18.0,407.0,Emerging,Emerging,Emerging,Emerging
