In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('features_for_fit5_ufix.csv')
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment
0,"Abilene, TX",4.672014,16.444673,1.397860,3.0
1,"Akron, OH",1.819752,11.310636,-0.128197,4.3
2,"Albany, GA",2.294333,16.066871,-2.434386,4.3
3,"Albany-Lebanon, OR",8.949608,25.114448,7.911940,4.3
4,"Albany-Schenectady-Troy, NY",4.109291,18.285130,0.147426,3.7
...,...,...,...,...,...
378,"Yakima, WA",6.186625,24.300861,1.240113,7.0
379,"York-Hanover, PA",4.926748,12.935877,1.656400,3.8
380,"Youngstown-Warren-Boardman, OH-PA",-1.471502,8.218357,-2.317246,5.7
381,"Yuba City, CA",10.104159,24.611951,3.727698,6.8


In [3]:
print(df['Employment'].mean())
print(df['GDP'].mean())
print(df['Unemployment'].mean())

5.569363228477806
15.902238680955618
3.8827676240208877


In [4]:
# Target "Emerging" and "Static" based off of top 40 population ROC
df.loc[df['Population'] > 7.25, 'Target'] = "Emerging"
df.loc[df['Population'] < 7.25, 'Target'] = "Static"
df

Unnamed: 0,MSA,Employment,GDP,Population,Unemployment,Target
0,"Abilene, TX",4.672014,16.444673,1.397860,3.0,Static
1,"Akron, OH",1.819752,11.310636,-0.128197,4.3,Static
2,"Albany, GA",2.294333,16.066871,-2.434386,4.3,Static
3,"Albany-Lebanon, OR",8.949608,25.114448,7.911940,4.3,Emerging
4,"Albany-Schenectady-Troy, NY",4.109291,18.285130,0.147426,3.7,Static
...,...,...,...,...,...,...
378,"Yakima, WA",6.186625,24.300861,1.240113,7.0,Static
379,"York-Hanover, PA",4.926748,12.935877,1.656400,3.8,Static
380,"Youngstown-Warren-Boardman, OH-PA",-1.471502,8.218357,-2.317246,5.7,Static
381,"Yuba City, CA",10.104159,24.611951,3.727698,6.8,Static


In [5]:
y = df['Target']
X = df.drop(columns=['MSA', 'Population', 'Target'])
X

Unnamed: 0,Employment,GDP,Unemployment
0,4.672014,16.444673,3.0
1,1.819752,11.310636,4.3
2,2.294333,16.066871,4.3
3,8.949608,25.114448,4.3
4,4.109291,18.285130,3.7
...,...,...,...
378,6.186625,24.300861,7.0
379,4.926748,12.935877,3.8
380,-1.471502,8.218357,5.7
381,10.104159,24.611951,6.8


In [6]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 255, 'Emerging': 32})

In [7]:
# instantiate the model a fit using the training sets
model = LogisticRegression(solver='lbfgs', max_iter=50, random_state=42, class_weight={'Emerging':1.75})
model.fit(X_train, y_train)

LogisticRegression(class_weight={'Emerging': 1.75}, max_iter=50,
                   random_state=42)

In [8]:
# test the model accuracy
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9270833333333334
[[ 4  4]
 [ 3 85]]
              precision    recall  f1-score   support

    Emerging       0.57      0.50      0.53         8
      Static       0.96      0.97      0.96        88

    accuracy                           0.93        96
   macro avg       0.76      0.73      0.75        96
weighted avg       0.92      0.93      0.92        96



In [9]:
# import feature set to run predictions on 2024 data
df_2024 = pd.read_csv('2024_ROC_rank_total.csv')
df_2024.head()

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total
0,10180,"Abilene, TX",2.0,1.15,3.040151,8.072256,217.5,38.0,134.0,297.0,686.5
1,10500,"Albany, GA",-3.2,2.58,0.089023,7.19908,370.0,349.0,260.0,312.0,1291.0
2,10420,"Akron, OH",0.2,1.2,1.238754,12.475028,286.5,76.5,214.0,201.0,778.0
3,10540,"Albany-Lebanon, OR",3.9,2.05,-1.245487,17.059092,142.0,304.5,312.0,88.0,846.5
4,10580,"Albany-Schenectady-Troy, NY",1.0,1.17,1.270928,14.602271,257.5,53.0,211.0,155.0,676.5


In [10]:
print(df_2024['2024_Emp_ROC'].mean())
print(df_2024['2024_GDP_ROC'].mean())
print(df_2024['2024_Unem_ROC'].mean())

1.8355239515649788
12.498457627383495
1.6673368146214056


In [11]:
df_2024.columns.tolist()

['CBSA',
 'Metropolitan_Area',
 '2024_Pop_ROC',
 '2024_Unem_ROC',
 '2024_Emp_ROC',
 '2024_GDP_ROC',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total']

In [12]:
# set x2 data for logisticRegression prediction
X2 = df_2024.drop(columns=['CBSA',
 'Metropolitan_Area',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 '2024_Pop_ROC',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total'])
X2.head()

Unnamed: 0,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC
0,1.15,3.040151,8.072256
1,2.58,0.089023,7.19908
2,1.2,1.238754,12.475028
3,2.05,-1.245487,17.059092
4,1.17,1.270928,14.602271


In [13]:
# reorder columns to match training data columns
X2 = X2[['2024_Emp_ROC','2024_GDP_ROC','2024_Unem_ROC']]
X2.head()

Unnamed: 0,2024_Emp_ROC,2024_GDP_ROC,2024_Unem_ROC
0,3.040151,8.072256,1.15
1,0.089023,7.19908,2.58
2,1.238754,12.475028,1.2
3,-1.245487,17.059092,2.05
4,1.270928,14.602271,1.17


In [14]:
# run the logisticRegression model over our 2024 data set
emerging_pred = model.predict(X2)
# added predicted classifiers back into original dataframe
df_2024['Predicted_Classifier_LR'] = emerging_pred
# dispay MSA that our ML model predicted to be emerging in 2024
df_2024.loc[df_2024['Predicted_Classifier_LR']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging
228,33260,"Midland, TX",12.3,1.32,13.470572,17.93853,3.5,149.5,3.0,72.0,228.0,Emerging
344,45540,"The Villages, FL",17.5,2.64,18.71458,18.666587,1.0,351.0,1.0,52.0,405.0,Emerging


In [15]:
# use svm to predict emerging
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
emerging_pred_svm = model_svm.predict(X2)
df_2024['Predicted_Classifier_svm'] = emerging_pred_svm
df_2024.loc[df_2024['Predicted_Classifier_svm']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging
228,33260,"Midland, TX",12.3,1.32,13.470572,17.93853,3.5,149.5,3.0,72.0,228.0,Emerging,Emerging
344,45540,"The Villages, FL",17.5,2.64,18.71458,18.666587,1.0,351.0,1.0,52.0,405.0,Emerging,Emerging


In [16]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42, class_weight={'Emerging':1.75})
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X2)
df_2024['Predicted_Classifier_RFC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_RFC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging,Emerging


In [17]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=3,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X2)
df_2024['Predicted_Classifier_GBC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_GBC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging,Emerging,Emerging
