In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
df = pd.read_csv('features_for_fit.csv')
df

Unnamed: 0,CBSA,MSA,Employment,GDP,Population,Unemployment,Total
0,10180,"abilene, tx",4.672014,16.444673,1.397860,3.0,670.5
1,10420,"akron, oh",1.819752,11.310636,-0.128197,4.3,1139.5
2,10500,"albany, ga",2.294333,16.066871,-2.434386,4.3,1101.5
3,10540,"albany-lebanon, or",8.949608,25.114448,7.911940,4.3,446.5
4,10580,"albany-schenectady-troy, ny",4.109291,18.285130,0.147426,3.7,825.5
...,...,...,...,...,...,...,...
378,49420,"yakima, wa",6.186625,24.300861,1.240113,7.0,815.0
379,49620,"york-hanover, pa",4.926748,12.935877,1.656400,3.8,856.0
380,49660,"youngstown-warren-boardman, oh-pa",-1.471502,8.218357,-2.317246,5.7,1414.5
381,49700,"yuba city, ca",10.104159,24.611951,3.727698,6.8,614.0


In [3]:
# Target "Emerging" and "Static" based off of top 40 MSA's with best ranking score.
df.loc[df['Total'] < 300, 'Target'] = 'Emerging'
df.loc[df['Total'] >300, 'Target'] = 'Static'

In [4]:
# set x and y data
y = df['Target']
X = df.drop(columns=['CBSA','MSA', 'Total', 'Target'])

In [5]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
Counter(y_train)

Counter({'Static': 254, 'Emerging': 33})

In [24]:
# instantiate the model a fit using the training sets
model = LogisticRegression(solver='lbfgs', max_iter=50, random_state=42, class_weight={'Emerging':0.5})
model.fit(X_train, y_train)

LogisticRegression(class_weight={'Emerging': 0.5}, max_iter=50, random_state=42)

In [25]:
# test the model accuracy
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

1.0
[[ 7  0]
 [ 0 89]]
              precision    recall  f1-score   support

    Emerging       1.00      1.00      1.00         7
      Static       1.00      1.00      1.00        89

    accuracy                           1.00        96
   macro avg       1.00      1.00      1.00        96
weighted avg       1.00      1.00      1.00        96



In [8]:
# import feature set to run predictions on 2024 data
df_2024 = pd.read_csv('2024_ROC_rank_total.csv')
df_2024.head()

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total
0,10180,"Abilene, TX",2.0,1.15,3.040151,8.072256,217.5,38.0,134.0,297.0,686.5
1,10500,"Albany, GA",-3.2,2.58,0.089023,7.19908,370.0,349.0,260.0,312.0,1291.0
2,10420,"Akron, OH",0.2,1.2,1.238754,12.475028,286.5,76.5,214.0,201.0,778.0
3,10540,"Albany-Lebanon, OR",3.9,2.05,-1.245487,17.059092,142.0,304.5,312.0,88.0,846.5
4,10580,"Albany-Schenectady-Troy, NY",1.0,1.17,1.270928,14.602271,257.5,53.0,211.0,155.0,676.5


In [9]:
df_2024.columns.tolist()

['CBSA',
 'Metropolitan_Area',
 '2024_Pop_ROC',
 '2024_Unem_ROC',
 '2024_Emp_ROC',
 '2024_GDP_ROC',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total']

In [10]:
# set x2 data for logisticRegression prediction
X2 = df_2024.drop(columns=['CBSA',
 'Metropolitan_Area',
 'Pop_ROC_Rank',
 'Unem_ROC_Rank',
 'Emp_ROC_Rank',
 'GDP_ROC_Rank',
 'Rank_Total'])
X2.head()

Unnamed: 0,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC
0,2.0,1.15,3.040151,8.072256
1,-3.2,2.58,0.089023,7.19908
2,0.2,1.2,1.238754,12.475028
3,3.9,2.05,-1.245487,17.059092
4,1.0,1.17,1.270928,14.602271


In [11]:
# reorder columns to match training data columns
X2 = X2[['2024_Emp_ROC','2024_GDP_ROC','2024_Pop_ROC','2024_Unem_ROC']]
X2.head()

Unnamed: 0,2024_Emp_ROC,2024_GDP_ROC,2024_Pop_ROC,2024_Unem_ROC
0,3.040151,8.072256,2.0,1.15
1,0.089023,7.19908,-3.2,2.58
2,1.238754,12.475028,0.2,1.2
3,-1.245487,17.059092,3.9,2.05
4,1.270928,14.602271,1.0,1.17


In [26]:
# run the logisticRegression model over our 2024 data set
emerging_pred = model.predict(X2)
# added predicted classifiers back into original dataframe
df_2024['Predicted_Classifier_LR'] = emerging_pred
# dispay MSA that our ML model predicted to be emerging in 2024
df_2024.loc[df_2024['Predicted_Classifier_LR']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
11,11180,"Ames, IA",4.9,0.82,3.52056,18.096775,104.5,1.0,110.0,68.0,283.5,Emerging,Emerging,Static,Static
17,12220,"Auburn-Opelika, AL",9.3,1.27,7.461697,20.463152,20.5,114.5,30.0,30.0,195.0,Emerging,Emerging,Static,Static
22,12420,"Austin-Round Rock-Georgetown, TX",12.4,1.41,7.740261,23.695131,2.0,185.5,27.0,13.0,227.5,Emerging,Emerging,Emerging,Emerging
31,13460,"Bend, OR",10.5,1.61,3.452002,30.132628,9.0,244.5,117.0,4.0,374.5,Emerging,Emerging,Emerging,Emerging
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging,Emerging,Emerging
52,15980,"Cape Coral-Fort Myers, FL",11.5,1.27,4.772877,20.047516,5.0,114.5,75.0,33.0,227.5,Emerging,Emerging,Static,Static
58,16220,"Casper, WY",4.1,1.14,10.334702,14.395404,134.0,34.5,9.0,158.0,335.5,Emerging,Emerging,Static,Static
63,16700,"Charleston-North Charleston, SC",9.8,1.27,5.161834,20.54105,15.0,114.5,67.0,26.0,222.5,Emerging,Emerging,Static,Static
65,16740,"Charlotte-Concord-Gastonia, NC-SC",8.2,1.47,4.662991,20.556409,35.0,203.0,76.0,25.0,339.0,Emerging,Emerging,Static,Static
74,17660,"Coeur d'Alene, ID",7.5,1.2,3.779303,17.598315,42.5,76.5,102.0,80.0,301.0,Emerging,Emerging,Static,Static


In [13]:
# use svm to predict emerging
# create SVM classifier
model_svm = SVC(kernel='linear')
model_svm.fit(X_train, y_train)
emerging_pred_svm = model_svm.predict(X2)
df_2024['Predicted_Classifier_svm'] = emerging_pred_svm
df_2024.loc[df_2024['Predicted_Classifier_svm']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm
9,11100,"Amarillo, TX",3.0,1.40,6.122966,11.424719,180.0,176.5,51.0,239.0,646.5,Static,Emerging
10,11260,"Anchorage, AK",3.3,1.40,7.444749,8.052706,167.5,175.0,31.0,299.0,672.5,Static,Emerging
11,11180,"Ames, IA",4.9,0.82,3.520560,18.096775,104.5,1.0,110.0,68.0,283.5,Emerging,Emerging
12,11460,"Ann Arbor, MI",5.0,1.15,-1.959702,17.030465,97.5,38.0,331.0,89.0,555.5,Static,Emerging
13,11540,"Appleton, WI",3.0,1.00,1.509336,17.355080,180.0,2.5,201.0,85.0,468.5,Static,Emerging
...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,47900,"Washington-Arlington-Alexandria, DC-VA-MD-WV",6.1,1.29,4.069063,12.284865,65.5,129.0,91.0,207.0,492.5,Emerging,Emerging
370,48300,"Wenatchee, WA",4.3,1.78,5.227460,17.476622,129.5,273.0,66.0,82.0,550.5,Static,Emerging
371,48620,"Wichita, KS",1.6,1.18,0.698957,18.222154,239.0,60.0,242.0,63.0,604.0,Static,Emerging
374,48900,"Wilmington, NC",8.4,1.52,3.746677,14.671400,32.5,226.0,104.0,153.0,515.5,Emerging,Emerging


In [14]:
# Creating a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42, class_weight={'Emerging':1.75})
rf_model.fit(X_train, y_train)
forest_predict = rf_model.predict(X2)
df_2024['Predicted_Classifier_RFC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_RFC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC
22,12420,"Austin-Round Rock-Georgetown, TX",12.4,1.41,7.740261,23.695131,2.0,185.5,27.0,13.0,227.5,Emerging,Emerging,Emerging
31,13460,"Bend, OR",10.5,1.61,3.452002,30.132628,9.0,244.5,117.0,4.0,374.5,Emerging,Emerging,Emerging
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging,Emerging
96,19780,"Des Moines-West Des Moines, IA",7.6,1.17,4.89686,24.355305,40.5,53.0,72.0,9.0,174.5,Emerging,Emerging,Emerging
115,22020,"Fargo, ND-MN",9.9,1.05,9.286568,20.468566,14.0,10.0,14.0,29.0,67.0,Emerging,Emerging,Emerging
118,22220,"Fayetteville-Springdale-Rogers, AR",9.7,1.25,5.977,23.579081,16.0,108.0,54.0,14.0,192.0,Emerging,Emerging,Emerging
124,22660,"Fort Collins, CO",9.2,1.33,4.110821,22.2493,22.5,157.5,89.0,19.0,288.0,Emerging,Emerging,Emerging
140,24540,"Greeley, CO",11.3,1.25,5.772005,27.541953,6.0,108.0,57.0,5.0,176.0,Emerging,Emerging,Emerging
195,29700,"Laredo, TX",5.9,1.21,11.238739,18.670049,70.5,85.5,5.0,51.0,212.0,Emerging,Emerging,Emerging
228,33260,"Midland, TX",12.3,1.32,13.470572,17.93853,3.5,149.5,3.0,72.0,228.0,Emerging,Emerging,Emerging


In [15]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=0.25,
                                        max_features=4,
                                        max_depth=3,
                                        random_state=42)

# Fit the model
classifier.fit(X_train, y_train)

# Make Prediction
predictions = classifier.predict(X2)
df_2024['Predicted_Classifier_GBC'] = forest_predict
df_2024.loc[df_2024['Predicted_Classifier_GBC']=='Emerging']

Unnamed: 0,CBSA,Metropolitan_Area,2024_Pop_ROC,2024_Unem_ROC,2024_Emp_ROC,2024_GDP_ROC,Pop_ROC_Rank,Unem_ROC_Rank,Emp_ROC_Rank,GDP_ROC_Rank,Rank_Total,Predicted_Classifier_LR,Predicted_Classifier_svm,Predicted_Classifier_RFC,Predicted_Classifier_GBC
22,12420,"Austin-Round Rock-Georgetown, TX",12.4,1.41,7.740261,23.695131,2.0,185.5,27.0,13.0,227.5,Emerging,Emerging,Emerging,Emerging
31,13460,"Bend, OR",10.5,1.61,3.452002,30.132628,9.0,244.5,117.0,4.0,374.5,Emerging,Emerging,Emerging,Emerging
36,13900,"Bismarck, ND",10.5,1.34,13.609255,22.560386,9.0,160.0,2.0,18.0,189.0,Emerging,Emerging,Emerging,Emerging
96,19780,"Des Moines-West Des Moines, IA",7.6,1.17,4.89686,24.355305,40.5,53.0,72.0,9.0,174.5,Emerging,Emerging,Emerging,Emerging
115,22020,"Fargo, ND-MN",9.9,1.05,9.286568,20.468566,14.0,10.0,14.0,29.0,67.0,Emerging,Emerging,Emerging,Emerging
118,22220,"Fayetteville-Springdale-Rogers, AR",9.7,1.25,5.977,23.579081,16.0,108.0,54.0,14.0,192.0,Emerging,Emerging,Emerging,Emerging
124,22660,"Fort Collins, CO",9.2,1.33,4.110821,22.2493,22.5,157.5,89.0,19.0,288.0,Emerging,Emerging,Emerging,Emerging
140,24540,"Greeley, CO",11.3,1.25,5.772005,27.541953,6.0,108.0,57.0,5.0,176.0,Emerging,Emerging,Emerging,Emerging
195,29700,"Laredo, TX",5.9,1.21,11.238739,18.670049,70.5,85.5,5.0,51.0,212.0,Emerging,Emerging,Emerging,Emerging
228,33260,"Midland, TX",12.3,1.32,13.470572,17.93853,3.5,149.5,3.0,72.0,228.0,Emerging,Emerging,Emerging,Emerging
