In [1]:
import os

import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression

In [2]:
# Connection URI: read from the MONGODB_URI environment variable when it is set, so
# real credentials never have to be typed into (and saved with) the notebook.
# The placeholder URI is only a fallback and must be filled in before it can connect.
MONGODB_URI = os.environ.get(
    'MONGODB_URI',
    'mongodb+srv://<username>:<password>@cluster0.l3pqt.mongodb.net/MSA?retryWrites=true&w=majority',
)
client = MongoClient(MONGODB_URI)
# Select database
db = client['MSA']
# Select the collection within the database (here the raw unemployment data; see the
# collection list printed near the end of the notebook for the other collections)
unemployment = db.unemployment_raw
# Convert entire collection to a Pandas dataframe (one row per document)
df = pd.DataFrame(list(unemployment.find()))
df

Unnamed: 0,_id,2015,2016,2017,2018,2019,CBSA Code,Metropolitan area
0,602ccd39d9e65148e4b7cf9c,3.8,3.5,3.1,2.8,2.7,10180,"Abilene, TX"
1,602ccd39d9e65148e4b7cf9d,5,4.6,4.1,3.6,3.4,10420,"Akron, OH"
2,602ccd39d9e65148e4b7cf9e,7,6.5,5.7,5.3,5.2,10500,"Albany, GA"
3,602ccd39d9e65148e4b7cf9f,6.7,6.2,5.5,4.9,4.8,10540,"Albany, OR"
4,602ccd39d9e65148e4b7cfa0,4.5,4.2,3.7,3.3,3.1,10580,"Albany-Schenectady-Troy, NY"
...,...,...,...,...,...,...,...,...
378,602ccd39d9e65148e4b7d116,8.2,7.8,7.2,6.4,6,49420,"Yakima, WA"
379,602ccd39d9e65148e4b7d117,4.7,4.5,4,3.5,3.3,49620,"York-Hanover, PA"
380,602ccd39d9e65148e4b7d118,6.1,5.8,5.1,4.6,4.4,49660,"Youngstown-Warren-Boardman, OH-PA"
381,602ccd39d9e65148e4b7d119,10.2,9.5,8.5,7.6,7.2,49700,"Yuba City, CA"


In [3]:
# Tidy the loaded frame in one chained expression. Each step returns a new DataFrame,
# so the frame displayed by the previous cell is not mutated in place.
df = (
    df
    # drop _id column
    .drop(columns='_id')
    # rearrange columns: identifiers first, then the yearly values
    .loc[:, ['CBSA Code', 'Metropolitan area', '2015', '2016', '2017', '2018', '2019']]
    # take out spaces in column names
    .rename(columns={'CBSA Code': 'CBSA_Code', 'Metropolitan area': 'Metropolitan_Area'})
)

df

Unnamed: 0,CBSA_Code,Metropolitan_Area,2015,2016,2017,2018,2019
0,10180,"Abilene, TX",3.8,3.5,3.1,2.8,2.7
1,10420,"Akron, OH",5,4.6,4.1,3.6,3.4
2,10500,"Albany, GA",7,6.5,5.7,5.3,5.2
3,10540,"Albany, OR",6.7,6.2,5.5,4.9,4.8
4,10580,"Albany-Schenectady-Troy, NY",4.5,4.2,3.7,3.3,3.1
...,...,...,...,...,...,...,...
378,49420,"Yakima, WA",8.2,7.8,7.2,6.4,6
379,49620,"York-Hanover, PA",4.7,4.5,4,3.5,3.3
380,49660,"Youngstown-Warren-Boardman, OH-PA",6.1,5.8,5.1,4.6,4.4
381,49700,"Yuba City, CA",10.2,9.5,8.5,7.6,7.2


In [4]:
# Inspect the column dtypes before any type conversion is applied
df.dtypes

CBSA_Code            object
Metropolitan_Area    object
2015                 object
2016                 object
2017                 object
2018                 object
2019                 object
dtype: object

In [5]:
# Convert each yearly column to a numeric dtype, one column at a time
year_columns = ['2015', '2016', '2017', '2018', '2019']
converted = df[year_columns].apply(pd.to_numeric)
df[year_columns] = converted
df.dtypes

CBSA_Code             object
Metropolitan_Area     object
2015                 float64
2016                 float64
2017                 float64
2018                 float64
2019                 float64
dtype: object

In [6]:
# Instantiate a single LinearRegression estimator; the prediction loops below call
# model.fit() on it once per row, so it is re-fitted rather than re-created.
model = LinearRegression()

In [7]:
# Back-test: for every row, fit a linear trend on 2015-2018 and predict 2019, so the
# prediction can be compared with the actual 2019 value to determine percent error.
train_years = ['2015', '2016', '2017', '2018']
# One single-feature sample per training year: [[2015], [2016], [2017], [2018]]
x = [[int(year)] for year in train_years]

predictions = []
# Iterate over the rows that are actually present instead of a hard-coded row count,
# and select the training columns by name rather than by position.
for y in df[train_years].values.astype(float):
    model.fit(x, y)
    y_pred = model.predict([[2019]])
    predictions.append(y_pred)
# Each prediction is a one-element array; flatten to one value per row
flat_list = [item for sublist in predictions for item in sublist]

In [8]:
# Pair the actual 2019 values with the back-test predictions in a fresh dataframe
df_accuracy = df[['2019']].assign(Predicted_2019=flat_list)
df_accuracy

Unnamed: 0,2019,Predicted_2019
0,2.7,2.45
1,3.4,3.15
2,5.2,4.65
3,4.8,4.30
4,3.1,2.90
...,...,...
378,6.0,5.90
379,3.3,3.15
380,4.4,4.10
381,7.2,6.75


In [9]:
# Absolute percent error of each 2019 prediction, relative to the actual 2019 value
actual_2019 = df_accuracy['2019']
predicted_2019 = df_accuracy['Predicted_2019']
df_accuracy['Percent_Error'] = ((actual_2019 - predicted_2019) / actual_2019 * 100).abs()

# Report the mean of the Percent_Error column
mean_percent_error = df_accuracy['Percent_Error'].mean()
print(f'Average percent error for linear regression predicting 2019 unempoloyment rate is: {round(mean_percent_error,3)}%' )
df_accuracy

Average percent error for linear regression predicting 2019 unempoloyment rate is: 6.677%


Unnamed: 0,2019,Predicted_2019,Percent_Error
0,2.7,2.45,9.259259
1,3.4,3.15,7.352941
2,5.2,4.65,10.576923
3,4.8,4.30,10.416667
4,3.1,2.90,6.451613
...,...,...,...
378,6.0,5.90,1.666667
379,3.3,3.15,4.545455
380,4.4,4.10,6.818182
381,7.2,6.75,6.250000


In [10]:
# Use linear regression on 2015-2019 to predict the unemployment rate for 2024 on df.
fit_years = ['2015', '2016', '2017', '2018', '2019']
# One single-feature sample per year: [[2015], ..., [2019]]
x2 = [[int(year)] for year in fit_years]

predictions2 = []
# Select the yearly columns by name: an open-ended positional slice would also pick up
# Predicted_2024 when this cell is re-run, and a hard-coded row count would break as
# soon as the number of rows changes.
for y2 in df[fit_years].values.astype(float):
    model.fit(x2, y2)
    y_pred2 = model.predict([[2024]])
    predictions2.append(y_pred2)
# Each prediction is a one-element array; flatten to one value per row
flat_list2 = [item for sublist in predictions2 for item in sublist]
df['Predicted_2024'] = flat_list2
df

Unnamed: 0,CBSA_Code,Metropolitan_Area,2015,2016,2017,2018,2019,Predicted_2024
0,10180,"Abilene, TX",3.8,3.5,3.1,2.8,2.7,1.15
1,10420,"Akron, OH",5.0,4.6,4.1,3.6,3.4,1.20
2,10500,"Albany, GA",7.0,6.5,5.7,5.3,5.2,2.58
3,10540,"Albany, OR",6.7,6.2,5.5,4.9,4.8,2.05
4,10580,"Albany-Schenectady-Troy, NY",4.5,4.2,3.7,3.3,3.1,1.17
...,...,...,...,...,...,...,...,...
378,49420,"Yakima, WA",8.2,7.8,7.2,6.4,6.0,3.06
379,49620,"York-Hanover, PA",4.7,4.5,4.0,3.5,3.3,1.34
380,49660,"Youngstown-Warren-Boardman, OH-PA",6.1,5.8,5.1,4.6,4.4,1.98
381,49700,"Yuba City, CA",10.2,9.5,8.5,7.6,7.2,3.07


In [11]:
# Frame to store back into mongo: identifiers, the 2024 prediction and its rank
# (rank() with default settings, computed on the Predicted_2024 column)
unemployment_prediction = df[['CBSA_Code', 'Metropolitan_Area', 'Predicted_2024']].assign(
    Rank=df['Predicted_2024'].rank()
)

unemployment_prediction

Unnamed: 0,CBSA_Code,Metropolitan_Area,Predicted_2024,Rank
0,10180,"Abilene, TX",1.15,38.0
1,10420,"Akron, OH",1.20,76.5
2,10500,"Albany, GA",2.58,349.0
3,10540,"Albany, OR",2.05,304.5
4,10580,"Albany-Schenectady-Troy, NY",1.17,53.0
...,...,...,...,...
378,49420,"Yakima, WA",3.06,376.0
379,49620,"York-Hanover, PA",1.34,160.0
380,49660,"Youngstown-Warren-Boardman, OH-PA",1.98,299.5
381,49700,"Yuba City, CA",3.07,377.0


In [12]:
# Reference the unemployment_predicted_2024 collection on db; this is where the
# dataframe with prediction and rank is written
unemployment_predicted_2024 = db.unemployment_predicted_2024

In [13]:
# Turn the dataframe into a mongo-readable format: orient='records' yields a list
# with one dict per row (so, despite its name, df_dict is a list of dicts)
df_dict = unemployment_prediction.to_dict(orient='records')

In [14]:
# Write the records to the unemployment_predicted_2024 collection.
# NOTE(review): insert_many only adds documents, so re-running this cell presumably
# stores a second copy of every record — confirm whether the collection should be
# cleared (or upserted by CBSA_Code) first.
unemployment_predicted_2024.insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x1c1a6297548>

In [15]:
# List the collections in the MSA database. list_collection_names() replaces the
# deprecated collection_names(), which is removed in PyMongo 4.
client.MSA.list_collection_names()

  """Entry point for launching an IPython kernel.


['GDP_raw',
 'employment_raw',
 'msa_codes',
 'unemployment_predicted_2024',
 'population_raw',
 'unemployment_clean',
 'unemployment_raw']

In [17]:
# Read the unemployment_predicted_2024 collection back into a dataframe to check
# what was stored
df2 = pd.DataFrame(list(unemployment_predicted_2024.find()))
df2

Unnamed: 0,_id,CBSA_Code,Metropolitan_Area,Predicted_2024,Rank
0,602f2542b3dad6fbac520a64,10180,"Abilene, TX",1.15,38.0
1,602f2542b3dad6fbac520a65,10420,"Akron, OH",1.20,76.5
2,602f2542b3dad6fbac520a66,10500,"Albany, GA",2.58,349.0
3,602f2542b3dad6fbac520a67,10540,"Albany, OR",2.05,304.5
4,602f2542b3dad6fbac520a68,10580,"Albany-Schenectady-Troy, NY",1.17,53.0
...,...,...,...,...,...
378,602f2542b3dad6fbac520bde,49420,"Yakima, WA",3.06,376.0
379,602f2542b3dad6fbac520bdf,49620,"York-Hanover, PA",1.34,160.0
380,602f2542b3dad6fbac520be0,49660,"Youngstown-Warren-Boardman, OH-PA",1.98,299.5
381,602f2542b3dad6fbac520be1,49700,"Yuba City, CA",3.07,377.0
