In [22]:
import pymongo
import pandas as pd
from pymongo import MongoClient
import os
from sklearn.linear_model import LinearRegression

In [23]:
# Create instance of MongoClient
client = MongoClient()
# Connection URI
connStr = os.getenv("MONGO_CONN")
client = MongoClient(connStr)
# Select database
db = client['MSA']
# Select the collection within the database (in this case its GDP_raw data, check list below for other collections)
gdp = db.GDP_clean
# Convert entire collection to Pandas dataframe
df_gdp = pd.DataFrame(list(gdp.find()))
df_gdp.head()

Unnamed: 0,_id,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,60329bde9efa42f38169dc16,10180,5839625,5996498,6285840,6450382,6678674,6638709,6631955,6833064,7348112,7730423
1,60329bde9efa42f38169dc17,10420,28983322,29589772,30484007,31447075,32753646,33868052,34458540,35193942,36624333,37698744
2,60329bde9efa42f38169dc18,10500,5097990,5279579,5436011,5530836,5577624,5676239,5874144,5956081,6166172,6588233
3,60329bde9efa42f38169dc19,10540,3476449,3574061,3642763,3792310,3921082,4242542,4498372,4642425,5102768,5308033
4,60329bde9efa42f38169dc1a,10580,46985416,47754508,49077007,50933123,52822054,55451680,57904993,60255221,62666517,65591092


In [24]:
# Drop _id column from mongodb
df_gdp.drop(columns='_id', inplace=True)

In [25]:
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   CBSA    384 non-null    int64
 1   2010    384 non-null    int64
 2   2011    384 non-null    int64
 3   2012    384 non-null    int64
 4   2013    384 non-null    int64
 5   2014    384 non-null    int64
 6   2015    384 non-null    int64
 7   2016    384 non-null    int64
 8   2017    384 non-null    int64
 9   2018    384 non-null    int64
 10  2019    384 non-null    int64
dtypes: int64(11)
memory usage: 33.1 KB


In [26]:
# check to see what values we need for the model
df_gdp.iloc[1,1:]

2010    28983322
2011    29589772
2012    30484007
2013    31447075
2014    32753646
2015    33868052
2016    34458540
2017    35193942
2018    36624333
2019    37698744
Name: 1, dtype: int64

In [27]:
# create instance of model
model = LinearRegression()

In [28]:
# create a list to hold predictions
predictions = []
for i in range(0,384):
    # the years of data to use in the model
    x = [[2010],[2011],[2012],[2013],[2014],[2015],[2016],[2017],[2018],[2019]]
    # get just the values we need per row
    y = df_gdp.iloc[i, 1:].values
    model.fit(x,y)
    y_pred = model.predict([[2024]])
    predictions.append(y_pred)
# a list inside a list was created, so we flatten it
flat_list = [item for sublist in predictions for item in sublist]
# add column with predicted values to df
df_gdp['Predicted_2024_amount'] = flat_list
df_gdp.head()

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount
0,10180,5839625,5996498,6285840,6450382,6678674,6638709,6631955,6833064,7348112,7730423,8354443.0
1,10420,28983322,29589772,30484007,31447075,32753646,33868052,34458540,35193942,36624333,37698744,42401670.0
2,10500,5097990,5279579,5436011,5530836,5577624,5676239,5874144,5956081,6166172,6588233,7062525.0
3,10540,3476449,3574061,3642763,3792310,3921082,4242542,4498372,4642425,5102768,5308033,6213535.0
4,10580,46985416,47754508,49077007,50933123,52822054,55451680,57904993,60255221,62666517,65591092,75168880.0


In [29]:
# round to two decimals, cuz its money and why not
df_gdp['Predicted_2024_amount'] = df_gdp['Predicted_2024_amount'].round(2)
df_gdp.head()

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount
0,10180,5839625,5996498,6285840,6450382,6678674,6638709,6631955,6833064,7348112,7730423,8354442.52
1,10420,28983322,29589772,30484007,31447075,32753646,33868052,34458540,35193942,36624333,37698744,42401673.05
2,10500,5097990,5279579,5436011,5530836,5577624,5676239,5874144,5956081,6166172,6588233,7062525.18
3,10540,3476449,3574061,3642763,3792310,3921082,4242542,4498372,4642425,5102768,5308033,6213535.22
4,10580,46985416,47754508,49077007,50933123,52822054,55451680,57904993,60255221,62666517,65591092,75168881.12


In [30]:
# calculate rate of change
df_gdp['2024_ROC'] = (df_gdp['Predicted_2024_amount'] - df_gdp['2019'])/df_gdp['2019']*100
df_gdp.head()

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount,2024_ROC
0,10180,5839625,5996498,6285840,6450382,6678674,6638709,6631955,6833064,7348112,7730423,8354442.52,8.072256
1,10420,28983322,29589772,30484007,31447075,32753646,33868052,34458540,35193942,36624333,37698744,42401673.05,12.475028
2,10500,5097990,5279579,5436011,5530836,5577624,5676239,5874144,5956081,6166172,6588233,7062525.18,7.19908
3,10540,3476449,3574061,3642763,3792310,3921082,4242542,4498372,4642425,5102768,5308033,6213535.22,17.059092
4,10580,46985416,47754508,49077007,50933123,52822054,55451680,57904993,60255221,62666517,65591092,75168881.12,14.602271


In [33]:
# check for duplicates before ranking
df_gdp[df_gdp.duplicated(['2024_ROC'])]

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount,2024_ROC


In [34]:
# rank the values descending by ROC
df_gdp['2024_Rank'] = df_gdp['2024_ROC'].rank(ascending=False)
df_gdp.head()

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount,2024_ROC,2024_Rank
0,10180,5839625,5996498,6285840,6450382,6678674,6638709,6631955,6833064,7348112,7730423,8354442.52,8.072256,297.0
1,10420,28983322,29589772,30484007,31447075,32753646,33868052,34458540,35193942,36624333,37698744,42401673.05,12.475028,201.0
2,10500,5097990,5279579,5436011,5530836,5577624,5676239,5874144,5956081,6166172,6588233,7062525.18,7.19908,312.0
3,10540,3476449,3574061,3642763,3792310,3921082,4242542,4498372,4642425,5102768,5308033,6213535.22,17.059092,88.0
4,10580,46985416,47754508,49077007,50933123,52822054,55451680,57904993,60255221,62666517,65591092,75168881.12,14.602271,155.0


In [36]:
# Check to see if ranking worked
df_gdp.sort_values(by='2024_ROC', inplace=True, ascending=False)
df_gdp.head()

Unnamed: 0,CBSA,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,Predicted_2024_amount,2024_ROC,2024_Rank
296,40580,6583419,6193876,5579311,5760276,7329627,9011475,9539431,9188754,9278142,7689597,11220323.67,45.915627,1.0
371,48540,6129452,6369342,6446153,7180473,8397012,8665041,9365988,11548497,12686215,11939964,16291393.24,36.444241,2.0
107,21140,9265583,9095516,9622682,10519924,11706483,13267137,14645512,16320319,16389054,16193156,21962360.82,35.627427,3.0
32,13460,5022378,5091721,5407947,5875268,6558294,7320179,8078986,8983549,9597273,10042937,13069137.85,30.132628,4.0
140,24540,10071686,10976437,11937442,13745206,16304556,14847684,14798021,16991143,20267534,19889946,25368025.63,27.541953,5.0


In [37]:
df_gdp.sort_values(by='CBSA', inplace=True)

In [43]:
# turn dataframe into readable format for mongo with only the columns we need
df_dict = df_gdp[['CBSA', '2024_ROC','2024_Rank']].to_dict(orient='records')
df_dict[0]

{'CBSA': 10180, '2024_ROC': 8.072255813168304, '2024_Rank': 297.0}

In [44]:
# create new collection for dataframe with predicted ROC and rank
predicted_2024 = db.GDP_predicted_2024

In [45]:
# write dataframe to unemployment_predicted_2024 collection
predicted_2024.insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x298fd09d3c0>