In [1]:
import os

import pandas as pd
import pymongo
from pymongo import MongoClient
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Connect to MongoDB using the URI stored in the MONGO_CONN environment variable.
# A single client is created here; the earlier throwaway `MongoClient()` was
# overwritten immediately and never closed.
# NOTE(review): if MONGO_CONN is unset, os.getenv returns None and the client
# is constructed with host=None — confirm that fallback is intended.
client = MongoClient(os.getenv('MONGO_CONN'))
# Select database
db = client['MSA']
# Select the collection holding the cleaned regional urban CPI data
data = db.CPI_RegionUrban_clean
# Convert entire collection to Pandas dataframe (one row per document)
df = pd.DataFrame(list(data.find()))
df

Unnamed: 0,_id,Region,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,603dd586680af8db371b28bc,1,235.141,241.987,246.456,249.567,250.519,251.67,256.427,260.791,265.286,270.429
1,603dd586680af8db371b28bd,2,209.27,215.173,219.033,221.194,222.821,222.722,226.794,230.548,233.458,238.734
2,603dd586680af8db371b28be,3,212.488,219.469,223.109,227.082,228.451,229.581,234.204,238.512,242.15,247.289
3,603dd586680af8db371b28bf,4,222.081,228.117,232.029,236.096,239.095,243.434,249.516,257.347,265.209,272.584


In [3]:
# Print column names, non-null counts and dtypes of the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   _id     4 non-null      object 
 1   Region  4 non-null      object 
 2   2010    4 non-null      float64
 3   2011    4 non-null      float64
 4   2012    4 non-null      float64
 5   2013    4 non-null      float64
 6   2014    4 non-null      float64
 7   2015    4 non-null      float64
 8   2016    4 non-null      float64
 9   2017    4 non-null      float64
 10  2018    4 non-null      float64
 11  2019    4 non-null      float64
dtypes: float64(10), object(2)
memory usage: 512.0+ bytes


In [4]:
# Discard the '_id' column; it is not used by any later step on this frame
df = df.drop(columns=['_id'])

In [5]:
# Show the remaining column labels after dropping '_id'
df.columns

Index(['Region', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2019'],
      dtype='object')

In [6]:
# Cast the ten yearly CPI columns ('2010' .. '2019') to integers.
# NOTE(review): astype(int) truncates the decimals (e.g. 241.987 becomes 241),
# so the models below are fitted on truncated values — confirm this is intended.
year_columns = [str(year) for year in range(2010, 2020)]
df[year_columns] = df[year_columns].astype(int)

In [7]:
# Confirm the dtype of every column after the integer cast
df.dtypes

Region    object
2010       int32
2011       int32
2012       int32
2013       int32
2014       int32
2015       int32
2016       int32
2017       int32
2018       int32
2019       int32
dtype: object

In [8]:
# Forecast the 2024 CPI for every region with one ARIMA(2, 1, 1) model per row.
#
# The history is selected by column name rather than by position, so the
# cell gives the same result when re-run after the '2024' column has been
# added, and it iterates over however many rows the frame holds instead of
# assuming exactly four.
HISTORY_YEARS = [str(year) for year in range(2010, 2020)]  # observed annual columns
TARGET_YEAR = 2024
# Number of yearly steps between the last observed year and the target year
FORECAST_HORIZON = TARGET_YEAR - int(HISTORY_YEARS[-1])

predictions = []
for history in df[HISTORY_YEARS].to_numpy(dtype=float):
    series = pd.Series(history)
    model_fit = ARIMA(series, order=(2, 1, 1)).fit()
    # forecast() yields one value per step ahead; the last one is the target year
    pred = model_fit.forecast(FORECAST_HORIZON)
    predictions.append(pred.iloc[-1])
# Positional assignment: predictions are in the same order as the rows of df
df[str(TARGET_YEAR)] = predictions

In [9]:
# Display the frame, now including the forecast '2024' column
df

Unnamed: 0,Region,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2024
0,1,235,241,246,249,250,251,256,260,265,270,289.633993
1,2,209,215,219,221,222,222,226,230,233,238,255.833767
2,3,212,219,223,227,228,229,234,238,242,247,265.925504
3,4,222,228,232,236,239,243,249,257,265,272,299.76712


In [10]:
# Order the regions by their forecast 2024 value, smallest first
df = df.sort_values(by='2024')
df

Unnamed: 0,Region,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2024
1,2,209,215,219,221,222,222,226,230,233,238,255.833767
2,3,212,219,223,227,228,229,234,238,242,247,265.925504
0,1,235,241,246,249,250,251,256,260,265,270,289.633993
3,4,222,228,232,236,239,243,249,257,265,272,299.76712


In [11]:
# Renumber the rows 0..n-1 in their current order, discarding the old index
df = df.reset_index(drop=True)
df

Unnamed: 0,Region,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2024
0,2,209,215,219,221,222,222,226,230,233,238,255.833767
1,3,212,219,223,227,228,229,234,238,242,247,265.925504
2,1,235,241,246,249,250,251,256,260,265,270,289.633993
3,4,222,228,232,236,239,243,249,257,265,272,299.76712


In [12]:
# Give each row a score based on its position in the frame: the first row
# receives 192 and every following row 15 points less.
# 192 is half of 384, which lowers the weight since this is an averaged data point
top_score = 192
score_step = 15
# Floats are used so the column keeps the float dtype it had before
df['Weighted'] = [float(top_score - score_step * position) for position in range(len(df))]
df

Unnamed: 0,Region,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2024,Weighted
0,2,209,215,219,221,222,222,226,230,233,238,255.833767,192.0
1,3,212,219,223,227,228,229,234,238,242,247,265.925504,177.0
2,1,235,241,246,249,250,251,256,260,265,270,289.633993,162.0
3,4,222,228,232,236,239,243,249,257,265,272,299.76712,147.0


In [13]:
# List the column labels now that '2024' and 'Weighted' have been added
df.columns

Index(['Region', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2019', '2024', 'Weighted'],
      dtype='object')

In [14]:
# Remove the yearly history columns '2010' .. '2019'; only the region,
# the forecast and the score are carried forward
history_columns = [str(year) for year in range(2010, 2020)]
df = df.drop(columns=history_columns)

In [17]:
# Relabel the 'Weighted' column as 'Score'
df = df.rename(columns={'Weighted': 'Score'})
df

Unnamed: 0,Region,2024,Score
0,2,255.833767,192.0
1,3,265.925504,177.0
2,1,289.633993,162.0
3,4,299.76712,147.0


In [18]:
# Prefix/suffix the result columns with CPI so they stay identifiable after merging
cpi_labels = {'2024': '2024_CPI', 'Score': 'CPI_Score'}
df = df.rename(columns=cpi_labels)
df

Unnamed: 0,Region,2024_CPI,CPI_Score
0,2,255.833767,192.0
1,3,265.925504,177.0
2,1,289.633993,162.0
3,4,299.76712,147.0


In [20]:
# Select the regions_divisions collection (per-state FIPS, region and division)
data = db['regions_divisions']
# Read every document of the collection into a DataFrame
region_documents = list(data.find())
df_region = pd.DataFrame(region_documents)
df_region.head()

Unnamed: 0,_id,FIPS,State,Region,Region_Name,Division,Division_Name
0,603f3e2de251dabd4e0b32e4,9,Connecticut,1,Northeast,1,New England
1,603f3e2de251dabd4e0b32e5,23,Maine,1,Northeast,1,New England
2,603f3e2de251dabd4e0b32e6,25,Massachusetts,1,Northeast,1,New England
3,603f3e2de251dabd4e0b32e7,33,New Hampshire,1,Northeast,1,New England
4,603f3e2de251dabd4e0b32e8,44,Rhode Island,1,Northeast,1,New England


In [22]:
# Convert the 'Region' column from object to int before merging on it
df['Region'] = df['Region'].astype(int)

In [24]:
# Attach each region's forecast and score to every state row of that region
df_reg_comb = pd.merge(df_region, df, on='Region')
df_reg_comb.head()

Unnamed: 0,_id,FIPS,State,Region,Region_Name,Division,Division_Name,2024_CPI,CPI_Score
0,603f3e2de251dabd4e0b32e4,9,Connecticut,1,Northeast,1,New England,289.633993,162.0
1,603f3e2de251dabd4e0b32e5,23,Maine,1,Northeast,1,New England,289.633993,162.0
2,603f3e2de251dabd4e0b32e6,25,Massachusetts,1,Northeast,1,New England,289.633993,162.0
3,603f3e2de251dabd4e0b32e7,33,New Hampshire,1,Northeast,1,New England,289.633993,162.0
4,603f3e2de251dabd4e0b32e8,44,Rhode Island,1,Northeast,1,New England,289.633993,162.0


In [26]:
# Select the msa_codes collection (one document per CBSA / metro area)
data = db['msa_codes']
# Read every document of the collection into a DataFrame
msa_documents = list(data.find())
df_msa = pd.DataFrame(msa_documents)
df_msa.head()

Unnamed: 0,_id,CBSA,Name,Primary_City,Primary_State_Code,Primary_State_FIPS,Latitude,Longitude,County_FIPS_agg
0,603dedea9226a0b6c19374de,10180,"Abilene, TX",Abilene,TX,48,32.4543,-99.7384,"48059, 48253, 48441"
1,603dedea9226a0b6c19374df,10420,"Akron, OH",Akron,OH,39,41.0798,-81.5219,"39133, 39153"
2,603dedea9226a0b6c19374e0,10500,"Albany, GA",Albany,GA,13,31.5776,-84.1762,"13095, 13177, 13273, 13321"
3,603dedea9226a0b6c19374e1,10540,"Albany-Lebanon, OR",Albany,OR,41,44.6274,-123.0966,41043
4,603dedea9226a0b6c19374e2,10580,"Albany-Schenectady-Troy, NY",Albany,NY,36,42.6664,-73.7987,"36001, 36083, 36091, 36093, 36095"


In [27]:
# Join each metro area to its primary state's row, matching the MSA's
# 'Primary_State_FIPS' against the state-level 'FIPS'
df_final = pd.merge(
    df_msa,
    df_reg_comb,
    left_on='Primary_State_FIPS',
    right_on='FIPS',
)
df_final

Unnamed: 0,_id_x,CBSA,Name,Primary_City,Primary_State_Code,Primary_State_FIPS,Latitude,Longitude,County_FIPS_agg,_id_y,FIPS,State,Region,Region_Name,Division,Division_Name,2024_CPI,CPI_Score
0,603dedea9226a0b6c19374de,10180,"Abilene, TX",Abilene,TX,48,32.4543,-99.7384,"48059, 48253, 48441",603f3e2de251dabd4e0b3309,48,Texas,3,South,7,West South Central,265.925504,177.0
1,603dedea9226a0b6c19374e7,11100,"Amarillo, TX",Amarillo,TX,48,35.1988,-101.8311,"48011, 48065, 48359, 48375, 48381",603f3e2de251dabd4e0b3309,48,Texas,3,South,7,West South Central,265.925504,177.0
2,603dedea9226a0b6c19374f3,12420,"Austin-Round Rock-Georgetown, TX",Austin,TX,48,30.3004,-97.7522,"48021, 48055, 48209, 48453, 48491",603f3e2de251dabd4e0b3309,48,Texas,3,South,7,West South Central,265.925504,177.0
3,603dedea9226a0b6c19374fb,13140,"Beaumont-Port Arthur, TX",Beaumont,TX,48,30.0849,-94.1451,"48199, 48245, 48361",603f3e2de251dabd4e0b3309,48,Texas,3,South,7,West South Central,265.925504,177.0
4,603dedea9226a0b6c193750d,15180,"Brownsville-Harlingen, TX",Brownsville,TX,48,25.9980,-97.4565,48061,603f3e2de251dabd4e0b3309,48,Texas,3,South,7,West South Central,265.925504,177.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,603dedea9226a0b6c19375b7,31700,"Manchester-Nashua, NH",Manchester,NH,33,42.9848,-71.4447,33011,603f3e2de251dabd4e0b32e7,33,New Hampshire,1,Northeast,1,New England,289.633993,162.0
380,603dedea9226a0b6c19375f6,39300,"Providence-Warwick, RI-MA",Providence,RI,44,41.8230,-71.4187,"25005, 44001, 44003, 44005, 44007, 44009",603f3e2de251dabd4e0b32e8,44,Rhode Island,1,Northeast,1,New England,289.633993,162.0
381,603dedea9226a0b6c19375fc,39660,"Rapid City, SD",Rapid City,SD,46,44.0716,-103.2204,"46093, 46103",603f3e2de251dabd4e0b32f8,46,South Dakota,2,Midwest,4,West North Central,255.833767,192.0
382,603dedea9226a0b6c1937626,43620,"Sioux Falls, SD",Sioux Falls,SD,46,43.5397,-96.7320,"46083, 46087, 46099, 46125",603f3e2de251dabd4e0b32f8,46,South Dakota,2,Midwest,4,West North Central,255.833767,192.0


In [28]:
# List the merged frame's columns to decide which ones to drop next
df_final.columns

Index(['_id_x', 'CBSA', 'Name', 'Primary_City', 'Primary_State_Code',
       'Primary_State_FIPS', 'Latitude', 'Longitude', 'County_FIPS_agg',
       '_id_y', 'FIPS', 'State', 'Region', 'Region_Name', 'Division',
       'Division_Name', '2024_CPI', 'CPI_Score'],
      dtype='object')

In [30]:
# Drop the descriptive and key columns brought in by the merges
columns_to_drop = [
    '_id_x', 'Name', 'Primary_City', 'Primary_State_Code',
    'Primary_State_FIPS', 'Latitude', 'Longitude', 'County_FIPS_agg',
    '_id_y', 'FIPS', 'State', 'Region', 'Region_Name', 'Division',
    'Division_Name',
]
df_final = df_final.drop(columns=columns_to_drop)
df_final

Unnamed: 0,CBSA,2024_CPI,CPI_Score
0,10180,265.925504,177.0
1,11100,265.925504,177.0
2,12420,265.925504,177.0
3,13140,265.925504,177.0
4,15180,265.925504,177.0
...,...,...,...
379,31700,289.633993,162.0
380,39300,289.633993,162.0
381,39660,255.833767,192.0
382,43620,255.833767,192.0


In [31]:
# Target collection for the per-CBSA CPI forecast and score
collection = db['arima_cpi_pred_score']
# Convert the frame to a list of dicts, one per row
df_dict = df_final.to_dict('records')
# Write all rows to the arima_cpi_pred_score collection
# NOTE(review): re-running this cell presumably inserts the same rows again —
# confirm whether the collection should be cleared first.
collection.insert_many(df_dict)

<pymongo.results.InsertManyResult at 0x1f12fe24280>