In [1]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt

In [3]:
# Load the monthly unemployment records (one row per CBSA, year and month).
unemp_csv_path = 'unemp_month.csv'
df = pd.read_csv(unemp_csv_path)
df

Unnamed: 0,_id,CBSA,year,month,value
0,6038baa2f3720aac8688bb68,11500,2019,December,3.0
1,6038baa2f3720aac8688bb69,11500,2019,November,2.9
2,6038baa2f3720aac8688bb6a,11500,2019,October,3.0
3,6038baa2f3720aac8688bb6b,11500,2019,September,3.0
4,6038baa2f3720aac8688bb6c,11500,2019,August,3.4
...,...,...,...,...,...
46075,6038baa3f3720aac86896f63,49340,2010,May,8.8
46076,6038baa3f3720aac86896f64,49340,2010,April,8.8
46077,6038baa3f3720aac86896f65,49340,2010,March,9.4
46078,6038baa3f3720aac86896f66,49340,2010,February,9.6


In [4]:
# Remove every space character from the month names before they are mapped
# to month numbers.
month_without_spaces = df['month'].str.replace(' ', '')
df['month'] = month_without_spaces

In [5]:
# Translate English month names into calendar month numbers
# (January -> 1 ... December -> 12). Names missing from the lookup become NaN.
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
month_numbers = {name: number for number, name in enumerate(month_names, start=1)}
df['month'] = df['month'].map(month_numbers)
df

Unnamed: 0,_id,CBSA,year,month,value
0,6038baa2f3720aac8688bb68,11500,2019,12,3.0
1,6038baa2f3720aac8688bb69,11500,2019,11,2.9
2,6038baa2f3720aac8688bb6a,11500,2019,10,3.0
3,6038baa2f3720aac8688bb6b,11500,2019,9,3.0
4,6038baa2f3720aac8688bb6c,11500,2019,8,3.4
...,...,...,...,...,...
46075,6038baa3f3720aac86896f63,49340,2010,5,8.8
46076,6038baa3f3720aac86896f64,49340,2010,4,8.8
46077,6038baa3f3720aac86896f65,49340,2010,3,9.4
46078,6038baa3f3720aac86896f66,49340,2010,2,9.6


In [6]:
# Assemble a timestamp for each row from its year and month, pinning the day
# to the 1st of the month.
date_parts = df[['year', 'month']].copy()
date_parts['DAY'] = 1
df['date'] = pd.to_datetime(date_parts)
df

Unnamed: 0,_id,CBSA,year,month,value,date
0,6038baa2f3720aac8688bb68,11500,2019,12,3.0,2019-12-01
1,6038baa2f3720aac8688bb69,11500,2019,11,2.9,2019-11-01
2,6038baa2f3720aac8688bb6a,11500,2019,10,3.0,2019-10-01
3,6038baa2f3720aac8688bb6b,11500,2019,9,3.0,2019-09-01
4,6038baa2f3720aac8688bb6c,11500,2019,8,3.4,2019-08-01
...,...,...,...,...,...,...
46075,6038baa3f3720aac86896f63,49340,2010,5,8.8,2010-05-01
46076,6038baa3f3720aac86896f64,49340,2010,4,8.8,2010-04-01
46077,6038baa3f3720aac86896f65,49340,2010,3,9.4,2010-03-01
46078,6038baa3f3720aac86896f66,49340,2010,2,9.6,2010-02-01


In [7]:
# Order the rows chronologically (oldest month first).
df = df.sort_values(by='date')
df

Unnamed: 0,_id,CBSA,year,month,value,date
23039,6038baa3f3720aac86891567,41180,2010,1,10.4,2010-01-01
14999,6038baa3f3720aac8688f5ff,26900,2010,1,10.5,2010-01-01
15119,6038baa3f3720aac8688f677,29020,2010,1,14.5,2010-01-01
15239,6038baa3f3720aac8688f6ef,29200,2010,1,10.2,2010-01-01
15359,6038baa3f3720aac8688f767,33140,2010,1,14.2,2010-01-01
...,...,...,...,...,...,...
28800,6038baa3f3720aac86892be8,10420,2019,12,4.0,2019-12-01
2280,6038baa2f3720aac8688c450,49740,2019,12,14.3,2019-12-01
10320,6038baa2f3720aac8688e3b8,10500,2019,12,3.6,2019-12-01
28320,6038baa3f3720aac86892a08,49180,2019,12,3.2,2019-12-01


In [8]:
# Keep only CBSA, value and date: the record id is not needed and year/month
# are already encoded in the date column.
columns_to_remove = ['_id', 'year', 'month']
df = df.drop(columns=columns_to_remove)
df

Unnamed: 0,CBSA,value,date
23039,41180,10.4,2010-01-01
14999,26900,10.5,2010-01-01
15119,29020,14.5,2010-01-01
15239,29200,10.2,2010-01-01
15359,33140,14.2,2010-01-01
...,...,...,...
28800,10420,4.0,2019-12-01
2280,49740,14.3,2019-12-01
10320,10500,3.6,2019-12-01
28320,49180,3.2,2019-12-01


In [9]:
# Reshape long -> wide: one row per CBSA, one column per date, cells hold the
# unemployment value.
value_by_cbsa_and_date = df.set_index(['CBSA', 'date'])['value']
df_pivot = value_by_cbsa_and_date.unstack('date')
df_pivot.head()

date,2010-01-01,2010-02-01,2010-03-01,2010-04-01,2010-05-01,2010-06-01,2010-07-01,2010-08-01,2010-09-01,2010-10-01,...,2019-03-01,2019-04-01,2019-05-01,2019-06-01,2019-07-01,2019-08-01,2019-09-01,2019-10-01,2019-11-01,2019-12-01
CBSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10180,7.2,7.0,6.9,6.6,6.7,7.3,7.4,7.2,6.9,6.8,...,3.1,2.5,2.7,3.3,3.3,3.2,2.9,2.8,2.8,2.7
10420,12.4,12.0,11.8,10.8,10.2,10.4,10.3,10.0,9.6,9.4,...,4.5,3.7,3.6,4.4,4.6,4.4,4.0,3.9,3.8,4.0
10500,12.2,12.0,11.5,11.1,11.4,11.9,12.0,12.1,11.7,11.6,...,4.4,3.8,4.1,4.8,4.7,4.5,3.9,3.8,3.5,3.6
10540,14.1,14.3,14.1,13.1,12.5,12.9,12.5,12.3,12.1,11.9,...,5.0,4.4,4.0,4.6,4.8,4.5,3.6,3.5,3.4,3.4
10580,8.1,8.0,7.6,7.0,6.9,7.1,7.4,7.1,7.0,6.9,...,3.9,3.3,3.3,3.5,3.8,3.8,3.4,3.4,3.4,3.7


In [10]:
# Move CBSA out of the index into an ordinary first column.
df_pivot = df_pivot.reset_index()
df_pivot

date,CBSA,2010-01-01 00:00:00,2010-02-01 00:00:00,2010-03-01 00:00:00,2010-04-01 00:00:00,2010-05-01 00:00:00,2010-06-01 00:00:00,2010-07-01 00:00:00,2010-08-01 00:00:00,2010-09-01 00:00:00,...,2019-03-01 00:00:00,2019-04-01 00:00:00,2019-05-01 00:00:00,2019-06-01 00:00:00,2019-07-01 00:00:00,2019-08-01 00:00:00,2019-09-01 00:00:00,2019-10-01 00:00:00,2019-11-01 00:00:00,2019-12-01 00:00:00
0,10180,7.2,7.0,6.9,6.6,6.7,7.3,7.4,7.2,6.9,...,3.1,2.5,2.7,3.3,3.3,3.2,2.9,2.8,2.8,2.7
1,10420,12.4,12.0,11.8,10.8,10.2,10.4,10.3,10.0,9.6,...,4.5,3.7,3.6,4.4,4.6,4.4,4.0,3.9,3.8,4.0
2,10500,12.2,12.0,11.5,11.1,11.4,11.9,12.0,12.1,11.7,...,4.4,3.8,4.1,4.8,4.7,4.5,3.9,3.8,3.5,3.6
3,10540,14.1,14.3,14.1,13.1,12.5,12.9,12.5,12.3,12.1,...,5.0,4.4,4.0,4.6,4.8,4.5,3.6,3.5,3.4,3.4
4,10580,8.1,8.0,7.6,7.0,6.9,7.1,7.4,7.1,7.0,...,3.9,3.3,3.3,3.5,3.8,3.8,3.4,3.4,3.4,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,49420,13.4,13.1,12.3,11.3,11.0,9.5,8.7,10.3,8.2,...,9.3,7.3,6.8,5.8,5.5,6.1,4.4,4.6,6.4,8.0
380,49620,9.4,9.6,9.1,8.2,8.5,8.6,8.7,8.3,7.5,...,3.6,3.1,3.5,4.1,4.5,4.3,3.7,3.9,3.8,3.8
381,49660,14.1,13.9,14.0,11.9,11.5,11.6,11.6,11.4,10.6,...,6.1,5.2,5.2,5.8,6.1,6.0,5.3,5.3,5.4,5.4
382,49700,19.1,20.1,19.9,18.6,17.9,18.0,17.7,16.3,16.1,...,8.9,7.5,6.3,6.6,6.6,5.7,5.0,5.2,5.7,6.5


In [11]:
#df_pivot.to_csv('ML_ready_unemp_month.csv', index=False)

In [16]:
# Forecast each CBSA's unemployment rate 60 months past the last observed
# month (December 2019), i.e. for December 2024, and store the result in a
# '2024' column of df_pivot.
FORECAST_STEPS = 60  # months ahead of the final observation

# Feed the model only the monthly observation columns. Dropping 'CBSA' (and
# '2024', if this cell has already been run) keeps the identifier and any
# earlier forecast from being treated as data points.
monthly_rates = df_pivot.drop(columns=['CBSA', '2024'], errors='ignore')

predictions = []
for i in range(len(monthly_rates)):
    y = monthly_rates.iloc[i].to_numpy(dtype='float')
    # The rates are fractional (e.g. 3.6), so the series must be float: an
    # integer dtype raises ValueError on pandas >= 2.0 and was silently
    # ignored by older releases.
    series = pd.Series(y, dtype='float')
    model = ARIMA(series, order=(2, 1, 1))
    model_fit = model.fit()
    pred = model_fit.forecast(FORECAST_STEPS)
    # Only the final step of the forecast path (month 60) is kept.
    predictions.append(float(pred.iloc[-1]))
# NOTE(review): the forecasts are not constrained to be non-negative; the
# recorded output contains negative rates (e.g. -1.11 for CBSA 41940).
df_pivot['2024'] = predictions



In [17]:
# Display the wide frame, which now carries the '2024' forecast column
# assigned by the forecasting loop.
df_pivot

date,CBSA,2010-01-01 00:00:00,2010-02-01 00:00:00,2010-03-01 00:00:00,2010-04-01 00:00:00,2010-05-01 00:00:00,2010-06-01 00:00:00,2010-07-01 00:00:00,2010-08-01 00:00:00,2010-09-01 00:00:00,...,2019-04-01 00:00:00,2019-05-01 00:00:00,2019-06-01 00:00:00,2019-07-01 00:00:00,2019-08-01 00:00:00,2019-09-01 00:00:00,2019-10-01 00:00:00,2019-11-01 00:00:00,2019-12-01 00:00:00,2024
0,10180,7.2,7.0,6.9,6.6,6.7,7.3,7.4,7.2,6.9,...,2.5,2.7,3.3,3.3,3.2,2.9,2.8,2.8,2.7,2.824899
1,10420,12.4,12.0,11.8,10.8,10.2,10.4,10.3,10.0,9.6,...,3.7,3.6,4.4,4.6,4.4,4.0,3.9,3.8,4.0,4.037593
2,10500,12.2,12.0,11.5,11.1,11.4,11.9,12.0,12.1,11.7,...,3.8,4.1,4.8,4.7,4.5,3.9,3.8,3.5,3.6,0.672844
3,10540,14.1,14.3,14.1,13.1,12.5,12.9,12.5,12.3,12.1,...,4.4,4.0,4.6,4.8,4.5,3.6,3.5,3.4,3.4,3.542766
4,10580,8.1,8.0,7.6,7.0,6.9,7.1,7.4,7.1,7.0,...,3.3,3.3,3.5,3.8,3.8,3.4,3.4,3.4,3.7,3.614529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,49420,13.4,13.1,12.3,11.3,11.0,9.5,8.7,10.3,8.2,...,7.3,6.8,5.8,5.5,6.1,4.4,4.6,6.4,8.0,8.453392
380,49620,9.4,9.6,9.1,8.2,8.5,8.6,8.7,8.3,7.5,...,3.1,3.5,4.1,4.5,4.3,3.7,3.9,3.8,3.8,3.876512
381,49660,14.1,13.9,14.0,11.9,11.5,11.6,11.6,11.4,10.6,...,5.2,5.2,5.8,6.1,6.0,5.3,5.3,5.4,5.4,5.461639
382,49700,19.1,20.1,19.9,18.6,17.9,18.0,17.7,16.3,16.1,...,7.5,6.3,6.6,6.6,5.7,5.0,5.2,5.7,6.5,6.578777


In [21]:
# Rank the CBSAs from lowest to highest forecast value in the '2024' column.
df_pivot.sort_values('2024')

date,CBSA,2010-01-01 00:00:00,2010-02-01 00:00:00,2010-03-01 00:00:00,2010-04-01 00:00:00,2010-05-01 00:00:00,2010-06-01 00:00:00,2010-07-01 00:00:00,2010-08-01 00:00:00,2010-09-01 00:00:00,...,2019-04-01 00:00:00,2019-05-01 00:00:00,2019-06-01 00:00:00,2019-07-01 00:00:00,2019-08-01 00:00:00,2019-09-01 00:00:00,2019-10-01 00:00:00,2019-11-01 00:00:00,2019-12-01 00:00:00,2024
312,41940,11.2,11.1,11.1,10.6,10.3,10.6,10.7,10.5,10.2,...,2.3,2.2,2.7,2.8,2.7,2.3,2.4,2.4,2.3,-1.112402
64,16740,12.9,12.8,12.5,11.9,11.8,11.8,11.8,11.5,10.9,...,3.3,3.5,3.8,3.9,3.7,3.1,3.2,3.2,3.1,-0.906014
153,25860,15.3,15.2,14.6,13.7,13.5,13.3,13.1,12.8,12.1,...,3.3,3.7,4.0,4.1,4.0,3.3,3.4,3.3,3.2,-0.763200
310,41740,11.1,10.8,11.0,10.5,10.3,10.7,11.1,10.9,10.7,...,2.9,2.8,3.3,3.6,3.4,2.9,3.0,2.9,2.8,-0.616787
329,43780,13.0,13.0,12.8,11.8,11.5,11.5,12.1,11.3,10.3,...,3.1,3.3,3.9,4.2,3.9,3.3,3.4,3.4,3.4,0.295968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,49420,13.4,13.1,12.3,11.3,11.0,9.5,8.7,10.3,8.2,...,7.3,6.8,5.8,5.5,6.1,4.4,4.6,6.4,8.0,8.453392
361,47300,18.5,19.1,19.5,17.0,15.6,15.9,17.3,16.1,15.8,...,9.9,8.4,9.2,9.9,9.2,7.9,8.4,8.7,9.3,9.241563
383,49740,21.0,20.1,20.4,25.2,27.2,26.6,29.2,30.3,26.9,...,15.0,17.2,18.9,20.4,21.4,19.2,16.6,15.2,14.3,14.061057
254,36140,20.9,20.9,19.3,15.2,12.3,10.2,9.2,8.6,9.7,...,7.4,5.0,4.2,4.1,3.8,4.2,5.6,8.7,10.5,16.695651


In [20]:
# Inspect the full 60-month forecast path for a single CBSA (row position 2
# of df_pivot) instead of only the final forecast value.
# Select the monthly observation columns by name so the slice does not depend
# on whether the '2024' forecast column has been added yet.
y = df_pivot.drop(columns=['CBSA', '2024'], errors='ignore').iloc[2].to_numpy(dtype='float')
# Monthly (month-start) index beginning January 2010; its length follows the
# data rather than a hard-coded count. The rates are fractional, so the series
# is float: an integer dtype raises ValueError on pandas >= 2.0.
series = pd.Series(y, index=pd.date_range('2010-01-01', periods=len(y), freq='MS'), dtype='float')
model = ARIMA(series, order=(2, 1, 1))
model_fit = model.fit()
pred = model_fit.forecast(60)
pred

2020-01-01    3.573836
2020-02-01    3.528126
2020-03-01    3.479388
2020-04-01    3.430182
2020-05-01    3.380906
2020-06-01    3.331620
2020-07-01    3.282334
2020-08-01    3.233050
2020-09-01    3.183768
2020-10-01    3.134488
2020-11-01    3.085209
2020-12-01    3.035933
2021-01-01    2.986658
2021-02-01    2.937385
2021-03-01    2.888114
2021-04-01    2.838845
2021-05-01    2.789577
2021-06-01    2.740312
2021-07-01    2.691048
2021-08-01    2.641787
2021-09-01    2.592527
2021-10-01    2.543269
2021-11-01    2.494013
2021-12-01    2.444758
2022-01-01    2.395506
2022-02-01    2.346255
2022-03-01    2.297007
2022-04-01    2.247760
2022-05-01    2.198515
2022-06-01    2.149272
2022-07-01    2.100030
2022-08-01    2.050791
2022-09-01    2.001554
2022-10-01    1.952318
2022-11-01    1.903084
2022-12-01    1.853852
2023-01-01    1.804622
2023-02-01    1.755394
2023-03-01    1.706167
2023-04-01    1.656943
2023-05-01    1.607720
2023-06-01    1.558500
2023-07-01    1.509281
2023-08-01 

In [None]:
# Line plot of the most recently built `series` (values against its index).
plt.plot(series.index, series.values)

In [None]:
# Fit an ARIMA(1, 1, 0) on each of the first 384 rows of `df` and keep its
# one-step-ahead forecast as that row's 2019 prediction, so a percent-error
# score can be calculated afterwards.
# NOTE(review): this cell has no execution count and slices `df.iloc[i, 2:-1]`,
# but after the earlier drop cell `df` only holds CBSA, value and date (one row
# per CBSA-month). Presumably it was written for a wide, one-row-per-area frame
# with yearly columns -- confirm which frame it should read before running.
predictions = []
for i in range(0,384):
    # Every column from position 2 up to (not including) the last one.
    y = df.iloc[i, 2:-1].values
    # NOTE(review): an integer dtype only suits whole-number inputs; confirm
    # it is appropriate for the values this cell is meant to model.
    series = pd.Series(y, dtype='int')
    model = ARIMA(series, order=(1, 1, 0))
    model_fit = model.fit()
    pred = model_fit.forecast(1)
    predictions.append(pred.values.tolist()[-1])
# Assumes `df` has exactly 384 rows so the list lines up -- TODO confirm.
df['2019_pred'] = predictions

In [None]:
# Display the frame after the '2019_pred' column has been assigned.
df

In [None]:
# Absolute percent error of each ARIMA prediction against the observed 2019
# value: |observed - predicted| / observed * 100.
# NOTE(review): 'POPESTIMATE2019' is not among the columns displayed for `df`
# earlier in this notebook -- confirm the observed-value column name.
df['Percent_Error'] = (
    (df['POPESTIMATE2019'] - df['2019_pred']) / df['POPESTIMATE2019'] * 100
).abs()

# Report the mean absolute percent error across all rows.
mean_percent_error = df['Percent_Error'].mean()
print(f'Average percent error for ARIMA predicting 2019 unemployment rate is: {round(mean_percent_error,3)}%' )
df

In [None]:
# Root-mean-squared error between the '2019' column and the '2019_pred' column.
# NOTE(review): the percent-error cell reads 'POPESTIMATE2019' as the observed
# value while this one reads '2019' -- confirm which column is intended.
mse_2019 = mean_squared_error(df['2019'], df['2019_pred'])
Rmse = sqrt(mse_2019)
Rmse

In [None]:
# Show the rows whose percent error exceeds 123.
# NOTE(review): the 123 threshold is unexplained -- confirm its origin.
df[df['Percent_Error'].gt(123)]

In [None]:

# Fit an ARIMA(1, 1, 0) on row position 176 of `df` (columns 1 up to, but not
# including, the last) and forecast five steps ahead.
# NOTE(review): the four-period yearly index assumes that slice holds exactly
# four values -- confirm against the frame this cell is meant to read.
y2 = df.iloc[176, 1:-1].values
yearly_index = pd.date_range('2015', periods=4, freq='Y')
series2 = pd.Series(y2, index=yearly_index, dtype='float')
model2 = ARIMA(series2, order=(1, 1, 0))
model_fit2 = model2.fit()
pred2 = model_fit2.forecast(5)
pred2

In [None]:
# Display the yearly-indexed input series most recently assigned to series2.
series2

In [None]:
# Display the five-step forecast held in pred2.
pred2

In [None]:
# Plot row position 0 of `df` (every column after the first) on a five-period
# yearly index starting in 2015. Note that this rebinds series2.
# NOTE(review): assumes that slice holds exactly five values -- confirm.
yy = df.iloc[0, 1:].values
five_year_index = pd.date_range('2015', periods=5, freq='Y')
series2 = pd.Series(yy, index=five_year_index, dtype='float')
plt.plot(series2)