<a href="https://colab.research.google.com/github/madanjha/PythonDS/blob/main/TimeSeriesHandsonSess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Time Series HandsOn

AirPassenger Dataset Link : https://drive.google.com/file/d/1nDdOM0ww0dM4vViMptnpIZhz63lVp23K/view?usp=sharing


# Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/AirPassengers.csv')

In [None]:
df.head()

In [None]:
df.shape # We have 12 year of data

In [None]:
df.info()

In [None]:
df['Month'] = pd.to_datetime(df['Month'])

In [None]:
df

In [None]:
# Count the missing values in each column
df.isnull().sum()
# If the dataset has very few rows, fill any missing values by imputation
# (e.g. the mean or another measure of central tendency)

In [None]:
df.duplicated().sum()

In [None]:
df.set_index('Month',inplace = True)

In [None]:
df

In [None]:
df.info()

### Let's see the components of Time series (Trend, Seasonality and residue)

In [None]:
plt.plot(df['#Passengers'])

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
decompose = seasonal_decompose(df['#Passengers'])

In [None]:
# Draw the decomposition panels. plot() returns a matplotlib Figure, so it is
# rendered with plt.show() instead of being printed (which only emitted the
# Figure's text repr alongside the chart).
decompose.plot()
plt.show()

In [None]:
# Yes we do have Trend
# Yes we do have seasonality
# Yes we do have Residuals

# Check whether the data is stationary
# Using the ADF (Augmented Dickey-Fuller) test

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# null hypothesis - Data is not stationary
# alt hypothesis - Data is stationary

In [None]:
result = adfuller(df['#Passengers'])
result

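* ADF test statistic
* p-value
* Number of lags used
* Number of observations used
* Critical values (at the 1%, 5% and 10% levels)
* Best information criterion value (AIC by default)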

In [None]:
p_value = result[1]
p_value

In [None]:
# Compare the ADF p-value against the 5% significance level.
# A p-value above the threshold means we fail to reject the null hypothesis
# (non-stationary); a hypothesis test never "accepts" the null.
if p_value <= 0.05:
    print('Null hypothesis is rejected thus data is stationary')
else:
    print('Failed to reject the null hypothesis thus data is non stationary')

In [None]:
# Checking stationary data visually
# By checking Constant rolling mean, rolling std

In [None]:
rolling_mean = df.rolling(window=12).mean()
rolling_std = df.rolling(window=12).std()

In [None]:
df.head(15)

In [None]:
rolling_mean.head(15)

In [None]:
rolling_mean.isnull().sum() # null = window -1

In [None]:
# Overlay the series with its rolling mean and rolling standard deviation;
# for a stationary series both rolling curves would stay roughly flat.
for series, colour, name in (
    (df, 'blue', 'Original'),
    (rolling_mean, 'red', 'Rolling Mean'),
    (rolling_std, 'black', 'Rolling Std'),
):
    plt.plot(series, color=colour, label=name)
plt.legend(loc='best')
plt.title('Rolling Mean and std')
plt.show()

# Reduce the trend
Log transformation
* Without log: the trend grows exponentially
* With log: exponential growth becomes linear, making it easier for the model to learn

In [None]:
log_df = np.log(df)


In [None]:
log_df

In [None]:
rolling_mean = log_df.rolling(window=12).mean()
rolling_std = log_df.rolling(window=12).std()

In [None]:
# Overlay the log-transformed series with its rolling mean and rolling std.
# NOTE: the 'Original' legend entry here refers to the log-transformed data (log_df).
plt.plot(log_df,color = 'blue',label = 'Original')
plt.plot(rolling_mean,color = 'red',label = 'Rolling Mean')
plt.plot(rolling_std,color = 'black',label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean and std')
plt.show()

Note: even after the log transformation the data is still not stationary, because the rolling mean and rolling std are not yet constant. Now we will apply differencing.


# Differencing
* Helps us focus on the change between consecutive values rather than on the values themselves

In [None]:
# Perform Diff 1 st time (D = 1)
diff_data = log_df.diff()
diff_data

In [None]:
diff_data.dropna(inplace = True)

In [None]:
diff_data

In [None]:
# Rolling statistics (window of 12 observations) of the differenced series
diff_window = diff_data.rolling(window=12)
rolling_mean = diff_window.mean()
rolling_std = diff_window.std()

# Overlay the differenced series with its rolling mean and rolling std
for series, colour, name in (
    (diff_data, 'blue', 'Original'),
    (rolling_mean, 'red', 'Rolling Mean'),
    (rolling_std, 'black', 'Rolling Std'),
):
    plt.plot(series, color=colour, label=name)
plt.legend(loc='best')
plt.title('Rolling Mean and std')
plt.show()

In [None]:
result = adfuller(diff_data['#Passengers'])
result[1]

In [None]:
# Compare the ADF p-value of the differenced series against the 5% level.
# A p-value above the threshold means we fail to reject the null hypothesis
# (non-stationary); a hypothesis test never "accepts" the null.
if result[1] <= 0.05:
    print('Null hypothesis is rejected thus data is stationary')
else:
    print('Failed to reject the null hypothesis thus data is non stationary')

In [None]:
# We have reached marginal p_value

In [None]:
# Model implementations

In [None]:
log_df # d = 1

# Data Split

In [None]:
# Chronological split (no shuffling): the first 120 rows (10 years) are used
# for fitting, the remaining rows (2 years) are held out for evaluation.
train, test = log_df.iloc[:120, :], log_df.iloc[120:, :]

In [None]:
from statsmodels.tsa.arima.model import ARIMA

* p - AR (autoregressive) order
* d - degree of differencing
* q - MA (moving-average) order

In [None]:
# order=(p, d, q): d=1 as noted for the differencing step; p=1 and q=2 are
# initial guesses picked by trial and error
model = ARIMA(train,order=(1,1,2))

In [None]:
model = model.fit()

In [None]:
log_df['Arima_prediction'] = model.predict(start=len(train),end= len(train) + len(test) -1)

In [None]:
log_df

In [None]:
plt.plot(log_df)

In [None]:
# We are not getting good predictions (PDQ should be better)

In [None]:
import itertools

# Candidate orders: AR and MA terms from 1 to 7, differencing fixed at 1
p, q = range(1, 8), range(1, 8)
d = range(1, 2)

# Cartesian product -> every (p, d, q) triple to try
pdq_combination = [*itertools.product(p, d, q)]
pdq_combination

In [None]:
len(pdq_combination)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Grid-search the ARIMA order: fit every (p, d, q) candidate on `train` and
# score its forecast over the hold-out window against `test` with RMSE.
# The loop deliberately does not rebind `model`, so the previously fitted
# model is not replaced by an unfitted ARIMA object when this cell runs.
# NOTE(review): candidates are ranked on the test set itself, so the best
# RMSE is an optimistic estimate - consider a separate validation window.
rmse = []
order1 = []
forecast_start = len(train)
forecast_end = len(train) + len(test) - 1
for pdq in pdq_combination:
    model_fit = ARIMA(train, order=pdq).fit()
    pred = model_fit.predict(start=forecast_start, end=forecast_end)
    error = np.sqrt(mean_squared_error(test, pred))
    order1.append(pdq)
    rmse.append(error)
# One row per candidate order, holding its RMSE
results = pd.DataFrame(index=order1, data=rmse, columns=['RMSE'])

In [None]:
results.sort_values(by='RMSE',ascending=True)

In [None]:
# Refit ARIMA with order (p, d, q) = (5, 1, 4)
# presumably the lowest-RMSE order from the grid search - verify against `results`
model = ARIMA(train,order=(5,1,4))
model = model.fit()

In [None]:
log_df['Arima_prediction'] = model.predict(start=len(train),end= len(train) + len(test) -1)

In [None]:
log_df

In [None]:
plt.plot(log_df)

# Implementing sarima

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Seasonal ARIMA: order=(p, d, q) and seasonal_order=(P, D, Q, s) with s=12;
# build the model and keep only the fitted results object.
s_model = SARIMAX(
    train,
    order=(5, 1, 4),
    seasonal_order=(5, 1, 4, 12),
).fit()

In [None]:
log_df['Sarima_pred'] = s_model.predict(start=len(train),end= len(train) + len(test) -1)

In [None]:
log_df

In [None]:
plt.plot(log_df['#Passengers'])
plt.plot(log_df['Sarima_pred'])

In [None]:
plt.plot(log_df)

# Forecast

In [None]:
# `s_model` was fitted on `train` only, so forecasting straight from it would
# start at the first hold-out month and merely overlap the test period.
# Extend the fitted results with the observed test values (the estimated
# parameters are reused, not re-fitted) so the 60 steps cover the 5 years
# that follow the end of the available data.
future = s_model.append(test).forecast(steps=60)  # next 5 years

In [None]:
future

In [None]:
plt.plot(log_df['#Passengers'])
plt.plot(future)
plt.show()

In [None]:
future

In [None]:
# Undo the log transform for a single value to get a passenger count.
# NOTE(review): 6.085083 is presumably a value copied by hand from the `future`
# output - verify; indexing `future` directly would avoid the hard-coded number.
round(np.exp(6.085083))

In [None]:
# Undo the log transform on the whole forecast and round to whole passengers
np.exp(future).round()