In [None]:
import os

# Folder that contains airline_passengers.csv. Replace the placeholder with a real
# path; while it is still a placeholder (or not a directory) the notebook keeps the
# current working directory instead of crashing in the very first cell.
DATA_DIR = '???'
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)
else:
    print(f"DATA_DIR {DATA_DIR!r} is not a directory; staying in the current working directory.")
os.getcwd()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline

In [None]:
# Load the raw data; the path is relative to the current working directory.
df = pd.read_csv('airline_passengers.csv')
# Peek at the first rows to confirm the columns were read as expected.
df.head()

In [None]:
# Column dtypes straight after loading, before 'Month' is converted to datetimes.
df.dtypes

### String Time Format Code List: https://strftime.org/

In [None]:
# Convert the 'Month' column to datetimes, parsing it with the year-month
# pattern '%Y-%m' (4-digit year, dash, 2-digit month).
df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')
df.head()

In [None]:
# Re-check dtypes: 'Month' should now be a datetime column.
df.dtypes

In [None]:
# Use the parsed 'Month' datetimes as the index so the frame is a time series.
# NOTE: not re-runnable on its own — once 'Month' is the index it is no longer a
# column, so running this cell a second time raises KeyError.
df = df.set_index('Month')
df.head()

In [None]:
# First look at the series, drawn against its 'Month' index.
df.plot()
plt.show()

In [None]:
# Outlier detection with z-scores: standardise the data, then work with the
# absolute value so that both unusually high and unusually low points count.
from scipy.stats import zscore
z_scores = zscore(df)            # (value - mean) / standard deviation
abs_z_scores = np.abs(z_scores)  # distance from the mean in standard deviations
abs_z_scores

In [None]:
zscore_threshold = 3  # points further than this many standard deviations from the mean are treated as outliers
outliers = (abs_z_scores > zscore_threshold)  # boolean mask, True where |z| exceeds the threshold
# NOTE(review): whether the next line drops the flagged rows or blanks them to NaN
# depends on the type of `outliers` (boolean DataFrame vs. plain array) — confirm
# against the installed scipy/pandas versions.
df_outlier_removed = df[~outliers]
df_outlier_removed

In [None]:
# Stack the original series above the outlier-filtered one for comparison.
fig, ax = plt.subplots(2,1)
panels = [
    (ax[0], df, 'original', 'r'),
    (ax[1], df_outlier_removed, 'outliers removed', 'm'),
]
for panel_ax, frame, series_label, line_color in panels:
    # The one-element unpacking requires that exactly one line is drawn per panel.
    temp_line, = panel_ax.plot(frame.index, frame, label=series_label, color=line_color)
    panel_ax.legend(loc='upper left')
plt.show()

In [None]:
# Repeat the outlier split with a tighter threshold and compare three series:
# the original, the flagged outliers, and what remains.
# There is deliberately no plt.clf() here: plt.subplots() below creates its own
# fresh figure, and calling clf() when no figure is current first creates (and
# then displays) an empty one.
zscore_threshold = 2  # points further than this many standard deviations from the mean are treated as outliers
outliers = (abs_z_scores > zscore_threshold)  # boolean mask, True where |z| exceeds the threshold
df_outliers = df[outliers]
df_nonoutliers = df[~outliers]

fig, ax = plt.subplots(3,1)
panels = [
    (df, 'original', 'r'),
    (df_outliers, 'outliers', 'm'),
    (df_nonoutliers, 'non-outliers', 'c'),
]
for panel_ax, (frame, series_label, line_color) in zip(ax, panels):
    panel_ax.plot(frame.index, frame, label=series_label, color=line_color)
    panel_ax.legend(loc='upper left')
    panel_ax.set_ylim([100, 700])  # identical y-range on every panel so they compare directly
plt.show()

In [None]:
# Discard every row that contains a missing value and keep the result as `df`.
df = df.dropna()

In [None]:
# Plot the series again after rows with missing values were dropped.
df.plot()
plt.show()

In [None]:
# Inspect the index that the resampling steps below operate on.
df.index

In [None]:
# Sampling using resample and mean
sampling_set = df['Thousands of Passengers'].resample('YE').mean()  # one sample = average over time ('A' represents year)
sampling_set.head()

In [None]:
# Sampling using asfreq (ffill = forward fill, bfill = backward fill)
freq_set = df['Thousands of Passengers'].asfreq(freq='QE', method='ffill')     # one sample = data selected at end of time
freq_set.head()

In [None]:
# Overlay the two down-sampled series on a single set of axes.
sample_ax = sampling_set.plot(figsize=(16,6))   # yearly means from resample('YE'), solid line
freq_set.plot(ax=sample_ax, style=':')          # quarter-end values from asfreq('QE'), dotted line
sample_ax.legend(['resample','asfreq'], loc='upper right')
plt.show()

In [None]:
# Shifting: move every value `shifted_period` rows later while the index stays
# in place, so the first `shifted_period` rows become NaN.
shifted_period = 1
df_shifted = df.shift(periods=shifted_period)
df_shifted.head()

In [None]:
# Compare the original series (solid) with its shifted copy (dotted).
passengers_col = 'Thousands of Passengers'
fig = plt.figure(figsize=(16,6))
plt.plot(df.index, df[passengers_col])
plt.plot(df_shifted.index, df_shifted[passengers_col], ':')
plt.legend(['Original','Shifted'],loc='upper right')
plt.show()

## Smoothing: Moving Average

In [None]:
# Trailing simple moving average over 6 rows (the current row and the 5 before
# it); the first 5 rows have no full window and are NaN.
df['6-month-SMA'] = df['Thousands of Passengers'].rolling(window=6).mean()

In [None]:
# Trailing simple moving average over 12 rows; the first 11 rows are NaN.
df['12-month-SMA'] = df['Thousands of Passengers'].rolling(window=12).mean()

In [None]:
# Same 6-row average, but with the window centred on each row instead of ending
# at it, so rows at both ends of the series lack a full window and are NaN.
df['6-month-SMA-Center'] = df['Thousands of Passengers'].rolling(window=6,center=True).mean()

In [None]:
# First 15 rows: shows where each moving-average column starts having values.
df.head(15)

In [None]:
# Last 15 rows: the centred average runs out of data at the end of the series.
df.tail(15)

In [None]:
# Plot the original series together with every moving-average column.
# There is deliberately no plt.clf() here: df.plot() opens its own figure, and
# calling clf() when no figure is current first creates (and then displays) an
# empty one.
df.plot(figsize=(10,8))
plt.show()

## Smoothing: Exponential Smoothing

In [None]:
# Exponentially weighted moving average with smoothing factor alpha=0.3.
# adjust=False selects the recursive form: y[0] = x[0], then
# y[t] = (1 - alpha) * y[t-1] + alpha * x[t].
df['EWMA-0.3'] = df['Thousands of Passengers'].ewm(alpha=0.3,adjust=False).mean()

In [None]:
# Same recursion with alpha=0.6: more weight on the newest observation, so the
# smoothed curve follows the raw data more closely than with alpha=0.3.
df['EWMA-0.6'] = df['Thousands of Passengers'].ewm(alpha=0.6,adjust=False).mean()

In [None]:
# Compare the raw series with the two exponentially smoothed versions.
df[['Thousands of Passengers','EWMA-0.3','EWMA-0.6']].plot(figsize=(10,8))
plt.show()

## Decomposition

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Split the series into trend, seasonal and residual components. No `model` or
# `period` is passed, so statsmodels' defaults apply (additive model).
# NOTE(review): the seasonal period is presumably inferred from the datetime
# index — confirm it comes out as 12 for this monthly data.
result = seasonal_decompose(df['Thousands of Passengers'])
# result.plot() returns the figure holding the component panels; enlarge it
# before it is shown.
fig = result.plot()
fig.set_size_inches(15, 8)
plt.show()

In [None]:
# Trend component of the decomposition on its own.
result.trend.plot()
plt.show()

In [None]:
# Seasonal component of the decomposition on its own.
result.seasonal.plot()
plt.show()

In [None]:
# Residual component: what is left after removing trend and seasonality.
result.resid.plot()
plt.show()