In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import numpy as np
import string
import math
# Apply matplotlib's "ggplot" style sheet to the figures drawn afterwards.
plt.style.use("ggplot")

Univariate time-series

Some of the techniques are taken from the following website: https://www.machinelearningplus.com/time-series/time-series-analysis-python/

In [None]:
# Pick one AdminStrata of Yemen to analyze as a univariate time-series.
# The value is used as the column key into `ts` and in the plot title.
AdminStrata = "Abyan"

In [None]:
# Extract the selected AdminStrata column from `ts` as a one-column DataFrame
# and give that column the generic name "Metric".
t = ts[AdminStrata].to_frame().rename(columns={AdminStrata: "Metric"})
t.head()

In [None]:
# Report how many observations the isolated series contains.
n_observations = len(t)
print("The length (monthly based) of the current time-series is:", n_observations)

In [None]:
# Line plot of the whole series with a marker on every observation.
series_ax = t.plot(figsize=(11, 5), style=".-", title=AdminStrata + " (Yemen)", legend=False)
series_ax.set_ylabel("Metric")
series_ax.autoscale()

Plotting each year as a separate line in the same plot (seasonal plots). This lets you compare the year-wise patterns side by side.

In [None]:
# Annotate every observation with its calendar year and abbreviated month name.
ty = t.copy()
ty["Year"] = [stamp.year for stamp in ty.index]
ty["Month"] = [stamp.strftime("%b") for stamp in ty.index]
years = ty["Year"].unique()

# Draw one colour per year; the fixed seed keeps the choice reproducible and
# replace=False guarantees no colour is used twice.
np.random.seed(100)
mycolors = np.random.choice(list(mpl.colors.XKCD_COLORS.keys()), len(years), replace=False)

# One line per year over a shared month axis.
plt.figure(figsize=(11, 5))
for i, y in enumerate(years):
    # The first entry of `years` is deliberately not drawn.
    if i == 0:
        continue
    rows = ty.loc[ty.Year == y, :]
    plt.plot("Month", "Metric", data=rows, marker=".", color=mycolors[i], label=y)
    # Year label: x is the number of rows of that year minus 0.9,
    # y is the last "Metric" value of that year.
    plt.text(rows.shape[0] - .9, rows["Metric"].iloc[-1], y, fontsize=12, color=mycolors[i])
plt.xlabel("Month")
plt.ylabel("Metric")

You can group the data at seasonal intervals and see how the values are distributed within a given year or month and how they compare over time.


In [None]:
# Draw the year-wise and month-wise box plots side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.boxplot(x="Year", y="Metric", data=ty, ax=axes[0])
# Target the right-hand Axes explicitly: without `ax`, seaborn draws on
# whatever the current Axes happens to be, which only coincides with axes[1]
# as long as nothing else changes the pyplot state in between.
# NOTE(review): 2015 and 2019 are excluded from the month-wise plot,
# presumably because those years are only partially covered — confirm.
sns.boxplot(x="Month", y="Metric", data=ty.loc[~ty.Year.isin([2015, 2019]), :], ax=axes[1])

# Set Title.
axes[0].set_title("Year-wise Box Plot\n(The Trend)")
axes[1].set_title("Month-wise Box Plot\n(The Seasonality)")
plt.show()

A time-series can be expressed as either a sum or a product of the components:

Additive time series: Value = Base Level + Trend + Seasonality + Error

Multiplicative Time Series: Value = Base Level x Trend x Seasonality x Error

We can do a classical decomposition of a time series by considering the series as an additive or multiplicative combination of the base level, trend, seasonal index and the residual.

In [None]:
# Classical decomposition of the "Metric" series under both models.
result_mul = seasonal_decompose(ty["Metric"], model="multiplicative", extrapolate_trend="freq")
result_add = seasonal_decompose(ty["Metric"], model="additive", extrapolate_trend="freq")

# Plot each decomposition under its own title, multiplicative first.
for decomposition, heading in (
    (result_mul, "Multiplicative Decompose"),
    (result_add, "Additive Decompose"),
):
    decomposition_fig = decomposition.plot()
    decomposition_fig.suptitle(heading)
    plt.autoscale()
plt.show()

One step to take before forecasting is to check the stationarity of the time-series, because many time-series models work on the assumption that the series is stationary. A time-series is said to be stationary if its statistical properties, such as its mean and variance, remain constant over time, i.e. if it has no trend, no seasonal effects and no other structure that depends on the time index.

Let’s plot a histogram to see what the underlying distribution looks like.

In [None]:
# Histogram of the "Metric" values; bars are drawn at 90% of the bin width.
hist_ax = t["Metric"].hist(rwidth=0.9)
hist_ax.set_xlabel("Metric")

From this histogram, I’m pretty confident that we could have a stationary dataset; otherwise, we would see something much less ‘gaussian-shaped’ due to trending and/or seasonality (e.g., we would see more data plotted to the left or right).

Returning to statistical properties, an approach is to look at the mean and variance of multiple sections of the data (splitting your time-series into two (or more) partitions) and compare them, remembering that if the data is stationary, the means/variances should be similar.

In [None]:
# Cut the series at its midpoint into two consecutive partitions.
split = len(t) // 2
metric = t["Metric"]
one = metric.iloc[:split]
two = metric.iloc[split:]

In [None]:
# Mean and variance of each partition, printed as (first, second) pairs.
mean1 = one.mean()
mean2 = two.mean()
var1 = one.var()
var2 = two.var()

print(mean1, mean2)
print(var1, var2)

Another approach is to use statistical tests. The Augmented Dickey-Fuller test is a type of statistical test called a unit root test. The intuition behind a unit root test is that it determines how strongly a time series is defined by a trend. The augmented Dickey–Fuller (ADF) statistic, used in the test, is a negative number. The more negative it is, the stronger the rejection of the hypothesis that there is a unit root at some level of confidence.

p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Run the Augmented Dickey-Fuller test on the "Metric" series and report the
# test statistic, the p-value and the critical values.
result = adfuller(t.Metric)
adf_statistic, p_value, critical_values = result[0], result[1], result[4]
print("ADF Statistic: %f" % adf_statistic)
print("p-value: %f" % p_value)
print("Critical Values:")
for level, threshold in critical_values.items():
    print("\t%s: %.3f" % (level, threshold))