# Transformation and Decomposition. Task 2

    author: Oleg Naidovich

In [88]:
from tsdata.raw import available_data, load_data

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
plt.rcParams["figure.figsize"] = (18, 8)
pd.set_option('display.max_columns', 500)

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.distributions.empirical_distribution import ECDF
import scipy.stats as st

from sklearn.metrics import mean_absolute_error


import warnings
warnings.filterwarnings('ignore')

In [120]:
class color:
    """ANSI escape sequences for styling text printed to a terminal.

    Concatenate one or more codes in front of a string and finish with
    ``END`` so the styling does not leak into later output.
    """

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'

    # Reset every attribute and colour set above.
    END = '\033[0m'


def plotSeries(data, legend='', title=''):
    """Plot a single series on a new wide figure.

    Parameters
    ----------
    data
        Values handed straight to ``plt.plot``.
    legend : str
        Text of the single legend entry (placed in the upper-right corner).
    title : str
        Figure title.

    The figure is created and decorated but not shown; no value is
    returned.
    """
    figure_size = (20, 5)
    legend_entries = [legend]

    plt.figure(figsize=figure_size)
    plt.plot(data, '-d', color='navy', markersize=3)

    # Decoration of the current axes.
    plt.title(title)
    plt.grid(linestyle=':', color='k')
    plt.legend(legend_entries, loc='upper right')


def plotDecomposition(decomposition):
    """Show trend, seasonal and residual components in three stacked panels.

    Parameters
    ----------
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` attributes
        that support ``dropna()`` (e.g. the result of
        ``seasonal_decompose``).

    The seasonal and residual panels also get a dashed horizontal line at
    the component mean; the seasonal title reports the max-min range.
    """
    trend_part = decomposition.trend.dropna()
    season_part = decomposition.seasonal.dropna()
    resid_part = decomposition.resid.dropna()

    def _decorate(panel_title):
        # Legend, dotted grid and title shared by all three panels.
        plt.legend(loc='upper right')
        plt.grid(linestyle=':', color='k')
        plt.title(panel_title)

    def _mean_line(component):
        # Dashed black line spanning the component's index at its mean.
        average = component.mean()
        plt.plot([component.index[0], component.index[-1]],
                 average*np.array([1, 1]), '--k',
                 label=f"mean = {average:.3g}")

    plt.subplots(3, 1, figsize=(20, 21))

    # Panel 1: trend.
    plt.subplot(3, 1, 1)
    plt.plot(trend_part, color='navy', markersize=3, label='trend')
    _decorate("Trend")

    # Panel 2: seasonal component with its mean.
    plt.subplot(3, 1, 2)
    plt.plot(season_part, '-gd', markersize=3, label='seasonal')
    _mean_line(season_part)
    seasonal_range = season_part.max() - season_part.min()
    _decorate(f"Seasonal : range={seasonal_range:.3g}")

    # Panel 3: residuals. The series is drawn as stored on the
    # decomposition (without dropna); only the mean uses the cleaned copy.
    plt.subplot(3, 1, 3)
    plt.plot(decomposition.resid, '-o', color='maroon',
             markersize=3, label='residuals')
    _mean_line(resid_part)
    _decorate("Residuals")

    plt.show()


def plotACF(data, decomposition, lags=50):
    """Plot autocorrelation of the target and of each decomposition part.

    Four correlograms (target, trend, seasonal, residuals) are stacked in
    a single figure, one per row.

    Parameters
    ----------
    data
        The original series, passed to ``plot_acf`` as is.
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` attributes
        that support ``dropna()``.
    lags : int
        Number of lags forwarded to every ``plot_acf`` call.
    """
    # Fixed: the method is `dropna`; `dronpna` raised AttributeError.
    trend = decomposition.trend.dropna()
    seasonal = decomposition.seasonal.dropna()
    resid = decomposition.resid.dropna()

    # (series, colour, title) for each row of the figure.
    panels = (
        (data, 'b', 'Autocorrelation of target'),
        (trend, 'navy', 'Autocorrelation of trend'),
        (seasonal, 'g', 'Autocorrelation of seasonal'),
        (resid, 'maroon', 'Autocorrelation of residuals'),
    )

    _, axes = plt.subplots(len(panels), 1, figsize=(15, 4*6))

    for ax, (series, shade, title) in zip(axes, panels):
        # Fixed: pass `ax` so each correlogram lands in the prepared grid
        # instead of opening its own figure and leaving the grid empty.
        plot_acf(
            series,
            ax=ax,
            lags=lags,
            vlines_kwargs={'color': shade},
            markerfacecolor=shade,
            markeredgecolor=shade,
            title=title
        )

    plt.show()


def plotPACF(data, decomposition, lags=36):
    """Plot correlograms of the target and partial ones of each component.

    Each plot is drawn in its own figure: the autocorrelation of ``data``
    followed by the partial autocorrelation of trend, seasonal and
    residual components.

    Parameters
    ----------
    data
        The original series.
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` attributes
        that support ``dropna()``.
    lags : int
        Number of lags forwarded to every plotting call.
    """
    # Fixed: the method is `dropna`; `dronpna` raised AttributeError.
    trend = decomposition.trend.dropna()
    seasonal = decomposition.seasonal.dropna()
    resid = decomposition.resid.dropna()

    # Fixed: plot the `data` argument rather than the undefined global
    # `dataStat`.
    # NOTE(review): the target panel uses plot_acf (not plot_pacf) and is
    # titled accordingly; kept as written - confirm this is intended.
    plot_acf(
        data,
        lags=lags,
        vlines_kwargs={'color': 'b'},
        markerfacecolor='b', markeredgecolor='b',
        title='Autocorrelation of target'
    )

    plot_pacf(
        trend,
        lags=lags,
        vlines_kwargs={'color': 'navy'},
        markerfacecolor='navy',
        markeredgecolor='navy',
        title='Partial autocorrelation of trend'
    )

    # Best effort for the remaining components: a failure is reported and
    # the other plots are still produced.
    try:
        plot_pacf(
            seasonal,
            lags=lags,
            vlines_kwargs={'color': 'g'},
            markerfacecolor='g',
            markeredgecolor='g',
            title='Partial autocorrelation of seasonal'
        )
    except Exception as exc:
        print(exc)

    try:
        plot_pacf(
            resid,
            lags=lags,
            vlines_kwargs={'color': 'maroon'},
            markerfacecolor='maroon',
            markeredgecolor='maroon',
            title='Partial autocorrelation of residuals'
        )
    except Exception as exc:
        print(exc)

    plt.show()


def getResidAnalytics(resid):
    """Visualise decomposition residuals and test them for stationarity.

    Produces, in order:

    1. a figure with the residual series (left) and its sideways
       histogram overlaid with fitted Laplace and normal densities (right);
    2. a figure comparing the empirical CDF of the residuals with both
       fitted CDFs, each labelled with its mean absolute error against
       the empirical CDF;
    3. the printed ADF and KPSS reports from ``getStationary``.

    Parameters
    ----------
    resid : pandas.Series
        Residual component; NaN values are dropped before any analysis.
    """
    resid = resid.dropna()
    # Not called `color`, so the module-level `color` class stays visible.
    line_color = 'maroon'

    plt.subplots(1, 2, figsize=(24, 8))

    plt.subplot(1, 2, 1)
    plt.plot(resid, '-', color=line_color)
    plt.grid(linestyle=':', color='k')
    plt.title("Residuals")

    # Fit both candidate distributions and evaluate their densities on a
    # grid covering the observed residual range.
    x_fit = np.linspace(resid.min(), resid.max(), 201)
    loc_laplace, scale_laplace = st.laplace.fit(resid)
    loc_norm, scale_norm = st.norm.fit(resid)
    y_fit_laplace = st.laplace.pdf(x_fit, loc_laplace, scale_laplace)
    y_fit_norm = st.norm.pdf(x_fit, loc_norm, scale_norm)

    plt.subplot(1, 2, 2)
    # NOTE(review): seaborn marks `distplot` as deprecated; kept so the
    # figure is unchanged - consider migrating to `histplot`.
    sns.distplot(resid, color=line_color, bins=100, vertical=True,
                 label="distribution of residuals")
    # Densities go on the x-axis because the histogram is drawn sideways.
    # NOTE(review): the labels call `scale` a std; for the Laplace fit it
    # is the scale parameter, not the standard deviation.
    plt.plot(y_fit_laplace, x_fit, '-b',
             label=f"approximation by Laplace distribution:\n  fitted mean = {loc_laplace:.4g}, fitted std = {scale_laplace:.4g}")
    plt.plot(y_fit_norm, x_fit, '-g',
             label=f"approximation by normal distribution:\n  fitted mean = {loc_norm:.4g}, fitted std = {scale_norm:.4g}")
    plt.legend()
    plt.title("Distribution of residuals")
    plt.grid(linestyle=':', color='k')

    plt.show()

    # Empirical CDF evaluated at the sorted residuals, to compare with the
    # fitted CDFs at the same points.
    resid_arr = resid.sort_values().values
    ecdf_resid = ECDF(resid)(resid_arr)

    cdf_norm = st.norm.cdf(resid_arr, loc=loc_norm, scale=scale_norm)
    cdf_laplace = st.laplace.cdf(
        resid_arr, loc=loc_laplace, scale=scale_laplace)

    mae_norm = mean_absolute_error(ecdf_resid, cdf_norm)
    mae_laplace = mean_absolute_error(ecdf_resid, cdf_laplace)

    plt.subplots(1, 1, figsize=(20, 8))
    plt.plot(resid_arr, ecdf_resid, '-', color='maroon')
    plt.plot(resid_arr, cdf_norm, '-g',
             label=f"Normal approx : MAE = {mae_norm:.3g}")
    plt.plot(resid_arr, cdf_laplace, '-b',
             label=f"Laplace approx: MAE = {mae_laplace:.3g}")
    plt.legend()
    plt.title("CDF of decomposition residuals")
    plt.show()

    # Fixed: the helper in this file is `getStationary`; the previous
    # `get_stationary` name raised NameError.
    getStationary(resid)


def adffullerTest(data, alpha=0.05):
    """Run the Augmented Dickey-Fuller test and print a coloured verdict.

    Parameters
    ----------
    data
        Series whose ``.values`` are passed to ``adfuller``.
    alpha : float
        Significance level; a p-value below it rejects the null hypothesis
        of non-stationarity.

    Prints the test statistic, the p-value and the verdict; returns
    nothing.
    """
    print("==== Augmented Dickey–Fuller (Null hypothesis - The process is non-stationary) ====")

    # Only the first two items of the result are used: statistic, p-value.
    adf_stat, p_value = adfuller(data.values, autolag='AIC')[:2]
    print(f'ADF Statistic: {adf_stat}')
    print(f'p-value: {p_value}')

    verdict = (
        color.BOLD + color.GREEN + " stationary " + color.END
        if p_value < alpha
        else color.BOLD + color.RED + " non-stationary " + color.END
    )
    print("The process is" + verdict + "by ADF.\n")


def kpssTest(data, alpha=0.05):
    """Run the KPSS test and print a coloured verdict.

    Parameters
    ----------
    data
        Series whose ``.values`` are passed to ``kpss``.
    alpha : float
        Significance level; a p-value below it rejects the null hypothesis
        of stationarity.

    Prints the test statistic, the p-value and the verdict; returns
    nothing.
    """
    print('==== Kwiatkowski–Phillips–Schmidt–Shin (KPSS) test (Null hypothesis - The process is stationary) ====')

    # Only the first two items of the result are used: statistic, p-value.
    kpss_stat, p_value = kpss(data.values, regression='c')[:2]
    print("KPSS Statistic = " + str(kpss_stat))
    print("p-value = " + str(p_value))

    # Opposite null hypothesis to ADF: a small p-value means non-stationary.
    verdict = (
        color.BOLD + color.RED + " non-stationary " + color.END
        if p_value < alpha
        else color.BOLD + color.GREEN + " stationary " + color.END
    )
    print("The process is" + verdict + "by KPSS.\n")


def getStationary(data):
    """Print the ADF report followed by the KPSS report for ``data``.

    Both tests run with their default significance level.
    """
    for run_test in (adffullerTest, kpssTest):
        run_test(data)


# Gather data

In [121]:
# Load the Longley dataset bundled with statsmodels as a DataFrame.
df = sm.datasets.longley.load_pandas().data
# Per-column count of missing values, then the frame's dimensions.
print(df.isna().sum())
print('\nshape: ', df.shape)
df  # bare expression so the notebook renders the frame

TOTEMP     0
GNPDEFL    0
GNP        0
UNEMP      0
ARMED      0
POP        0
YEAR       0
dtype: int64

shape:  (16, 7)


Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
0,60323.0,83.0,234289.0,2356.0,1590.0,107608.0,1947.0
1,61122.0,88.5,259426.0,2325.0,1456.0,108632.0,1948.0
2,60171.0,88.2,258054.0,3682.0,1616.0,109773.0,1949.0
3,61187.0,89.5,284599.0,3351.0,1650.0,110929.0,1950.0
4,63221.0,96.2,328975.0,2099.0,3099.0,112075.0,1951.0
5,63639.0,98.1,346999.0,1932.0,3594.0,113270.0,1952.0
6,64989.0,99.0,365385.0,1870.0,3547.0,115094.0,1953.0
7,63761.0,100.0,363112.0,3578.0,3350.0,116219.0,1954.0
8,66019.0,101.2,397469.0,2904.0,3048.0,117388.0,1955.0
9,67857.0,104.6,419180.0,2822.0,2857.0,118734.0,1956.0


In [122]:
# Year-end timestamps used as the time index of the frame.
# NOTE(review): recent pandas releases deprecate the 'Y' alias in favour of
# 'YE' - confirm against the pandas version in use.
_index = pd.date_range('1947-01-01', '1963-01-01', freq='Y')
df.index = _index
# The year is now carried by the index, so the YEAR column is dropped.
df.drop(columns=['YEAR'], inplace=True)
df  # bare expression so the notebook renders the frame

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP
1947-12-31,60323.0,83.0,234289.0,2356.0,1590.0,107608.0
1948-12-31,61122.0,88.5,259426.0,2325.0,1456.0,108632.0
1949-12-31,60171.0,88.2,258054.0,3682.0,1616.0,109773.0
1950-12-31,61187.0,89.5,284599.0,3351.0,1650.0,110929.0
1951-12-31,63221.0,96.2,328975.0,2099.0,3099.0,112075.0
1952-12-31,63639.0,98.1,346999.0,1932.0,3594.0,113270.0
1953-12-31,64989.0,99.0,365385.0,1870.0,3547.0,115094.0
1954-12-31,63761.0,100.0,363112.0,3578.0,3350.0,116219.0
1955-12-31,66019.0,101.2,397469.0,2904.0,3048.0,117388.0
1956-12-31,67857.0,104.6,419180.0,2822.0,2857.0,118734.0


In [123]:
# Line chart of the raw columns of df, drawn with plotly express.
px.line(df)

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP
0,70551.0,116.9,554894.0,4806.0,3594.0,130081.0


In [137]:
def getNormalized(df):
    """Return a copy of ``df`` with every column divided by its maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``, in which each
        column's largest value becomes 1.
    """
    # One maximum per column, broadcast across the rows by NumPy.
    column_maxima = df.max().to_numpy()
    scaled_values = df.to_numpy() / column_maxima
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [139]:
# Scale every column by its own maximum so the series share one axis range.
dfNormalized = getNormalized(df)
dfNormalized  # bare expression so the notebook renders the frame

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP
1947-12-31,0.855027,0.710009,0.422223,0.490221,0.442404,0.827238
1948-12-31,0.866352,0.757057,0.467524,0.48377,0.40512,0.83511
1949-12-31,0.852872,0.754491,0.465051,0.766126,0.449638,0.843882
1950-12-31,0.867273,0.765612,0.512889,0.697253,0.459098,0.852769
1951-12-31,0.896104,0.822926,0.592861,0.436746,0.86227,0.861579
1952-12-31,0.902028,0.839179,0.625343,0.401998,1.0,0.870765
1953-12-31,0.921163,0.846878,0.658477,0.389097,0.986923,0.884787
1954-12-31,0.903758,0.855432,0.654381,0.744486,0.932109,0.893436
1955-12-31,0.935763,0.865697,0.716297,0.604245,0.84808,0.902422
1956-12-31,0.961815,0.894782,0.755424,0.587183,0.794936,0.91277


In [140]:
# Line chart of the max-scaled columns, drawn with plotly express.
px.line(dfNormalized)