In [21]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

from analytics_utils.autocorrelation import autocorrelation
from analytics_utils.correlate import correlate
from analytics_utils.decomposers import seasonal
from analytics_utils.describe_data import describe_data
from analytics_utils.ewm import ewm
from analytics_utils.interpolate import interpolate
from analytics_utils.linear_regression import linear_regression
from analytics_utils.partial_autocorrelation import partial_autocorrelation
from analytics_utils.roll import roll

In [22]:
# Perform the Dickey-Fuller test and print a labelled summary
def dftest(df):
    """Run ``adfuller`` on ``df`` (``autolag='AIC'``) and print the result.

    The first four entries of the ``adfuller`` result are printed under the
    labels 'Test Statistic', 'p-value', '#Lags Used' and
    'Number of Observations Used'; every item of the fifth entry (a mapping)
    is appended as 'Critical Value (<key>)'.

    Nothing is returned; the summary is only printed.
    """
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(df, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    # adf_result[4] maps a significance level to its critical value
    for level, critical_value in adf_result[4].items():
        summary['Critical Value (%s)' % level] = critical_value
    print(summary)

In [23]:
# Load dataset
# Alternative sources kept for reference:
# df = pd.read_csv('temp/bear_log.csv', parse_dates={'datetime': ['date', 'time']}, index_col='datetime').iloc[:1000,2:]
# df = pd.read_csv('temp/repsol.csv', parse_dates=['data'], index_col='data')[:1000]

# The file location can be overridden through the WIND_SPEED_DATA_PATH environment
# variable so the notebook is not tied to one machine; the default is the original path.
DATA_PATH = os.environ.get(
    'WIND_SPEED_DATA_PATH',
    '/home/ferraz/dev/my/prediction-wind-speed-mp/confidential/train150.txt',
)

# Tab-separated file read without a header row, so the columns are labelled with
# integers (0, 1, 2, ...). Only the first 1000 rows are kept.
df = pd.read_csv(DATA_PATH, delimiter='\t', header=None)[:1000]

In [24]:
# Show the loaded dataset
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,30,11,2015,14,13.012139,75.105481,27.516129,72.930636,1020.422601
1,30,11,2015,15,12.726087,68.334332,27.238095,75.212121,1020.394348
2,30,11,2015,16,12.081111,64.457865,27.105263,75.741379,1020.508333
3,30,11,2015,17,11.647222,53.842100,26.305556,75.302632,1020.611000
4,30,11,2015,18,11.064444,53.945279,25.464286,76.592593,1020.866500
5,30,11,2015,19,10.324444,51.320714,24.764706,83.344444,1021.286500
6,30,11,2015,20,9.862778,47.354507,24.310811,88.044643,1021.627500
7,30,11,2015,21,9.598889,41.816108,24.304348,91.329670,1021.892333
8,30,11,2015,22,9.488889,39.561270,24.500000,92.415385,1021.856167
9,30,11,2015,23,7.502222,37.881537,24.419355,93.629630,1021.834667


---

## describe_data

describe_data(
    data_frame: pd.DataFrame, lang: str = "pt", headers: [str] = None
) -> pd.DataFrame

- lang types: {'en', 'pt'}

In [25]:
# Show the descriptive statistics of the dataset, with English labels (lang='en')
describe_data(df, lang='en')

Unnamed: 0_level_0,max,min,mean,median,1-quartile,3-quartile,variance,standard deviation,absolute deviation,amplitude,rms,kurtosis,skewness,count,NaNs
header,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,30.0,1.0,12.087273,12.0,6.0,18.0,47.489637,6.891273,5.851398,29.0,13.910624,-0.747118,0.198428,550,0
1,12.0,11.0,11.981818,12.0,12.0,12.0,0.017884,0.13373,0.035702,1.0,11.982563,50.487216,-7.232125,550,0
2,2015.0,2015.0,2015.0,2015.0,2015.0,2015.0,0.0,0.0,0.0,0.0,2015.0,0.0,0.0,550,0
3,23.0,0.0,11.496364,11.0,5.25,17.75,48.173939,6.940745,6.018169,23.0,13.425823,-1.21043,0.001558,550,0
4,13.572222,1.211905,8.176719,8.32,6.215278,10.146806,7.425558,2.724988,2.226815,12.360317,8.61805,-0.604439,-0.116473,550,0
5,356.553715,1.448179,73.027647,59.019307,38.605367,83.329498,4224.514238,64.996263,36.885553,355.105536,97.723439,10.090187,3.102942,550,0
6,30.140625,20.616279,24.443445,24.336134,23.314815,25.359375,2.787691,1.669638,1.287227,9.524346,,0.81696,0.680716,549,1
7,98.0,46.355556,85.263578,86.765805,80.942222,91.434531,73.608878,8.579562,6.592405,51.644444,85.693365,2.13121,-1.253161,550,0
8,1023.7265,937.534333,1017.313986,1020.04675,1016.544542,1021.497917,77.372876,8.796185,5.065583,86.192167,1017.351944,22.104152,-4.065313,550,0


---

## correlate

correlate(
    data_frame: pd.DataFrame, method: str = "pearson", min_periods: int = 1
) -> pd.DataFrame

- method types: {‘pearson’, ‘kendall’, ‘spearman’}

In [None]:
# Compute the correlation matrix of the dataset (correlate called with its default arguments) and display it
corr = correlate(df)
display(corr)

In [None]:
# Plot the correlation matrix as a heatmap, masking the upper triangle and the diagonal
plt.figure(figsize=(12, 9))
# Non-zero entries mark masked cells: np.triu keeps the upper triangle
# (diagonal included) of an all-ones array and zeroes everything below it.
mask = np.triu(np.ones_like(corr))
with sns.axes_style("white"):
    ax = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, cmap='coolwarm', square=True)

---

## interpolate

- Please note that only method='linear' is supported for DataFrame/Series with a MultiIndex.
- method types: {‘linear’, ‘time’, ‘index’, ‘values’, ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘barycentric’, ‘krogh’, ‘polynomial’, ‘spline’, ‘piecewise_polynomial’, ‘pchip’}

In [None]:
# Linearly interpolate column zero and display the result
inter_linear = interpolate(df.iloc[:,0], method="linear")
# The dataset is read with header=None, so the column label is the integer 0.
# Assuming interpolate keeps that label as the Series name (TODO confirm),
# `name + "_linear"` would raise TypeError (int + str); convert to str first.
inter_linear = inter_linear.rename(str(inter_linear.name) + "_linear")
display(inter_linear)

In [None]:
# Interpolate column zero with the cubic method and display the result
inter_cubic = interpolate(df.iloc[:,0], method="cubic")
# The dataset is read with header=None, so the column label is the integer 0.
# Assuming interpolate keeps that label as the Series name (TODO confirm),
# `name + "_cubic"` would raise TypeError (int + str); convert to str first.
inter_cubic = inter_cubic.rename(str(inter_cubic.name) + "_cubic")
display(inter_cubic)

In [None]:
# Put column zero next to its linear and cubic interpolations, one column each
wide_df = pd.concat(
    [df.iloc[:, 0].copy(), inter_linear, inter_cubic],
    axis=1,
)
display(wide_df)

In [None]:
# Plot column zero against each interpolation: linear on the top panel, cubic on the bottom
zero_df = df.iloc[:, 0].copy()
linear_df, cubic_df = (
    pd.concat([zero_df, interpolated], axis=1)
    for interpolated in (inter_linear, inter_cubic)
)

# Two stacked panels on a single figure
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 12))

sns.lineplot(data=linear_df, ax=ax1)
sns.lineplot(data=cubic_df, ax=ax2)

---

## roll (rolling window)

def roll(
    data_frame: pd.DataFrame,
    window: int,
    roll_type: str = "mean",
    headers: [str] = None,
) -> pd.DataFrame

- roll_type: {‘mean’, ‘var’, 'std'}

In [None]:
# Rolling mean, std and var (window of 24) of the cubic-interpolated column
roll_df = cubic_df.iloc[:, 1].copy()

# One pass per statistic; each result gets the statistic appended to its own name
rolled = {}
for kind in ("mean", "std", "var"):
    stat = roll(roll_df, 24, roll_type=kind)
    rolled[kind] = stat.rename(stat.name + "_" + kind)
m_roll, std_roll, var_roll = rolled["mean"], rolled["std"], rolled["var"]

wide_rdf = pd.concat([roll_df, m_roll, std_roll, var_roll], axis=1)
display(wide_rdf)

In [None]:
# Plot the series together with its rolling mean, std and var
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.lineplot(data=wide_rdf, ax=ax)

In [None]:
# Run the Dickey-Fuller test on the series (NaNs dropped) before the rolling mean is subtracted
dftest(roll_df.dropna())

## roll - making a time series stationary

In [None]:
# Subtract the rolling mean from the series, then compute the rolling
# mean, std and var (window of 24) of the result
roll_df_stat = (roll_df - m_roll).copy()

m_roll_stat, std_roll_stat, var_roll_stat = [
    roll(roll_df_stat, 24, roll_type=kind).rename("stat_" + kind)
    for kind in ("mean", "std", "var")
]

wide_rdf_stat = pd.concat(
    [roll_df_stat, m_roll_stat, std_roll_stat, var_roll_stat],
    axis=1,
)
display(wide_rdf_stat)

In [None]:
# Plot the mean-subtracted series together with its rolling mean, std and var
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.lineplot(data=wide_rdf_stat, ax=ax)

In [None]:
# Run the Dickey-Fuller test on the series after subtracting its rolling mean (NaNs dropped)
dftest(roll_df_stat.dropna())

---

## ewm (exponential weighted moving)

def ewm(
        data_frame: pd.DataFrame,
        com: float = None,
        span: float = None,
        halflife: float = None,
        alpha: float = None,
        ignore_na: bool = False,
        ewm_type: str = "mean",
        headers: [str] = None,
    ) -> pd.DataFrame

- com: α=1/(1+com), for com≥0
- span: α=2/(span+1), for span≥1
- halflife: α=1−exp(log(0.5)/halflife), for halflife>0
- alpha: 0<α≤1
- ewm_type: {‘mean’, ‘var’, 'std'}

In [None]:
# Exponentially weighted mean, std and var (halflife of 24) of the cubic-interpolated column
ewm_df = cubic_df.iloc[:, 1].copy()

# One pass per statistic; each result gets the statistic appended to its own name
ewm_stats = {}
for kind in ("mean", "std", "var"):
    weighted = ewm(ewm_df, halflife=24, ewm_type=kind)
    ewm_stats[kind] = weighted.rename(weighted.name + "_" + kind)
m_ewm, std_ewm, var_ewm = ewm_stats["mean"], ewm_stats["std"], ewm_stats["var"]

wide_edf = pd.concat([ewm_df, m_ewm, std_ewm, var_ewm], axis=1)
display(wide_edf)

In [None]:
# Plot the series together with its exponentially weighted mean, std and var
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.lineplot(data=wide_edf, ax=ax)

In [None]:
# Run the Dickey-Fuller test on the series itself (NaNs dropped); the ewm mean has not been subtracted here
dftest(ewm_df.dropna())

## ewm - making a time series stationary

In [None]:
# Subtract the exponentially weighted mean from the series, then compute the
# exponentially weighted mean, std and var (halflife of 24) of the result
ewm_df_stat = (ewm_df - m_ewm).copy()  # or ewm_df - ewm_df.shift() for differencing

m_ewm_stat, std_ewm_stat, var_ewm_stat = [
    ewm(ewm_df_stat, halflife=24, ewm_type=kind).rename("stat_" + kind)
    for kind in ("mean", "std", "var")
]

wide_edf_stat = pd.concat(
    [ewm_df_stat, m_ewm_stat, std_ewm_stat, var_ewm_stat],
    axis=1,
)
display(wide_edf_stat)

In [None]:
# Plot the mean-subtracted series together with its exponentially weighted mean, std and var
fig, ax = plt.subplots(figsize=(16, 12))
ax = sns.lineplot(data=wide_edf_stat, ax=ax)

In [None]:
# Run the Dickey-Fuller test on the series after subtracting its exponentially weighted mean (NaNs dropped)
dftest(ewm_df_stat.dropna())

---

## seasonal

def seasonal(
    data_frame: pd.DataFrame,
    model: str = "additive",
    filt: [] = None,
    freq: int = None,
    two_sided: bool = True,
    extrapolate_trend: int = 0,
    lang: str = "pt",
    headers: [str] = None,
) -> pd.DataFrame

- model: {“additive”, “multiplicative”}

## decompose additive

In [None]:
# Additive seasonal decomposition of the cubic-interpolated series (freq=24)
decompose_add = seasonal(inter_cubic.dropna(), model='additive', freq=24)

# Row 0 of the result is read column by column (0-3) and each entry is
# given the component label used in this notebook.
observed_add, seasonal_add, trend_add, resid_add = (
    decompose_add.iloc[0, position].rename(label)
    for position, label in enumerate(("observed", "seasonal", "trend", "resid"))
)

wide_df_add = pd.concat([observed_add, seasonal_add, trend_add, resid_add], axis=1)
wide_df_add

In [None]:
# Plot the additive decomposition: observed, seasonal, trend and residual, top to bottom
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 12))

sns.lineplot(data=observed_add, ax=ax1)
sns.lineplot(data=seasonal_add, ax=ax2)
sns.lineplot(data=trend_add, ax=ax3)
sns.lineplot(data=resid_add, ax=ax4)

## decompose multiplicative

In [None]:
# Multiplicative seasonal decomposition of the cubic-interpolated series (freq=24)
decompose_mult = seasonal(inter_cubic.dropna(), model='multiplicative', freq=24)

# Row 0 of the result is read column by column (0-3) and each entry is
# given the same component label as in the additive decomposition.
observed_mult = decompose_mult.iloc[0,0].rename('observed')
seasonal_mult = decompose_mult.iloc[0,1].rename('seasonal')
# Was mislabelled 'tend'; the additive cell names the same component 'trend'
trend_mult = decompose_mult.iloc[0,2].rename('trend')
resid_mult = decompose_mult.iloc[0,3].rename('resid')

wide_df_mult = pd.concat([observed_mult, seasonal_mult, trend_mult, resid_mult], axis=1)
wide_df_mult

In [None]:
# Plot the multiplicative decomposition: observed, seasonal, trend and residual, top to bottom
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 12))

sns.lineplot(data=observed_mult, ax=ax1)
sns.lineplot(data=seasonal_mult, ax=ax2)
sns.lineplot(data=trend_mult, ax=ax3)
sns.lineplot(data=resid_mult, ax=ax4)

---

## autocorrelation

def autocorrelation(
    data_frame: pd.DataFrame,
    unbiased: bool = False,
    nlags: int = 40,
    fft: bool = None,
    alpha: float = None,
    missing: str = "none",
    headers: [str] = None,
) -> pd.DataFrame

- missing: {‘none’, ‘raise’, ‘conservative’, ‘drop’}

In [None]:
# Compute the autocorrelation of the observed series (fft=False) and show the result
auto = autocorrelation(observed_mult, fft=False)
auto

In [None]:
# Plot the autocorrelation values computed in the previous cell
sns.lineplot(data=auto)

In [None]:
# Cross-check with the statsmodels autocorrelation plot (lags=40); plot_acf is imported in the first cell.
# The trailing semicolon keeps Jupyter from also rendering the returned Figure,
# which would show the same plot twice.
plot_acf(observed_mult, lags=40);

---

## partial_autocorrelation

def partial_autocorrelation(
    data_frame: pd.DataFrame,
    nlags: int = 40,
    method: str = "ywunbiased",
    alpha: float = None,
    headers: [str] = None,
) -> pd.DataFrame

- method: {‘yw’ or ‘ywunbiased’, ‘ywm’ or ‘ywmle’, ‘ols’, ‘ols-inefficient’, ‘ols-unbiased’, ‘ld’ or ‘ldunbiased’, ‘ldb’ or ‘ldbiased’}

In [None]:
# Compute the partial autocorrelation of the observed series (default arguments) and show the result
partial_auto = partial_autocorrelation(observed_mult)
partial_auto

In [None]:
# Plot the partial autocorrelation values computed in the previous cell
sns.lineplot(data=partial_auto)

In [None]:
# Cross-check with the statsmodels partial autocorrelation plot (lags=40); plot_pacf is imported in the first cell.
# The trailing semicolon keeps Jupyter from also rendering the returned Figure,
# which would show the same plot twice.
plot_pacf(observed_mult, lags=40);

---

## linear_regression

def linear_regression(
    data_frame: pd.DataFrame,
    fit_intercept: bool = True,
    normalize: bool = False,
    copy_X: bool = True,
    n_jobs: int = None,
    offset: int = 1,
    regressors: [str] = None,
    predictors: [str] = None,
) -> pd.DataFrame

In [None]:
# Run linear_regression with "T_room" as both the regressor and the predictor and show the result.
# NOTE(review): df is read with header=None, so its column labels are the integers 0-8 and it has
# no "T_room" column. This looks like a leftover from another dataset; confirm the intended column.
regress = linear_regression(df, regressors=["T_room"], predictors=["T_room"])
regress

In [None]:
# Plot the output of linear_regression
sns.lineplot(data=regress)

In [None]:
# Plot the "T_room" column of the dataset.
# NOTE(review): df is read with header=None, so its column labels are integers and
# df.loc[:, "T_room"] is expected to raise KeyError; confirm the intended column.
sns.lineplot(data=df.loc[:,"T_room"])