In [None]:
# !pip install plotly

In [None]:
from tsdata.raw import available_data, load_data

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
plt.rcParams["figure.figsize"] = (18, 8)

import scipy.stats as st
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.distributions.empirical_distribution import ECDF

from sklearn.metrics import mean_absolute_error #r2_score, median_absolute_error, 

import warnings
warnings.filterwarnings('ignore')

In [None]:
def plot_decomposition(decomposition):
    """Draw the components of a decomposition on three stacked panels.

    Parameters
    ----------
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` series
        (in this notebook, the result of ``seasonal_decompose``).

    The seasonal and residual panels also show a dashed line at the
    component mean; the seasonal title reports the max-min range.
    """
    def _add_mean_line(ax, component):
        # Horizontal dashed reference spanning the first to the last index label.
        mean_value = component.mean()
        ax.plot([component.index[0], component.index[-1]],
                mean_value*np.array([1, 1]), '--k',
                label=f"mean = {mean_value:.3g}")

    def _finish_panel(ax, title):
        ax.legend(loc='upper right')
        ax.grid(linestyle=':', color='k')
        ax.set_title(title)

    _, (trend_ax, seasonal_ax, resid_ax) = plt.subplots(3, 1, figsize=(20, 21))

    # Panel 1: trend
    trend_ax.plot(decomposition.trend, color='navy', markersize=3, label='trend')
    _finish_panel(trend_ax, "Trend")

    # Panel 2: seasonal component with its mean
    seasonal = decomposition.seasonal
    seasonal_ax.plot(seasonal, '-gd', markersize=3, label='seasonal')
    _add_mean_line(seasonal_ax, seasonal)
    _finish_panel(seasonal_ax,
                  f"Seasonal : range={(seasonal.max() - seasonal.min()):.3g}")

    # Panel 3: residuals with their mean
    residuals = decomposition.resid
    resid_ax.plot(residuals, '-o', color='maroon', markersize=3,  label='residuals')
    _add_mean_line(resid_ax, residuals)
    _finish_panel(resid_ax, "Residuals")

    plt.show()
    

def correlation_analysis(data, decomposition):
    """Plot autocorrelation functions of a series and of its components.

    One ``plot_acf`` figure is produced for each of: the target series, the
    trend, the seasonal component and the residuals. Every figure uses the
    maximum number of lags available for that series (``len(series) - 1``).

    Parameters
    ----------
    data
        The observed series.
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` series.
        NaN values are dropped from ``trend`` and ``resid`` before plotting;
        ``data`` and ``seasonal`` are plotted as given.
    """
    # (series, colour, title) for each figure, in display order.
    panels = [
        (data, 'b', 'Autocorrelation of target'),
        (decomposition.trend.dropna(), 'navy', 'Autocorrelation of trend'),
        (decomposition.seasonal, 'g', 'Autocorrelation of seasonal'),
        (decomposition.resid.dropna(), 'maroon', 'Autocorrelation of residuals'),
    ]

    for series, color, title in panels:
        plot_acf(series,
                 lags=len(series) - 1,
                 vlines_kwargs={'color': color},
                 markerfacecolor=color, markeredgecolor=color,
                 title=title)

    plt.show()
    
    
def partial_autocorrelation(data, decomposition, lags=36):
    """Plot partial autocorrelation functions of a series and of its components.

    Parameters
    ----------
    data
        The observed series.
    decomposition
        Object exposing ``trend``, ``seasonal`` and ``resid`` series.
        NaN values are dropped from ``trend`` and ``resid`` before plotting.
    lags
        Number of lags passed to every ``plot_pacf`` call (default 36).
    """
    # Each entry: (title suffix, colour, series to analyse).
    components = (
        ('target', 'b', data),
        ('trend', 'navy', decomposition.trend.dropna()),
        ('seasonal', 'g', decomposition.seasonal),
        ('residuals', 'maroon', decomposition.resid.dropna()),
    )

    for label, shade, series in components:
        plot_pacf(series,
                  lags=lags,
                  vlines_kwargs={'color': shade},
                  markerfacecolor=shade, markeredgecolor=shade,
                  title='Partial autocorrelation of ' + label)

    plt.show()

In [None]:
print(available_data())

# Tasks: 1, 2

Use the help function to explore what the series gafa_stock, PBS, vic_elec and pelt represent.

1. Use autoplot() to plot some of the series in these data sets.
2. What is the time interval of each series?

In [None]:
gafa_stock = load_data('gafa_stock')
PBS = load_data('PBS')
vic_elec = load_data('vic_elec')
pelt = load_data('pelt')

In [None]:
gafa_stock.head()

In [None]:
gafa_stock.Symbol.unique()
gafa_stock['Symbol'].unique()

<div style="background-color: rgb(175, 219, 245);">
<span style="color:blue">

**NOTE:** 

It is better to use `['column_name_string']` to access a column by name
    
instead of `df.column_name_string`: imagine, e.g., a column named `mean`, which would clash with the `DataFrame.mean` method.

</span>
</div>

In [None]:
gafa_stock.Date = pd.to_datetime(gafa_stock.Date)

In [None]:
# One figure per company; colours are paired with symbols in order of appearance.
company_names = gafa_stock['Symbol'].unique()
colors = ['red', 'navy', 'blue', 'green']

for color, company_name in zip(colors, company_names):
    # Filter once and reuse the rows for both axes.
    company_rows = gafa_stock[gafa_stock['Symbol'] == company_name]
    plt.figure(figsize=(20, 5))
    plt.plot(company_rows['Date'], company_rows['Close'], '-d', color=color, markersize=1)
    plt.legend(['Close price',], loc='upper right')
    plt.grid(linestyle=':', color='k')
    plt.title(f'{company_name}: Close prices for stocks')

# What is the time interval of each series?
# day interval without weekends

<div style="background-color: rgb(255, 255, 224);">
<span style="color: rgb(186, 22, 12)">

**ALERT:** 

And what information can we extract from these plots?
    
General remark - there is a lack of conclusions, ideas, hypotheses...
    
**I suspect** these data are not equidistant

</span>
</div>


<div style="background-color: rgb(255, 255, 224);">
<span style="color: rgb(186, 22, 12)">

**ALERT:** 

Are you sure that your data are equidistant, i.e., that there are no missing months?

</span>
</div>

In [None]:
PBS.head()

In [None]:
PBS[(PBS.Concession == 'Concessional') & (PBS.Type == 'Co-payments') & (PBS.ATC1 == 'A') & (PBS.ATC2_desc == 'STOMATOLOGICAL PREPARATIONS')]

<div style="background-color: rgb(175, 219, 245);">
<span style="color:blue">

**NOTE:** 

It will be better to create new table dropping all columns that now have a single value.

</span>
</div>

In [None]:
fig = px.line(PBS[(PBS.Concession == 'Concessional') & (PBS.Type == 'Co-payments') & 
                  (PBS.ATC1 == 'A') & (PBS.ATC2_desc == 'STOMATOLOGICAL PREPARATIONS')], 
              x='Month', y='Cost', title='Cost of STOMATOLOGICAL PREPARATIONS', markers=True)
fig.show()

# monthly interval

## `vic_elec`

In [None]:
vic_elec

In [None]:
vic_elec.info()

<div style="background-color: rgb(255, 218, 233);">
<span style="color: rgb(206, 32, 41)">

**ERROR:** 

Everywhere the classical EDA is missed.
    
Are your timemarks monotonic? Are they equidistant?

</span>
</div>

In [None]:
# (start label or None for the full range, extra format args, title) per zoom level
demand_views = (
    (None, (), 'Demand of the whole time range'),
    ('2014-01-01', (), 'Demand of last 2014 year'),
    ('2014-12-01', ('-o',), 'Demand of last month'),
    ('2014-12-25', ('-o',), 'Demand of last week'),
)

for start, fmt_args, title in demand_views:
    if start is None:
        demand = vic_elec['Demand']
    else:
        demand = vic_elec.loc[start:, 'Demand']

    plt.figure(figsize=(20, 5))
    plt.plot(demand, *fmt_args, color='navy')
    plt.title(title)
    plt.grid(linestyle=':', color='k')
    plt.show()

<div style="background-color: rgb(255, 218, 233);">
<span style="color: rgb(206, 32, 41)">

**ERROR:** 

That is neither "last month" nor "last week"

</span>
</div>

In [None]:
pelt.head()

In [None]:
fig = px.line(pelt, x='Year', y=['Hare', 'Lynx'], title='Hare and Lynx changes', markers=True)
fig.show()

# year interval

# Task: 3 

Download the file tute1.csv from the book website, open it in Excel (or some other spreadsheet application), and review its contents. You should find four columns of information. Columns B through D each contain a quarterly series, labelled Sales, AdBudget and GDP. Sales contains the quarterly sales for a small company over the period 1981-2005. AdBudget is the advertising budget and GDP is the gross domestic product. All series have been adjusted for inflation.



In [None]:
tute = pd.read_csv('data/tute1.csv')
tute.head()

In [None]:
tute.isna().sum(axis=0)

<div style="background-color: rgb(144, 238, 144);">
<span style="color: rgb(0, 128, 0);">

**THUMBS UP:** 

At least missing values have been checked :)
    
</span>
</div>

In [None]:
fig = px.line(tute, x="Quarter", y=['Sales','AdBudget', 'GDP'], title='Sales, AdBudget, GDP changes')
fig.show()

<div style="background-color: rgb(175, 219, 245);">
<span style="color:blue">

**NOTE:** 

What if you plot normalized/scaled values?

</span>
</div>

# Task: 5

Download tourism.xlsx from the book website and read it into R using readxl::read_excel().

1. Create a tsibble which is identical to the tourism tsibble from the tsibble package.
2. Find what combination of Region and Purpose had the maximum number of overnight trips on average.
3. Create a new tsibble which combines the Purposes and Regions, and just has total trips by State.

In [None]:
tourism = pd.read_csv('data/tourism.csv')

In [None]:
tourism

In [None]:
# The task asks for the Region/Purpose pair with the highest number of trips
# *on average*, so aggregate only the 'Trips' column and use the mean.
# A positional argument to GroupBy.sum() is `numeric_only`, not a column name,
# so the column is selected explicitly before aggregating.
tourism_grouped = (
    tourism
    .groupby(['Region', 'Purpose'], as_index=False)['Trips']
    .mean()
    .sort_values('Trips', ascending=False)
)

In [None]:
tourism_grouped.head()
# Sydney; Visiting

In [None]:
# Total trips per State. The column is selected explicitly: a positional
# argument to GroupBy.sum() is `numeric_only`, not a column name.
tourism_grouped_by_state = (
    tourism
    .groupby('State', as_index=False)['Trips']
    .sum()
    .sort_values('Trips', ascending=False)
)
tourism_grouped_by_state

# Task 6

Create time plots of the following four time series: Bricks from aus_production, Lynx from pelt, Close from gafa_stock, Demand from vic_elec.

1. Use ? (or help()) to find out about the data in each series.
2. For the last plot, modify the axis labels and title.

In [None]:
aus_production = load_data('aus_production')

# for vic_elec, gafa_stock, pelt check TASK:1, 2

In [None]:
px.line(aus_production, x='Quarter', y='Bricks', title='Quarter bricks production', markers=True)

# Task: 7

The aus_arrivals data set comprises quarterly international arrivals to Australia from Japan, New Zealand, UK and the US.

1. Use autoplot(), gg_season() and gg_subseries() to compare the differences between the arrivals from these four countries.
2. Can you identify any unusual observations?

In [None]:
aus_arrivals = load_data('aus_arrivals')

In [None]:
aus_arrivals.head()

In [None]:
px.line(aus_arrivals, x='Quarter', y='Arrivals', color='Origin', markers=True, title='Number of arrivals to different countries')

# Can you identify any unusual observations?
# I dont see any real outliers in these plots (may be in Japan and NZ there are 2 points with peaks) 

# Task: 8 

Monthly Australian retail data is provided in aus_retail. Select one of the time series as follows (but choose your own seed value).

1. Can you spot any seasonality, cyclicity and trend? What do you learn about the series?



In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
aus_retail = load_data('aus_retail')

In [None]:
aus_retail

In [None]:
df = aus_retail[aus_retail['Series ID'] == 'A3349849A']
# select one series

In [None]:
px.line(df, x='Month', y='Turnover', markers=False, title='Turnover for A3349849A client')

In [None]:
# Move 'Month' into the index and parse it ('%Y %b') into timestamps.
# set_index returns a new frame, so the filtered slice is not mutated in place.
df = df.set_index('Month')
df.index = pd.to_datetime(df.index, format='%Y %b')
df

In [None]:
decomposition = seasonal_decompose(df['Turnover'], model='additive', period=12)

In [None]:
plot_decomposition(decomposition)

In [None]:
# Decomposition detect only trend and seasonal components, so if we have cyclicity component, 
# it will be seen in trend and residual plots

# Task 9

Use the following graphics functions: **autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF()** and explore features from the following time series: 
1. “Total Private” Employed from us_employment,
2. Bricks from aus_production
3. Hare from pelt
4. “H02” Cost from PBS, and us_gasoline.

Questions are:
1. Can you spot any seasonality, cyclicity and trend?
2. What do you learn about the series?
3. What can you say about the seasonal patterns?
4. Can you identify any unusual years?

In [None]:
us_employment = load_data('us_employment')
aus_production = load_data('aus_production')
pelt = load_data('pelt')
PBS = load_data('PBS') 

## us_employment

In [None]:
# 'Total Private' employment as a Series indexed by parsed month timestamps.
# Built as one non-mutating chain so the filtered slice is never modified in place.
us_employment_observed = (
    us_employment.loc[us_employment['Title'] == 'Total Private']
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .set_index('Month')['Employed']
)

In [None]:
px.line(us_employment_observed)

In [None]:
decomposition = seasonal_decompose(us_employment_observed, period=12, model='additive')

In [None]:
plot_decomposition(decomposition)

In [None]:
correlation_analysis(us_employment_observed, decomposition)

In [None]:
partial_autocorrelation(us_employment_observed, decomposition, 36)

1. we decompose series with trend, seasonal and residuals components. 
    * seasonal component looks strange, but i think it
    * residuals look very bad, not stationary series (without mean=0 and std=constant)
2. What do you learn about the series
    * many questions arose while observing this series. We should create a better decomposition and detect the cyclical component (for now I don't know how to do it :) )
3. What can you say about the seasonal patterns?
    * it is clear that from January to December there is an uptrend, but the series falls from December to January
4. Can you identify any unusual years?
    * I don't think that there are any unusual years, but there is a cyclical component that we don't catch

<div style="background-color: rgb(250, 230, 250);">
<span style="color: rgb(204, 51, 204)">

**DISCUSS:** 

</span>
</div>

## aus_production

In [None]:
aus_production.head()
# Bricks from aus_production

In [None]:
# Rewrite '<digits> Q<digit>' as '<digits>-Q<digit>'. regex=True is required so
# the pattern and the \1-\2 back-references are interpreted as a regular expression.
qs = aus_production['Quarter'].str.replace(r'(\d+) (Q\d)', r'\1-\2', regex=True)
qs

In [None]:
index = pd.PeriodIndex(qs, freq='Q').to_timestamp()
index

<div style="background-color: rgb(144, 238, 144);">
<span style="color: rgb(0, 128, 0);">

**THUMBS UP:** 

Yahoo! At least here we see how to convert `str` type of timemarks in something more appropriate! :)
    
</span>
</div>

In [None]:
# Bricks as a standalone Series, relabelled with the quarterly timestamps in `index`.
aus_production_observed = aus_production['Bricks'].set_axis(index)

In [None]:
aus_production_observed

In [None]:
# Percentage of missing values among ALL observations.
# (count() excludes NaN, so dividing by it would give a NaN-to-valid ratio instead.)
aus_production_observed.isna().mean() * 100

In [None]:
# Show the observations whose value is missing.
aus_production_observed[aus_production_observed.isna()]

In [None]:
aus_production_observed.dropna(inplace=True)

<div style="background-color: rgb(255, 218, 233);">
<span style="color: rgb(206, 32, 41)">

**ERROR:** 

Why?! Why do you drop these rows?

</span>
</div>

In [None]:
px.line(aus_production_observed)

In [None]:
decomposition = seasonal_decompose(aus_production_observed, period=4, model='additive')

In [None]:
plot_decomposition(decomposition)

In [None]:
correlation_analysis(aus_production_observed, decomposition)

In [None]:
partial_autocorrelation(aus_production_observed, decomposition)

<div style="background-color: rgb(175, 219, 245);">
<span style="color:blue">

**NOTE:** 

To finalize the analysis of these data I recommend analyzing the distribution of residuals.

</span>
</div>

<div style="background-color: rgb(250, 230, 250);">
<span style="color: rgb(204, 51, 204)">

**DISCUSS:** 
    
And to make this an analysis rather than just "painting", I recommend that you train in the Art of Conclusions and Ideas.

</span>
</div>

# Task 10

The following time plots and ACF plots correspond to four different time series. Your task is to match each time plot in the first row with one of the ACF plots in the second row.

![image.png](attachment:image.png)

Answer: 
* 1 - b 
* 2 - a
* 3 - d
* 4 - c

# Task 11

The aus_livestock data contains the monthly total number of pigs slaughtered in Victoria, Australia, from Jul 1972 to Dec 2018. Use filter() to extract pig slaughters in Victoria between 1990 and 1995. Use autoplot() and ACF() for this data. How do they differ from white noise? If a longer period of data is used, what difference does it make to the ACF?



In [None]:
aus_livestock = load_data('aus_livestock')

In [None]:
aus_livestock.head()

In [None]:
aus_livestock.Animal.unique()

In [None]:
aus_livestock.State.unique()

In [None]:
aus_livestock[(aus_livestock['Animal'] == 'Pigs')]

In [None]:
# Pig slaughter counts in Victoria, indexed by month ('%Y %b' strings parsed to timestamps).
# .copy() gives an independent frame so assigning the index does not touch a slice of aus_livestock.
pigs_victoria = aus_livestock[
    (aus_livestock['Animal'] == 'Pigs') & (aus_livestock['State'] == 'Victoria')
].copy()
pigs_victoria.index = pd.to_datetime(pigs_victoria['Month'], format='%Y %b')

# The lower bound is inclusive so that January 1990 (timestamp 1990-01-01) is kept.
# NOTE(review): the upper bound still excludes all of 1995 - confirm whether the
# task's "between 1990 and 1995" is meant to include that year.
in_window = (pigs_victoria.index >= '1990-01-01') & (pigs_victoria.index < '1995-01-01')
aus_livestock_observed = pigs_victoria.loc[in_window, 'Count']

In [None]:
aus_livestock_observed.head(3), aus_livestock_observed.tail(3)

<div style="background-color: rgb(144, 238, 144);">
<span style="color: rgb(0, 128, 0);">

**THUMBS UP:** 

Transformation of date-time data is excellent!
    
</span>
</div>

<div style="background-color: rgb(255, 255, 224);">
<span style="color: rgb(186, 22, 12)">

**ALERT:** 

But once again - are you sure that they are monotonic and equidistant?

</span>
</div>

In [None]:
px.line(aus_livestock_observed)

In [None]:
decomposition = seasonal_decompose(aus_livestock_observed, period=12)

In [None]:
plot_decomposition(decomposition)

In [None]:
correlation_analysis(aus_livestock_observed, decomposition)

In [None]:
partial_autocorrelation(aus_livestock_observed, decomposition, lags=20)

All residuals lie within the confidence interval, which is good, but let's examine this part in more detail.

<div style="background-color: rgb(255, 255, 224);">
<span style="color: rgb(186, 22, 12)">

**ALERT:** 

The QUESTION: How should we study correlation characteristic of non-stationary processes?

</span>
</div>

In [None]:
resid = decomposition.resid
color = 'maroon'

plt.subplots(1, 2, figsize=(24, 8))

# Left panel: residuals over time.
plt.subplot(1, 2, 1)
plt.plot(resid, '-', color=color)
plt.grid(linestyle=':', color='k')
plt.title("Residuals")

# Fit Laplace and normal distributions to the NaN-free residuals and
# evaluate both densities on a common grid spanning the residual range.
x_fit = np.linspace(resid.min(), resid.max(), 201)
loc_laplace, scale_laplace = st.laplace.fit(resid.dropna())
loc_norm, scale_norm = st.norm.fit(resid.dropna())
y_fit_laplace = st.laplace.pdf(x_fit, loc_laplace, scale_laplace)
y_fit_norm = st.norm.pdf(x_fit, loc_norm, scale_norm)

# For the Laplace distribution `scale` is not the standard deviation:
# std = sqrt(2) * scale. Report both so the legend is comparable with the normal fit.
std_laplace = np.sqrt(2) * scale_laplace

# Right panel: residual histogram (drawn sideways) with the two fitted densities.
plt.subplot(1, 2, 2)
sns.distplot(resid, color=color, bins=20, vertical=True, label="distribution of residuals")
plt.plot(y_fit_laplace, x_fit, '-b',
         label=(f"approximation by Laplace distribution:\n"
                f"  fitted mean = {loc_laplace:.4g}, fitted scale = {scale_laplace:.4g}"
                f" (std = {std_laplace:.4g})"))
plt.plot(y_fit_norm, x_fit, '-g',
         label=f"approximation by normal distribution:\n  fitted mean = {loc_norm:.4g}, fitted std = {scale_norm:.4g}")
plt.legend()
plt.title("Distribution of residuals")
plt.grid(linestyle=':', color='k')

plt.show()

In [None]:
# Sorted, NaN-free residuals serve as the evaluation grid for all three CDFs.
resid_arr = np.sort(resid.dropna().values)
ecdf_resid_instance = ECDF(resid.dropna())
ecdf_resid = ecdf_resid_instance(resid_arr)

# Fitted CDFs evaluated on the same grid.
cdf_norm = st.norm.cdf(resid_arr, loc=loc_norm, scale=scale_norm)
cdf_laplace = st.laplace.cdf(resid_arr, loc=loc_laplace, scale=scale_laplace)

# Mean absolute gap between the empirical CDF and each fitted CDF.
mae_norm = mean_absolute_error(ecdf_resid, cdf_norm)
mae_laplace = mean_absolute_error(ecdf_resid, cdf_laplace)

ecdf_fig, ecdf_ax = plt.subplots(1, 1, figsize=(20, 8))
ecdf_ax.plot(resid_arr, ecdf_resid, '-', color='maroon')
for fitted_cdf, line_fmt, legend_text in (
    (cdf_norm, '-g', f"Normal approx : MAE = {mae_norm:.3g}"),
    (cdf_laplace, '-b', f"Laplace approx: MAE = {mae_laplace:.3g}"),
):
    ecdf_ax.plot(resid_arr, fitted_cdf, line_fmt, label=legend_text)
ecdf_ax.legend()
ecdf_ax.set_title("CDF of decomposition residuals")
plt.show()

# Ratio > 1 means the Laplace fit is closer to the empirical CDF than the normal fit.
print(mae_norm/mae_laplace)

<div style="background-color: rgb(144, 238, 144);">
<span style="color: rgb(0, 128, 0);">

**THUMBS UP:** 

Thanks for analysis of residuals!
    
</span>
</div>

<div style="background-color: rgb(175, 219, 245);">
<span style="color:blue">

**NOTE:** 

Conclusions are expected...

</span>
</div>

<div style="background-color: rgb(250, 230, 250);">
<span style="color: rgb(204, 51, 204)">

**DISCUSS:** All below...

</span>
</div>

In [None]:
sns.distplot(resid, bins=20)

In [None]:
sns.distplot(resid, bins=200)

So, I think the residuals look like a normal distribution, but we should mention that the standard deviation is really large