In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline
mpl.rcParams['figure.figsize'] = (16,9)
pd.set_option('display.max_rows', 500)

import plotly.graph_objects as go

## Data Load

In [None]:
# try to parse the dates at the beginning
# it works out of the box if the date was stored ISO format YYYY-MM-DD

df_analyse=pd.read_csv('C:/Users/Nitin/ds-covid19/data/processed/COVID_small_sync_timeline_table.csv',sep=';',
                      parse_dates=[0])

df_analyse.sort_values('date',ascending=True).head()

In [None]:
country_list=df_analyse.columns[1:]

## Helper Functions

In [None]:
def quick_plot(x_in,df_input,y_scale='log',slider=False):
     """ Quick basic plot for quick static evaluation of a time series
        
        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]
        
        Parameters:
        ----------
        x_in : array
             array of date time object, or array of numbers
        df_input : pandas dataframe
             the plotting matrix where each column is plotted
             the name of the column will be used for the legend
        scale: str
             y-axis scale as 'log' or 'linear'
        slider: bool
             True or False for x-axis slider
             
             
        Returns:
        -------
        
    """
    fig = go.Figure()
    
    for each in df_input.columns:
        fig.add_trace(go.Scatter(
                         x=x_in,
                         y=df_input[each],
                         name=each,
                         opacity=0.8))
        
    fig.update_layout(autosize=True,
            width=1024,
            height=768,
            font=dict(
                family="PT sans, monospace",
                size=18,
                color="#7f7f7f"
                         )
             )
        
    fig.update_yaxes(type=y_scale)#, range=[0.1,2]
    fig.update_xaxes(tickangle=-45,
                        nticks=20,
                        tickfont=dict(size=14,color="#7f7f7f")
                        )
    if slider==True:
            fig.update_layout(xaxis_rangeslider_visible=True)
            fig.show()
        

In [None]:
quick_plot(df_analyse.date,
           df_analyse.iloc[:,3:-1],
           y_scale='linear',
           slider=True)

## Fitting a polynomial norm curve

Refer section: Hyperparameters and Model Validation from Jake VanderPlas

This function is from the Python Data Science Handbook by Jake VanderPlas

https://scikit-learn.org/stable/auto_examples/linear_model/plot_polynomial_interpolation.html#sphx-glr-download-auto-examples-linear-model-plot-polynomial-interpolation-py

In [6]:
# check that all the data is there
df_poly_check=df_analyse.iloc[0:27,3:-1].reset_index()
df_poly_check.head()

Unnamed: 0,index,Spain,Germany
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
5,5,0,1
6,6,0,4
7,7,0,4
8,8,0,4
9,9,0,5


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                        LinearRegression(**kwargs))

In [None]:
y=df_poly_check[['Germany','Italy','US','Spain']].unstack().sort_index(axis=0,level=1)

In [None]:
y.head()

In [None]:
test_points=28
y_train=y[0:-test_points-1]
y_test=y[-test_points:]

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn;
seaborn.set(rc={'figure.figsize':(16,8)})  #plot formatting,prepare for subplot

In [None]:
fig, axl = plt.subplots(1,1)

axl.scatter(np.arange(len(y))/4,y, color='blue')
axl.axvspan((len(y)-test_points-1)/4, len(y)/4, facecolor='b', alpha=0.5)

for degree in [1,3,7,15]:
    y_hat_insample=PolynomialRegression(degree).fit(X_train, y_train).predict(X_train)
    y_hat_test = PolynomialRegression(degree).fit(X_train, y_train).predict(X_test)
    
    X_plot=np.concatenate((X_train, X_test), axis=None)
    y_plot=np.concatenate((y_hat_insample, y_hat_test), axis=None)
    
    axl.plot(X_plot, y_plot, label='degree={0}'.format(degree)+
        '      MAPE train:    '+str(mean_absoluate_percentage_error(y_hat_insample,y_train))[0:3]
        +'     MAPE test      '+str(mean_absolute_percentage_error(y_hat_test, y_test))[0:3])


axl.set_ylim(100, 150000)
axl.set_yscale('log ')
axl.legend(loc='best',
          prop={'size': 16});

# Regression Metrics (source Wikipedia)

## Mean absolute error

In statistics, mean absolute error MAE is a measure of Errors between paired obseravtions expressing the same phenomenon.

## Mean absolute percentage error (MAPE)

Mean absolute percentage error (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of forecasting method in statistics.

For example, in trend estimation, also used as a loss function for regression problems in machine learning.

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

## Symmetric mean absolute percentage error

Symmetric mean absolute percentage error (SMAPE or sMAPE) is an accuracy measure based on percentage (or relative) errors.