In [1]:
# Libraries
import pandas as pd
import numpy as np
from time import strptime
import datetime
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import ipywidgets as widgets

import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
import itertools

sns.set(palette = 'Set1',style='darkgrid')

In [2]:
# Build `time`: one row per (country, day) with the summed numeric case counts.
#
# The observation dates are parsed to datetimes *before* grouping and sorting.
# Sorting the raw 'MM/DD/YYYY' strings orders rows by month first, so data
# spanning more than one calendar year would come out of chronological order.
# `numeric_only=True` restricts the aggregation to numeric columns; summing
# free-text columns is meaningless for this analysis.
time = (
    pd.read_csv('../data/covid_19_data.csv')
    .assign(ObservationDate=lambda df: pd.to_datetime(df['ObservationDate'], format='%m/%d/%Y'))
    .groupby(['Country/Region', 'ObservationDate'])
    .sum(numeric_only=True)
    .reset_index()
    .drop('SNo', axis=1)  # a summed serial number carries no information
    .rename(columns={'ObservationDate': 'Date'})
    .sort_values(by=['Date'])
)

In [3]:
# Rolling-mean time series of one case type, worldwide or for a single country
def roll(country = 'all', case='Confirmed', window=4):
    """Return the rolling mean of the `case` counts, indexed by date.

    Parameters
    ----------
    country : str
        'all' sums every country per date; any other value must appear in
        the 'Country/Region' column of the module-level `time` frame.
    case : str
        Column to smooth. For country == 'all' only 'Confirmed', 'Deaths'
        and 'Recovered' are available.
    window : int
        Number of consecutive observations averaged (default 4).

    Returns
    -------
    pandas.DataFrame
        One column named after `case`, indexed by 'Date' in ascending order.
        Rows whose rolling mean is NaN are dropped.

    Raises
    ------
    ValueError
        If `country` is neither 'all' nor present in the data.
    """
    if country == 'all':
        ts = time.groupby('Date').agg({'Confirmed':'sum', 'Deaths':'sum', 'Recovered':'sum'}).reset_index()
    elif country in time['Country/Region'].unique():
        ts = time.loc[time['Country/Region'] == country]
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Unknown country {country!r}: expected 'all' or a value of 'Country/Region'")

    # Sort by date here so the rolling window never depends on the row order
    # left behind by whichever cell built `time`.
    ts = ts[['Date', case]].set_index('Date').sort_index()
    return ts.rolling(window=window, center=False).mean().dropna()


def rollStatsPlot(country = 'all', case='Confirmed', window=7):
    """Plot the raw `case` series together with its rolling mean and rolling std.

    Parameters
    ----------
    country : str
        'all' sums every country per date; any other value must appear in
        the 'Country/Region' column of the module-level `time` frame.
    case : str
        Column to plot. For country == 'all' only 'Confirmed', 'Deaths'
        and 'Recovered' are available.
    window : int
        Number of observations in the rolling window (default 7).

    Raises
    ------
    ValueError
        If `country` is neither 'all' nor present in the data.
    """
    if country == 'all':
        ts = time.groupby('Date').agg({'Confirmed':'sum', 'Deaths':'sum', 'Recovered':'sum'}).reset_index()
    elif country in time['Country/Region'].unique():
        ts = time.loc[time['Country/Region'] == country]
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f"Unknown country {country!r}: expected 'all' or a value of 'Country/Region'")

    # Sort by date so the lines are drawn chronologically regardless of the
    # row order of `time`.
    ts = ts[['Date', case]].set_index('Date').sort_index()
    rolling = ts.rolling(window=window, center=False)

    plt.figure(figsize=(16,6))
    plt.plot(rolling.mean().dropna(), label='Rolling Mean')
    plt.plot(ts[case], label=case)  # labelled so the raw series shows up in the legend
    plt.plot(rolling.std(), label='Rolling std')
    plt.legend()
    plt.title(f'{case} Report Distribution With Rolling Mean and Stdev (Country: {country})')
    plt.xticks([])  # date tick labels are hidden
    plt.show()
    
def dickeyFull(time_series):
    """Run `adfuller` (autolag='AIC') on `time_series` and print a labelled summary.

    The printed Series holds the first four values returned by `adfuller`
    followed by one 'Critical Value (...)' entry per item of the fifth value.
    Nothing is returned.
    """
    print('Dickey-Fuller Test:')
    adf_output = adfuller(time_series, autolag='AIC')

    headline_labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    summary = dict(zip(headline_labels, adf_output[0:4]))
    # adf_output[4] maps a significance level to its critical value.
    summary.update({
        'Critical Value (%s)'%level: threshold
        for level, threshold in adf_output[4].items()
    })
    print(pd.Series(summary))
    
def autoCorr(time_series):
    """Draw the ACF plot and then the PACF plot of `time_series`, each with lags=12."""
    for draw, heading in ((plot_acf, "ACF"), (plot_pacf, "PACF")):
        draw(time_series, lags=12, title=heading)
    
def split(time_series, train_fraction=0.85):
    """Split a sequence into a leading training part and a trailing test part.

    Parameters
    ----------
    time_series : sliceable sequence (list, Series, DataFrame, ...)
        Data in chronological order; it is cut positionally, never shuffled.
    train_fraction : float
        Share of the observations assigned to the training part, in [0, 1].
        Defaults to 0.85. The cut index is truncated towards zero.

    Returns
    -------
    tuple
        (train, test), where `train` is the first
        int(len(time_series) * train_fraction) items and `test` is the rest.

    Raises
    ------
    ValueError
        If `train_fraction` lies outside [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(f'train_fraction must be between 0 and 1, got {train_fraction!r}')

    cut = int(len(time_series) * train_fraction)
    return time_series[:cut], time_series[cut:]

def arima(time_series,test):
    """Pick the ARIMA order with the lowest AIC on `time_series`, then forecast.

    Every (p, d, q) with each component in 0..5 is tried. Orders that cannot
    be fitted are skipped. The best fit is used to draw an in-sample /
    out-of-sample prediction plot and a "true vs predicted" plot, and its
    summary is printed.

    Parameters
    ----------
    time_series : training data passed to `ARIMA`.
    test : held-out data; only its length and `.values` are used.

    Returns
    -------
    The forecast for the next len(test) steps.

    Raises
    ------
    RuntimeError
        If no order in the grid could be fitted.
    """
    best_aic = float('inf')
    best_order = None
    best_fit = None

    #Determining the best parameters
    for order in itertools.product(range(0, 6), repeat=3):
        try:
            candidate = ARIMA(time_series, order=order).fit()
            candidate_aic = candidate.aic
        except Exception:
            # Some orders in the grid cannot be estimated for a given series;
            # skip them. `Exception` (not a bare except) keeps Ctrl-C working
            # during this long search.
            continue

        # `<=` keeps the original tie-breaking: a later order wins on equal AIC.
        # A NaN AIC compares False and is therefore never selected.
        if candidate_aic <= best_aic:
            best_aic, best_order, best_fit = candidate_aic, order, candidate

    if best_fit is None:
        raise RuntimeError('No ARIMA order in the search grid could be fitted to the series')

    #Modeling (the winning fit is reused instead of being estimated a second time)
    best_fit.plot_predict(start=int(len(time_series) * 0.7), end=int(len(time_series) * 1.2))
    # NOTE(review): the [0] assumes `forecast` returns a tuple whose first item
    # is the point forecast, as with the statsmodels.tsa.arima_model import
    # used by this notebook - confirm against the installed statsmodels version.
    pred = best_fit.forecast(steps=len(test))[0]

    #Plotting results
    plt.subplots()
    plt.plot(pred, c='green', label='predictions')
    plt.plot(test.values, c='red', label='real values')
    plt.legend()
    plt.title('True vs predicted values')

    #Printing the results
    print(best_fit.summary())
    print(f'\nParameters used: AutoRegression {best_order[0]}, Difference order {best_order[1]}, Moving Average {best_order[2]}')
    return pred

def analyze(country = 'all', case = 'Confirmed'):
    """Run the full analysis for one country / case type, in order:

    1. plot the series with its rolling mean and std (`rollStatsPlot`);
    2. plot a seasonal decomposition (period=7) of the series from `roll`;
    3. print the Dickey-Fuller summary and draw the ACF / PACF plots;
    4. split that series into train / test and run the `arima` search and
       forecast.
    """
    rollStatsPlot(country, case)

    smoothed = roll(country, case)
    decomposition = sm.tsa.seasonal_decompose(smoothed.values, period=7)
    decomposition.plot()
    plt.show()

    dickeyFull(smoothed[case].values)
    autoCorr(smoothed)
    plt.show()

    # split() returns (train, test), which is exactly arima's argument order.
    arima(*split(smoothed))

In [4]:
# Dropdown choices: every distinct 'Country/Region' value plus the 'all' entry, sorted.
countries_list = sorted([*time['Country/Region'].unique(), 'all'])

# Interactive entry point: two dropdowns choose the case type and the country,
# and every change re-runs the whole analysis for that selection.
@widgets.interact(
    case = widgets.Dropdown(
        options=['Confirmed', 'Deaths', 'Recovered'],
        value='Confirmed',
        description='Data:',
        disabled=False,
    ),
    country = widgets.Dropdown(
        options=countries_list,
        value='all',
        # Was 'Data:' as well, which left the two dropdowns indistinguishable.
        description='Country:',
        disabled=False,
    )
)
def chart(country, case):
    """Run `analyze` for the country and case type selected in the dropdowns."""
    analyze(country=country,case=case)

interactive(children=(Dropdown(description='Data:', index=215, options=(' Azerbaijan', "('St. Martin',)", 'Afg…

## Comments Paolo

Great visualizations Victor, I love the drop down menu where you can choose different options. Also you were not scared to experiment with advanced concepts in time series analysis. Also I liked the fact that you defined all your functions at the beginning. Some comments:
- It would help and improve your projects if you had at least an introduction and conclusions in the notebook
- It would help clarity if you added docstrings to the functions
- I understand the need to apply a rolling mean to smooth the data, but I suggest always visualizing the raw data as well, and commenting on your reasoning
- When I look at the results of the Dickey-Fuller test for the cases I have tested (for example 'all'), I notice that the p-value is always quite large, suggesting that the results are not significant, i.e. the null hypothesis of non-stationarity cannot be rejected.
- As another idea, you could try to fit the data to an exponential $y = A\exp(\beta x)$ with fitting parameters $A$ and $\beta$