In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the Our World in Data COVID-19 CSV directly from its public URL.
# This needs network access, and the result depends on the file's contents at download time.
owid_csv_url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
df = pd.read_csv(owid_csv_url)

In [None]:
# Names of the dataset columns that the next cell casts to float.
# The order is kept exactly as in the original list.
columns = [
    # absolute case and death counts
    'total_cases', 'new_cases', 'new_cases_smoothed',
    'total_deaths', 'new_deaths', 'new_deaths_smoothed',
    # case and death counts per million
    'total_cases_per_million', 'new_cases_per_million',
    'new_cases_smoothed_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
    # testing
    'new_tests', 'total_tests',
    'total_tests_per_thousand', 'new_tests_per_thousand',
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
    'tests_per_case', 'positive_rate',
    # policy
    'stringency_index',
    # country-level indicators
    'population', 'population_density', 'median_age',
    'aged_65_older', 'aged_70_older',
    'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'handwashing_facilities', 'hospital_beds_per_thousand',
    'life_expectancy', 'human_development_index',
]

**Converting columns to float type**

In [None]:
# Cast every listed column to float with one astype() call instead of a per-column loop.
# Columns that are not listed keep their current dtype.
df = df.astype(dict.fromkeys(columns, float))

**Eliminating rows with negative values, which don't make sense, along with the 'World' rows**

In [None]:
# Columns in which a negative value marks the row as invalid.
non_negative_columns = [
    'new_cases', 'new_deaths',
    'new_cases_smoothed', 'new_deaths_smoothed',
    'new_cases_per_million', 'new_deaths_per_million',
    'new_deaths_smoothed_per_million',
    'new_tests', 'new_tests_per_thousand',
]

# NaN compares as False against 0, so rows with missing values are not flagged here.
has_negative_value = df[non_negative_columns].lt(0).any(axis=1)
# Rows whose location is 'World' are dropped as well.
is_world_row = df['location'].isin(['World'])

problem_idx = df[has_negative_value | is_world_row].index
df = df[~df.index.isin(problem_idx)]

**Parsing dates, deriving `population_coverage` and `death_rate`, and replacing NaN values with 0**

In [None]:
# Parse the date strings into datetime values.
df['date'] = pd.to_datetime(df['date'])

# Testing coverage: total_tests divided by population.
df['population_coverage'] = (df['total_tests'] / df['population']).astype(float)

# Ratio of smoothed deaths to smoothed cases (both per million).
# A positive value divided by zero yields inf, which is treated as missing.
df['death_rate'] = (
    (df['new_deaths_smoothed_per_million'] / df['new_cases_smoothed_per_million'])
    .replace(np.inf, np.nan)
    .astype(float)
)

# Fill missing values with the *number* 0, and only in numeric columns.
# Filling with the string '0' turns float columns into mixed object columns, and it
# also overwrites missing text values (e.g. continent), so notna() checks stop working.
numeric_columns = df.select_dtypes(include='number').columns
df = df.fillna(dict.fromkeys(numeric_columns, 0))
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_per_case,positive_rate,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population_coverage,death_rate
0,AFG,Asia,Afghanistan,2019-12-31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38928300.0,54.422,18.6,2.581,1.337,1803.99,0,597.029,9.59,0,0,37.746,0.5,64.83,0.498,0,0
1,AFG,Asia,Afghanistan,2020-01-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38928300.0,54.422,18.6,2.581,1.337,1803.99,0,597.029,9.59,0,0,37.746,0.5,64.83,0.498,0,0
2,AFG,Asia,Afghanistan,2020-01-02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38928300.0,54.422,18.6,2.581,1.337,1803.99,0,597.029,9.59,0,0,37.746,0.5,64.83,0.498,0,0
3,AFG,Asia,Afghanistan,2020-01-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38928300.0,54.422,18.6,2.581,1.337,1803.99,0,597.029,9.59,0,0,37.746,0.5,64.83,0.498,0,0
4,AFG,Asia,Afghanistan,2020-01-04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38928300.0,54.422,18.6,2.581,1.337,1803.99,0,597.029,9.59,0,0,37.746,0.5,64.83,0.498,0,0


In [None]:
# Ensure both derived metrics are stored as floats, then list the dtype of every column.
df = df.astype({'death_rate': float, 'population_coverage': float})
df.dtypes

iso_code                                   object
continent                                  object
location                                   object
date                               datetime64[ns]
total_cases                               float64
new_cases                                 float64
new_cases_smoothed                        float64
total_deaths                              float64
new_deaths                                float64
new_deaths_smoothed                       float64
total_cases_per_million                   float64
new_cases_per_million                     float64
new_cases_smoothed_per_million            float64
total_deaths_per_million                  float64
new_deaths_per_million                    float64
new_deaths_smoothed_per_million           float64
new_tests                                 float64
total_tests                               float64
total_tests_per_thousand                  float64
new_tests_per_thousand                    float64


In [None]:
# Collapse the daily rows into one row per (location, continent) pair.
# Aggregations are given as string aliases rather than NumPy callables: passing
# np.mean / np.sum / np.max to agg() is deprecated in recent pandas versions.
country_aggregations = {
    'death_rate': 'mean',
    # total_deaths is a running total, so the per-country figure is its maximum;
    # summing it would add the cumulative count of every day on top of each other.
    'total_deaths': 'max',
    'gdp_per_capita': 'mean',
    # Daily new cases per million add up to the total cases per million.
    'new_cases_per_million': 'sum',
    'human_development_index': 'mean',
    'population_coverage': 'max',
    'aged_65_older': 'mean',
    'extreme_poverty': 'mean',
    'cardiovasc_death_rate': 'mean',
    'diabetes_prevalence': 'mean',
    'handwashing_facilities': 'mean',
    'hospital_beds_per_thousand': 'mean',
}

df_all_countries = (
    df.groupby(['location', 'continent'])
    .agg(country_aggregations)
    .reset_index()
)

**Plot of total cases per million against GDP per capita, comparing death rates among countries**


*   Cases in African countries don't seem to depend on GDP, and these countries have relatively low death rates
*   European countries have a larger number of cases, and countries with a higher GDP seem to have relatively low death rates

*Hover over the plot for more information*




In [None]:
import plotly.express as px

# Bubble chart with one bubble per country: cases per million (log scale) on x,
# GDP per capita on y, bubble size from death_rate, colour from continent.
gdp_scatter_options = dict(
    x="new_cases_per_million",
    y="gdp_per_capita",
    size="death_rate",
    size_max=40,
    color="continent",
    hover_name="location",
    log_x=True,
    labels={"new_cases_per_million": "Cases per million"},
)
gdp_plot = px.scatter(df_all_countries, **gdp_scatter_options)

gdp_plot.show()

**Plot of total cases per million against the Human Development Index (HDI), comparing death rates among countries**


*   Countries with a higher HDI seem to have a larger number of cases, and countries with a lower HDI seem to have relatively low death rates

*Hover over the plot for more information*






In [None]:
# Same bubble layout as the GDP chart, with the human development index on the y axis.
hdi_scatter_options = dict(
    x="new_cases_per_million",
    y="human_development_index",
    size="death_rate",
    size_max=40,
    color="continent",
    hover_name="location",
    log_x=True,
    labels={"new_cases_per_million": "Cases per million"},
)
human_dev = px.scatter(df_all_countries, **hdi_scatter_options)
human_dev.show()

In [None]:
# Hospital beds per thousand (x) against handwashing facilities (y);
# bubble size comes from death_rate and colour from continent. Both axes are linear.
hospital_scatter_options = dict(
    x="hospital_beds_per_thousand",
    y="handwashing_facilities",
    size="death_rate",
    size_max=40,
    color="continent",
    hover_name="location",
)
hospital = px.scatter(df_all_countries, **hospital_scatter_options)
hospital.show()

**Countries we are interested in:**

In [None]:
# Countries singled out for the comparison below.
# Each country is listed exactly once: a duplicate entry does not change an isin()
# filter, but it makes the length of the list misleading.
list_country = [
    'United States', 'Russia', 'Brazil', 'India', 'Italy',
    'Spain', 'China', 'Sweden', 'United Kingdom', 'Nepal', 'Germany',
]

In [None]:
# Columns kept for the country comparison, selected by name.
# A positional slice (.iloc[:, :8]) would silently pick different columns
# if the column order of the downloaded CSV ever changed.
comparison_columns = [
    'iso_code', 'continent', 'location', 'date',
    'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths',
]

# Keep rows that have a continent and belong to one of the selected countries.
is_selected_country = df['continent'].notna() & df['location'].isin(list_country)

# .copy() makes df2 an independent frame, so the assignment below never writes into df.
df2 = df.loc[is_selected_country, comparison_columns].copy()
df2['total_cases'] = df2['total_cases'].astype(float)


**Animated bar plot of the 11 selected countries from 2019-12-31 to 2020-10-15**

*Press the play button to run the animation*

In [None]:
# One animation frame per date, labelled with a YYYY-MM-DD string.
frame_labels = pd.DatetimeIndex(df2['date']).strftime('%Y-%m-%d')

# Bars show total_cases per location on a logarithmic y axis.
fig = px.bar(
    df2,
    x='location',
    y='total_cases',
    color="location",
    animation_frame=frame_labels,
    animation_group="location",
    log_y=True,
    height=600,
    width=1000,
    title='Rising Covid Cases with time',
)
fig.show()

**Time series analysis and forecasting for Nepal**

In [None]:
# Rows for Nepal only.
country_df = df.loc[df['location'] == 'Nepal']

# One row per date. Aggregations use string aliases because passing NumPy callables
# (np.sum, np.mean, np.max) to agg() is deprecated in recent pandas versions.
daily_aggregations = {
    'new_deaths': 'sum',
    'new_cases': 'sum',
    'total_cases_per_million': 'mean',
    'population_coverage': 'max',
}
df_grouped = country_df.groupby(['date']).agg(daily_aggregations).reset_index()

**Daily new_cases vs time plot**

The shaded area represents the number of daily new cases, whereas the red line shows the trend of total cases **per million** up to that date.

In [None]:
import plotly.express as px

dates = df_grouped['date']
cases_per_million = df_grouped['total_cases_per_million']

# Filled area: daily new cases over time.
fig = px.area(df_grouped, x='date', y='new_cases', title='Time Series with Rangeslider')
# Overlay total cases per million as a separate trace on the same axes.
fig.add_scatter(x=dates, y=cases_per_million)
# Add a range slider below the x axis for zooming into a date window.
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [None]:
# Earliest and latest date present in the Nepal subset, shown as a (min, max) tuple.
nepal_dates = country_df['date']
nepal_dates.min(), nepal_dates.max()

(Timestamp('2019-12-31 00:00:00'), Timestamp('2020-10-15 00:00:00'))

In [None]:
# Display the last five rows of the Nepal subset.
country_df.tail(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_per_case,positive_rate,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population_coverage,death_rate
32189,NPL,Asia,Nepal,2020-10-11,105684.0,5008.0,3016.286,614.0,14.0,12.286,3627.165,171.879,103.521,21.073,0.48,0.422,12427.0,1176984.0,40.395,0.427,14648.0,0.503,4.856,0.206,tests performed,0.0,29136808.0,204.43,25.0,5.809,3.212,2442.804,15.0,260.797,7.26,9.5,37.8,47.782,0.3,70.78,0.574,0.040395,0.004076
32190,NPL,Asia,Nepal,2020-10-12,107755.0,2071.0,2990.286,636.0,22.0,14.429,3698.243,71.078,102.629,21.828,0.755,0.495,14530.0,1191514.0,40.894,0.499,14755.0,0.506,4.934,0.203,tests performed,0.0,29136808.0,204.43,25.0,5.809,3.212,2442.804,15.0,260.797,7.26,9.5,37.8,47.782,0.3,70.78,0.574,0.040894,0.004823
32191,NPL,Asia,Nepal,2020-10-13,111802.0,4047.0,3219.857,645.0,9.0,13.0,3837.14,138.896,110.508,22.137,0.309,0.446,15577.0,1207091.0,41.428,0.535,15402.0,0.529,4.783,0.209,tests performed,0.0,29136808.0,204.43,25.0,5.809,3.212,2442.804,15.0,260.797,7.26,9.5,37.8,47.782,0.3,70.78,0.574,0.041428,0.004036
32192,NPL,Asia,Nepal,2020-10-14,115358.0,3556.0,3506.286,663.0,18.0,14.286,3959.185,122.045,120.339,22.755,0.618,0.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,29136808.0,204.43,25.0,5.809,3.212,2442.804,15.0,260.797,7.26,9.5,37.8,47.782,0.3,70.78,0.574,0.0,0.004072
32193,NPL,Asia,Nepal,2020-10-15,117996.0,2638.0,3391.857,675.0,12.0,13.857,4049.723,90.538,116.411,23.167,0.412,0.476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,29136808.0,204.43,25.0,5.809,3.212,2442.804,15.0,260.797,7.26,9.5,37.8,47.782,0.3,70.78,0.574,0.0,0.004089


**Time series prediction using Prophet**

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

In [None]:
# Build the frame passed to Prophet's fit(): the date column renamed to 'ds' and the
# series to forecast renamed to 'y'.
# rename() and assign() each return a new DataFrame, so nothing is written into a
# slice of country_df and pandas raises no SettingWithCopyWarning.
prediction_df = (
    country_df[['date', 'new_cases']]
    .rename(columns={'date': 'ds', 'new_cases': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)
prediction_df.tail()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ds,y
32189,2020-10-11,5008.0
32190,2020-10-12,2071.0
32191,2020-10-13,4047.0
32192,2020-10-14,3556.0
32193,2020-10-15,2638.0


**Fitting the model on our data**

We also generate the dates to predict on: the observed period from 2019-12-31 to 2020-10-15, plus the following 15 days.

In [None]:
from fbprophet import Prophet

# Number of periods to forecast beyond the last observed date.
forecast_horizon = 15

# Fit a Prophet model with default settings on the Nepal series.
prophet_basic = Prophet()
prophet_basic.fit(prediction_df)

# Extend the date frame by the forecast horizon and show the last rows.
future = prophet_basic.make_future_dataframe(periods=forecast_horizon)
future.tail(forecast_horizon)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Unnamed: 0,ds
290,2020-10-16
291,2020-10-17
292,2020-10-18
293,2020-10-19
294,2020-10-20
295,2020-10-21
296,2020-10-22
297,2020-10-23
298,2020-10-24
299,2020-10-25


In [None]:
# Predict for every date in `future`, then show the point forecast together with
# its lower and upper bounds for the last 15 rows.
forecast = prophet_basic.predict(future)
forecast_summary_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast[forecast_summary_columns].tail(15)


Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
290,2020-10-16,2542.163279,2139.434956,2946.42302
291,2020-10-17,2560.008874,2169.023769,2950.082048
292,2020-10-18,2551.397892,2154.673724,2952.854592
293,2020-10-19,2575.883039,2142.886159,2958.861094
294,2020-10-20,2613.207722,2194.341506,2985.068004
295,2020-10-21,2631.815113,2221.524487,3040.391947
296,2020-10-22,2728.708045,2325.37942,3130.625792
297,2020-10-23,2762.5019,2336.058649,3172.640146
298,2020-10-24,2780.347495,2381.205627,3174.82943
299,2020-10-25,2771.736513,2390.160328,3170.615003


The actual daily cases are shown by the scatter points, whereas the values predicted by the model are shown by the blue line, with a confidence interval around it. We can see that from 3 October 2020 the predicted values start to diverge from the actual values. This might be because the government of Nepal ended the lockdown. This kind of external factor (or noise) affected our model, so it could not predict accurately or capture the trend.

In [None]:
from fbprophet.plot import plot_plotly, plot_components_plotly

# Interactive plot of the fitted model and its forecast; the figure is the cell's output.
forecast_figure = plot_plotly(prophet_basic, forecast)
forecast_figure

In [None]:
# Interactive plot of the forecast's individual components; the figure is the cell's output.
components_figure = plot_components_plotly(prophet_basic, forecast)
components_figure