In [1]:
import pandas as pd
import numpy as np

# COVID-19

In [2]:
# Total cases

In [3]:
# Load the total-cases table, convert the date column to datetimes, and keep
# only the date plus the countries used in the rest of the notebook.
world_total = (
    pd.read_csv('output_data/total-world-covid19.csv')
    .assign(Date_Confirmed=lambda raw: pd.DatetimeIndex(raw['Date_Confirmed']))
)[[
    'Date_Confirmed',
    'US',
    'United Kingdom',
    'Switzerland',
    'Korea, South',
    'Italy',
    'India',
    'Australia',
    'Mexico',
]]

In [4]:
# Daily cases

In [5]:
# Load the daily-cases table, convert the date column to datetimes, and keep
# only the date plus the countries used in the rest of the notebook.
world_daily = (
    pd.read_csv('output_data/daily-world_covid19.csv')
    .assign(Date_Confirmed=lambda raw: pd.DatetimeIndex(raw['Date_Confirmed']))
)[[
    'Date_Confirmed',
    'US',
    'United Kingdom',
    'Switzerland',
    'Korea, South',
    'Italy',
    'India',
    'Australia',
    'Mexico',
]]

In [6]:
# Preview the daily table (pandas shows only the first and last rows).
# Note that the displayed data contains a negative daily value, so daily
# counts cannot be assumed to be non-negative.
world_daily

Unnamed: 0,Date_Confirmed,US,United Kingdom,Switzerland,"Korea, South",Italy,India,Australia,Mexico
0,2020-01-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-24,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-26,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
116,2020-05-17,18937.0,321.0,15.0,15.0,675.0,5050.0,2.0,2075.0
117,2020-05-18,21551.0,247.0,10.0,13.0,451.0,4630.0,2.0,2414.0
118,2020-05-19,20260.0,221.0,21.0,32.0,813.0,6147.0,0.0,2713.0
119,2020-05-20,23285.0,-47.0,40.0,12.0,665.0,5553.0,1.0,2248.0


In [7]:
# Process: forward-fill gaps in the totals

In [8]:
# Forward-fill the totals: a missing value takes the value of the previous row.
# `.ffill()` is the direct replacement for `fillna(method='ffill')`, which
# newer pandas releases deprecate. It returns a new frame, so `world_total`
# itself is left untouched.
total = world_total.ffill()
# Disabled step, kept for reference: treat 0 as missing.
#total = total.replace(0, np.nan) # Now convert 0 to NaN

# Copy rather than alias: a later cell assigns to daily['Date_Confirmed'], and
# with a plain `daily = world_daily` that write would also modify world_daily.
daily = world_daily.copy()

In [9]:
# Make sure datetimes are correct

In [10]:
    # Total
# Re-parse the dates; anything that does not match the format becomes NaT
# (errors='coerce') and those rows are dropped right away.
# The date column is then MOVED into the index (set_index removes it from the
# columns) instead of being duplicated there. Only numeric columns remain, which
# matters for the rolling mean that follows: pandas >= 2.0 raises on a datetime
# column rather than silently skipping it, and a leftover 'Date_Confirmed'
# column would also clash with the later reset_index().
# `assign` builds a new frame, so the input frame is not modified in place.
total = (
    total.assign(
        Date_Confirmed=pd.to_datetime(total.Date_Confirmed, format='%Y-%m-%d', errors='coerce')
    )
    .dropna(subset=['Date_Confirmed'])
    .set_index('Date_Confirmed')
)

    # Daily
daily = (
    daily.assign(
        Date_Confirmed=pd.to_datetime(daily.Date_Confirmed, format='%Y-%m-%d', errors='coerce')
    )
    .dropna(subset=['Date_Confirmed'])
    .set_index('Date_Confirmed')
)

In [11]:
# Apply rolling window

In [12]:
def _seven_row_mean(frame):
    """Drop incomplete rows, average over a trailing 7-row window, round,
    and turn the index back into a regular column."""
    complete_rows = frame.dropna()
    windowed = complete_rows.rolling(window=7)
    averaged = windowed.mean()
    return averaged.round().reset_index()


# The first six rows of each result are NaN because the window is not full yet.
total = _seven_row_mean(total)
daily = _seven_row_mean(daily)

In [13]:
# Rename index

In [14]:
# reset_index() names the restored column 'index' only when the index had no
# name; if the column is already called 'Date_Confirmed' this is a no-op.
total = total.rename({'index': 'Date_Confirmed'}, axis='columns')
daily = daily.rename({'index': 'Date_Confirmed'}, axis='columns')

In [15]:
# Reshape to long format (one row per date and region)

In [16]:
# Reshape wide -> long: the date stays as the outer index level, the country
# columns become an inner index level named 'Region', and the values end up in
# a single column labelled 0.
# The rolling mean leaves NaN in the rows where the window was not yet full.
# The legacy stack() dropped those entries implicitly, but the newer pandas
# implementation keeps them, so they are dropped explicitly here.
tbl_total = (
    total.set_index(['Date_Confirmed'])
    .rename_axis(['Region'], axis=1)
    .stack()
    .dropna()
    .to_frame()
)

In [17]:
# Reshape wide -> long: the date stays as the outer index level, the country
# columns become an inner index level named 'Region', and the values end up in
# a single column labelled 0.
# The rolling mean leaves NaN in the rows where the window was not yet full.
# The legacy stack() dropped those entries implicitly, but the newer pandas
# implementation keeps them, so they are dropped explicitly here.
tbl_daily = (
    daily.set_index(['Date_Confirmed'])
    .rename_axis(['Region'], axis=1)
    .stack()
    .dropna()
    .to_frame()
)

In [18]:
# Inner-join the two long tables on their shared index levels. Both value
# columns are labelled 0, so pandas applies its default _x (left) and
# _y (right) suffixes to tell them apart.
tbl = tbl_total.merge(
    tbl_daily,
    on=['Date_Confirmed', 'Region'],
    how='inner',
)

In [19]:
# Inspect the merged table; at this point the two value columns still carry
# the merge-generated names 0_x and 0_y.
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,0_x,0_y
Date_Confirmed,Region,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-28,US,3.0,1.0
2020-01-28,United Kingdom,0.0,0.0
2020-01-28,Switzerland,0.0,0.0
2020-01-28,"Korea, South",2.0,0.0
2020-01-28,Italy,0.0,0.0
...,...,...,...
2020-05-21,"Korea, South",11086.0,18.0
2020-05-21,Italy,226005.0,701.0
2020-05-21,India,101312.0,5176.0
2020-05-21,Australia,883.0,1.0


In [20]:
# Give the merge-suffixed columns descriptive names:
# left side (0_x) -> total, right side (0_y) -> daily.
tbl = tbl.rename(
    {'0_x': 'total', '0_y': 'daily'},
    axis='columns',
)

In [21]:
# Check the first rows to confirm the columns are now named total and daily.
tbl.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total,daily
Date_Confirmed,Region,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-28,US,3.0,1.0
2020-01-28,United Kingdom,0.0,0.0
2020-01-28,Switzerland,0.0,0.0
2020-01-28,"Korea, South",2.0,0.0
2020-01-28,Italy,0.0,0.0


In [22]:
# Export in long (not wide) layout, as expected by this plotly express
# animation template:
#   https://plotly.com/python/animations/#using-a-slider-and-buttons
# Example of the target layout:
#   https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv
# reset_index() turns the Date_Confirmed / Region index levels back into
# ordinary columns so they are written to the file (index=False omits only
# the fresh integer index).
tbl_long = tbl.reset_index()
tbl_long.to_csv(
    path_or_buf='output_data/rolling-time-series.csv',
    index=False,
)