In [None]:
import pandas as pd
import pathlib
import numpy as np
import plotly.express as px

In [None]:
# Relative location of the raw WHO global COVID-19 case export (CSV)
data_path = pathlib.Path(
    '../data/raw-case-data/WHO-COVID-19-global-data.csv'
)

In [None]:
# Load the raw WHO export from data_path into a dataframe
who_data = pd.read_csv(filepath_or_buffer=data_path)

# need new cases only for each date, for each country - Malaysia, Philippines and Viet Nam

In [None]:
# Need new cases only for each date, for each country - Malaysia, Philippines and Viet Nam
countries = ['Malaysia', 'Philippines', 'Viet Nam']

# Build the filtered frame in one chain so every step returns a new object.
# Assigning columns directly onto a boolean `.loc` selection triggers pandas'
# SettingWithCopyWarning, so the columns are set via `.assign` instead.
target_data = (
    who_data.loc[who_data['Country'].isin(countries)]
    .assign(
        # parse the report date so it can be used as a datetime index
        Date_reported=lambda frame: pd.to_datetime(frame['Date_reported']),
        # convert NaNs in the new-case counts to 0
        New_cases=lambda frame: frame['New_cases'].fillna(0),
    )
    # set date column to index
    .set_index('Date_reported')
    # drop unneeded columns
    .drop(columns=['Country_code', 'WHO_region', 'New_deaths', 'Cumulative_deaths'])
)

In [None]:
# Display the filtered frame for a visual check
target_data

In [None]:
# Split the filtered data into one frame per country, giving the case columns
# country-specific names so the frames can be joined side by side.
def _country_frame(country, column_names):
    """Return the rows of `target_data` for `country`, with columns renamed
    according to `column_names` and the 'Country' column removed."""
    rows = target_data.loc[target_data['Country'] == country]
    return rows.rename(columns=column_names).drop(columns=['Country'])


# NOTE(review): Malaysia uses the prefix "New_cases_" while the Philippines and
# Viet Nam use "New_case_". The names are kept exactly as they are because
# anything reading these columns later may rely on them.

# Malaysia
mys_df = _country_frame(
    'Malaysia',
    {"New_cases": "New_cases_MYS", "Cumulative_cases": "Cumulative_cases_MYS"},
)

# Philippines
phl_df = _country_frame(
    'Philippines',
    {"New_cases": "New_case_PHL", "Cumulative_cases": "Cumulative_cases_PHL"},
)

# Vietnam
vnm_df = _country_frame(
    'Viet Nam',
    {"New_cases": "New_case_VNM", "Cumulative_cases": "Cumulative_cases_VNM"},
)

In [None]:
# Inner-join the three per-country frames on their shared index
index_join = dict(how='inner', left_index=True, right_index=True)
merge = mys_df.merge(phl_df, **index_join)
raw_case_df = merge.merge(vnm_df, **index_join)

In [None]:
# Exploring different solutions for converting weekly data to daily data, using Malaysia as the worked example

In [None]:
# solution 1. take weekly cumulative case series - interpolate and take the difference

# Take only the cumulative series for Malaysia, straight from target_data.
# The per-country mys_df carries suffixed column names (New_cases_MYS /
# Cumulative_cases_MYS) and has no 'MYS' column to drop, so selecting from
# target_data keeps the plain 'Cumulative_cases' name that the diff step reads
# and makes this cell safe to re-run.
mys_df = target_data.loc[target_data['Country'] == 'Malaysia', ['Cumulative_cases']]

# then upsample to add days and interpolate between the reported values
mys_df = mys_df.resample('D').interpolate()


In [None]:
# New cases = change in the cumulative series from one row to the next
mys_df = mys_df.assign(new_cases=mys_df['Cumulative_cases'].diff(periods=1))

In [None]:
# Display the Malaysia frame for a visual check
mys_df

In [None]:
# Scatter the new-case series from solution 1 against its date index
fig = px.scatter(x=mys_df.index, y=mys_df['new_cases'])

# Title and label the axes explicitly so the figure can be read on its own
fig.update_layout(
    title='Malaysia: new cases from differenced, interpolated cumulative series',
    xaxis_title='Date reported',
    yaxis_title='New cases',
)

fig.show()

In [None]:
# Second approach: start again from the Malaysia rows of target_data and keep
# only the new-case column, renamed to "MYS"
mys_df_2 = (
    target_data.loc[target_data['Country'] == 'Malaysia']
    .rename(columns={"New_cases": "MYS"})
    .drop(columns=['Country', 'Cumulative_cases'])
)

In [None]:
# Upsample to daily frequency, interpolate between the reported values, then
# divide every value by 7.
# NOTE(review): presumably 7 = days per reporting week - confirm.
# NOTE: this rebinds mys_df_2, so re-running the cell divides by 7 again.
mys_df_2 = mys_df_2.resample('D').interpolate().div(7)

In [None]:
# Scatter the second approach's series against its date index
fig = px.scatter(x=mys_df_2.index, y=mys_df_2['MYS'])

# Title and label the axes explicitly so the figure can be read on its own
fig.update_layout(
    title='Malaysia: interpolated new cases divided by 7',
    xaxis_title='Date reported',
    yaxis_title='New cases',
)

fig.show()

In [None]:
# Output location for the processed case data (CSV)
save_path = pathlib.Path(
    '../data/target-data/case-data.csv'
)

In [None]:
# Make sure the destination folder exists first; to_csv does not create
# missing directories and would fail on a fresh checkout without it.
save_path.parent.mkdir(parents=True, exist_ok=True)

# Write the joined table, keeping the index as the first CSV column
raw_case_df.to_csv(save_path, index = True)

In [None]:
# Read the saved CSV back in to check what was written
df = pd.read_csv(filepath_or_buffer=save_path)

In [None]:
# Display the frame read back from save_path
df