# Import datasets taken from MOH

In [1]:
import pandas as pd

In [2]:
cases_df = pd.read_csv("cases_state.csv")
cases_df.head()

Unnamed: 0,date,state,cases_new,cases_import,cases_recovered,cases_active,cases_cluster,cases_unvax,cases_pvax,cases_fvax,...,cases_0_4,cases_5_11,cases_12_17,cases_18_29,cases_30_39,cases_40_49,cases_50_59,cases_60_69,cases_70_79,cases_80
0,2020-01-25,Johor,4,4,0,4,0,4,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2020-01-25,Kedah,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-01-25,Kelantan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2020-01-25,Melaka,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2020-01-25,Negeri Sembilan,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
deaths_df = pd.read_csv("deaths_state.csv")
deaths_df.head()

Unnamed: 0,date,state,deaths_new,deaths_bid,deaths_new_dod,deaths_bid_dod,deaths_unvax,deaths_pvax,deaths_fvax,deaths_boost,deaths_tat
0,2020-03-17,Johor,1,0,1,0,1,0,0,0,0
1,2020-03-17,Kedah,0,0,0,0,0,0,0,0,0
2,2020-03-17,Kelantan,0,0,0,0,0,0,0,0,0
3,2020-03-17,Melaka,0,0,0,0,0,0,0,0,0
4,2020-03-17,Negeri Sembilan,0,0,0,0,0,0,0,0,0


In [4]:
beds_df = pd.read_csv("hospital.csv")
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_noncrit,admitted_pui,admitted_covid,admitted_total,discharged_pui,discharged_covid,discharged_total,hosp_covid,hosp_pui,hosp_noncovid
0,2020-03-24,Johor,1440,434,1408,0,0,0,0,0,0,18,9,1
1,2020-03-24,Kedah,1218,143,1183,0,0,0,0,0,0,46,5,12
2,2020-03-24,Kelantan,1463,280,1424,9,6,15,1,0,1,78,58,23
3,2020-03-24,Melaka,1091,82,1065,0,0,0,0,0,0,19,10,12
4,2020-03-24,Negeri Sembilan,1223,253,1205,0,0,0,0,0,0,90,13,8


In [5]:
icu_df = pd.read_csv("icu.csv")
icu_df.head()

Unnamed: 0,date,state,beds_icu,beds_icu_rep,beds_icu_total,beds_icu_covid,vent,vent_port,icu_covid,icu_pui,icu_noncovid,vent_covid,vent_pui,vent_noncovid,vent_used,vent_port_used
0,2020-03-24,Johor,10,22,32,10,11,2,0,0,1,0,0,0,8,0
1,2020-03-24,Kedah,35,0,35,16,34,0,2,1,12,1,1,0,19,0
2,2020-03-24,Kelantan,39,0,39,20,42,4,1,0,20,1,0,0,26,0
3,2020-03-24,Melaka,26,0,26,12,23,4,0,2,12,0,1,0,13,5
4,2020-03-24,Negeri Sembilan,18,0,18,10,14,9,0,0,8,0,0,0,1,6


Drop extra columns and convert date to datetime

In [6]:
cases_df = cases_df.loc[:, cases_df.columns.intersection(['date','state', 'cases_new'])]
cases_df['date'] = pd.to_datetime(cases_df['date'])
cases_df.head()

Unnamed: 0,date,state,cases_new
0,2020-01-25,Johor,4
1,2020-01-25,Kedah,0
2,2020-01-25,Kelantan,0
3,2020-01-25,Melaka,0
4,2020-01-25,Negeri Sembilan,0


In [7]:
deaths_df = deaths_df.loc[:, deaths_df.columns.intersection(['date','state', 'deaths_new'])]
deaths_df['date'] = pd.to_datetime(deaths_df['date'])
deaths_df.head()

Unnamed: 0,date,state,deaths_new
0,2020-03-17,Johor,1
1,2020-03-17,Kedah,0
2,2020-03-17,Kelantan,0
3,2020-03-17,Melaka,0
4,2020-03-17,Negeri Sembilan,0


In [8]:
beds_df = pd.merge(beds_df, icu_df, on=['date','state'])
beds_df = beds_df.loc[:, beds_df.columns.intersection(['date','state', 'beds', 'beds_covid', 'beds_icu_covid'])]
beds_df['date'] = pd.to_datetime(beds_df['date'])
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_icu_covid
0,2020-03-24,Johor,1440,434,10
1,2020-03-24,Kedah,1218,143,16
2,2020-03-24,Kelantan,1463,280,20
3,2020-03-24,Melaka,1091,82,12
4,2020-03-24,Negeri Sembilan,1223,253,10


# Combine datasets

In [9]:
from datetime import date, timedelta, datetime
# format: y/m/d
sdate = date(2020,1,1)
edate = date(2022,2,21)

states = list(cases_df['state'].unique())

new_df=pd.DataFrame(columns=['date','state', 'cases_new', 'deaths_new', 'beds', 'beds_covid', 'beds_icu_covid'])
row_count = 0
for date in pd.date_range(sdate,edate-timedelta(days=1),freq='d'):
    for state in states:
        row_count += 1
        new_df.loc[row_count, 'date'] = datetime.strftime(date, '%Y-%m-%d')
        new_df.loc[row_count, 'state'] = state
# new_df['date'] = pd.date_range(sdate,edate-timedelta(days=1),freq='d')
new_df.head(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
1,2020-01-01,Johor,,,,,
2,2020-01-01,Kedah,,,,,
3,2020-01-01,Kelantan,,,,,
4,2020-01-01,Melaka,,,,,
5,2020-01-01,Negeri Sembilan,,,,,
6,2020-01-01,Pahang,,,,,
7,2020-01-01,Perak,,,,,
8,2020-01-01,Perlis,,,,,
9,2020-01-01,Pulau Pinang,,,,,
10,2020-01-01,Sabah,,,,,


Take data from old datasets into new datasets.

In [10]:
import datetime
def transpose(new_df, old_df, cols):
    for index, row in new_df.iterrows():
        current_date = new_df.loc[index, 'date']
        current_state = new_df.loc[index, 'state']
        for index1, row1 in old_df.loc[old_df['date'] == current_date].iterrows():
            state = old_df.loc[index1, 'state']
            if current_state == state:
                for col in cols:
                    new_df.loc[index, col] = old_df.loc[index1, col]

transpose(new_df, cases_df, ['cases_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
12493,2022-02-19,Terengganu,616,,,,
12494,2022-02-19,W.P. Kuala Lumpur,1247,,,,
12495,2022-02-19,W.P. Labuan,368,,,,
12496,2022-02-19,W.P. Putrajaya,144,,,,
12497,2022-02-20,Johor,2644,,,,
12498,2022-02-20,Kedah,2254,,,,
12499,2022-02-20,Kelantan,1605,,,,
12500,2022-02-20,Melaka,713,,,,
12501,2022-02-20,Negeri Sembilan,690,,,,
12502,2022-02-20,Pahang,1581,,,,


In [11]:
transpose(new_df, deaths_df, ['deaths_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
12493,2022-02-19,Terengganu,616,1,,,
12494,2022-02-19,W.P. Kuala Lumpur,1247,0,,,
12495,2022-02-19,W.P. Labuan,368,0,,,
12496,2022-02-19,W.P. Putrajaya,144,0,,,
12497,2022-02-20,Johor,2644,9,,,
12498,2022-02-20,Kedah,2254,6,,,
12499,2022-02-20,Kelantan,1605,0,,,
12500,2022-02-20,Melaka,713,1,,,
12501,2022-02-20,Negeri Sembilan,690,0,,,
12502,2022-02-20,Pahang,1581,2,,,


In [12]:
transpose(new_df, beds_df, ['beds', 'beds_covid', 'beds_icu_covid'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
12493,2022-02-19,Terengganu,616,1,1593,401,40
12494,2022-02-19,W.P. Kuala Lumpur,1247,0,3963,582,22
12495,2022-02-19,W.P. Labuan,368,0,202,98,7
12496,2022-02-19,W.P. Putrajaya,144,0,609,73,6
12497,2022-02-20,Johor,2644,9,5209,1473,46
12498,2022-02-20,Kedah,2254,6,2664,1259,130
12499,2022-02-20,Kelantan,1605,0,2684,410,36
12500,2022-02-20,Melaka,713,1,1315,651,24
12501,2022-02-20,Negeri Sembilan,690,0,1749,1084,61
12502,2022-02-20,Pahang,1581,2,1992,655,81


In [13]:
new_df.dtypes

date              object
state             object
cases_new         object
deaths_new        object
beds              object
beds_covid        object
beds_icu_covid    object
dtype: object

In [14]:
new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]] = new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]].apply(pd.to_numeric)
new_df.dtypes

date               object
state              object
cases_new         float64
deaths_new        float64
beds              float64
beds_covid        float64
beds_icu_covid    float64
dtype: object

In [15]:
new_df=new_df.sort_values(['date']).reset_index(drop=True)
new_df["cum_cases"]=new_df.groupby(['state'])['cases_new'].cumsum(axis=0)
new_df["cum_deaths"]=new_df.groupby(['state'])['deaths_new'].cumsum(axis=0)
new_df.sort_values(['date', 'state'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
12492,2022-02-19,Kelantan,1885.0,1.0,2684.0,410.0,36.0,199711.0,1293.0
12493,2022-02-19,Kedah,2429.0,2.0,2664.0,1259.0,130.0,209684.0,2213.0
12494,2022-02-19,Johor,2715.0,8.0,5209.0,1473.0,46.0,298900.0,3977.0
12495,2022-02-19,Perlis,191.0,0.0,503.0,111.0,11.0,9900.0,139.0
12496,2022-02-20,W.P. Kuala Lumpur,1428.0,0.0,3963.0,582.0,26.0,239491.0,2706.0
12497,2022-02-20,Terengganu,608.0,0.0,1593.0,421.0,40.0,91686.0,769.0
12498,2022-02-20,Selangor,7180.0,7.0,5378.0,1203.0,74.0,896324.0,10118.0
12499,2022-02-20,Sarawak,309.0,0.0,3571.0,678.0,71.0,256377.0,1623.0
12500,2022-02-20,Sabah,5007.0,4.0,4342.0,1717.0,122.0,308476.0,2904.0
12501,2022-02-20,Pulau Pinang,1811.0,3.0,2057.0,597.0,42.0,192580.0,1787.0


In [16]:
new_df['state'] = new_df['state'].replace(['Pulau Pinang'],'Penang')
new_df['state'] = new_df['state'].replace(['W.P. Kuala Lumpur'],'Kuala Lumpur')
new_df['state'] = new_df['state'].replace(['W.P. Labuan'],'Labuan')
new_df['state'] = new_df['state'].replace(['W.P. Putrajaya'],'Putrajaya')
new_df['state'] = new_df['state'].replace(['Melaka'],'Malacca')
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
12492,2022-02-19,Kelantan,1885.0,1.0,2684.0,410.0,36.0,199711.0,1293.0
12493,2022-02-19,Kedah,2429.0,2.0,2664.0,1259.0,130.0,209684.0,2213.0
12494,2022-02-19,Johor,2715.0,8.0,5209.0,1473.0,46.0,298900.0,3977.0
12495,2022-02-19,Perlis,191.0,0.0,503.0,111.0,11.0,9900.0,139.0
12496,2022-02-20,Kuala Lumpur,1428.0,0.0,3963.0,582.0,26.0,239491.0,2706.0
12497,2022-02-20,Terengganu,608.0,0.0,1593.0,421.0,40.0,91686.0,769.0
12498,2022-02-20,Selangor,7180.0,7.0,5378.0,1203.0,74.0,896324.0,10118.0
12499,2022-02-20,Sarawak,309.0,0.0,3571.0,678.0,71.0,256377.0,1623.0
12500,2022-02-20,Sabah,5007.0,4.0,4342.0,1717.0,122.0,308476.0,2904.0
12501,2022-02-20,Penang,1811.0,3.0,2057.0,597.0,42.0,192580.0,1787.0


In [17]:
new_df.to_csv("daily.csv", index=False)