# Import datasets taken from MOH

In [1]:
import pandas as pd

In [2]:
cases_df = pd.read_csv("cases_state.csv")
cases_df.head()

Unnamed: 0,date,state,cases_new,cases_import,cases_recovered,cases_active,cases_cluster,cases_pvax,cases_fvax,cases_child,cases_adolescent,cases_adult,cases_elderly
0,2020-01-25,Johor,4,4,0,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2020-01-25,Kedah,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,Kelantan,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-25,Melaka,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-25,Negeri Sembilan,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
deaths_df = pd.read_csv("deaths_state.csv")
deaths_df.head()

Unnamed: 0,date,state,deaths_new,deaths_bid,deaths_new_dod,deaths_bid_dod,deaths_pvax,deaths_fvax,deaths_tat
0,2020-03-17,Johor,1,0,1,0,0,0,0
1,2020-03-17,Kedah,0,0,0,0,0,0,0
2,2020-03-17,Kelantan,0,0,0,0,0,0,0
3,2020-03-17,Melaka,0,0,0,0,0,0,0
4,2020-03-17,Negeri Sembilan,0,0,0,0,0,0,0


In [4]:
beds_df = pd.read_csv("hospital.csv")
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_noncrit,admitted_pui,admitted_covid,admitted_total,discharged_pui,discharged_covid,discharged_total,hosp_covid,hosp_pui,hosp_noncovid
0,2020-03-24,Johor,1440,434,1408,0,0,0,0,0,0,18,9,1
1,2020-03-24,Kedah,1218,143,1183,0,0,0,0,0,0,46,5,12
2,2020-03-24,Kelantan,1463,280,1424,9,6,15,1,0,1,78,58,23
3,2020-03-24,Melaka,1091,82,1065,0,0,0,0,0,0,19,10,12
4,2020-03-24,Negeri Sembilan,1223,253,1205,0,0,0,0,0,0,90,13,8


In [5]:
icu_df = pd.read_csv("icu.csv")
icu_df.head()

Unnamed: 0,date,state,beds_icu,beds_icu_rep,beds_icu_total,beds_icu_covid,vent,vent_port,icu_covid,icu_pui,icu_noncovid,vent_covid,vent_pui,vent_noncovid,vent_used,vent_port_used
0,2020-03-24,Johor,10,22,32,10,11,2,0,0,1,0,0,0,8,0
1,2020-03-24,Kedah,35,0,35,16,34,0,2,1,12,1,1,0,19,0
2,2020-03-24,Kelantan,39,0,39,20,42,4,1,0,20,1,0,0,26,0
3,2020-03-24,Melaka,26,0,26,12,23,4,0,2,12,0,1,0,13,5
4,2020-03-24,Negeri Sembilan,18,0,18,10,14,9,0,0,8,0,0,0,1,6


Drop extra columns and convert date to datetime

In [6]:
cases_df = cases_df.loc[:, cases_df.columns.intersection(['date','state', 'cases_new'])]
cases_df['date'] = pd.to_datetime(cases_df['date'])
cases_df.head()

Unnamed: 0,date,state,cases_new
0,2020-01-25,Johor,4
1,2020-01-25,Kedah,0
2,2020-01-25,Kelantan,0
3,2020-01-25,Melaka,0
4,2020-01-25,Negeri Sembilan,0


In [7]:
deaths_df = deaths_df.loc[:, deaths_df.columns.intersection(['date','state', 'deaths_new'])]
deaths_df['date'] = pd.to_datetime(deaths_df['date'])
deaths_df.head()

Unnamed: 0,date,state,deaths_new
0,2020-03-17,Johor,1
1,2020-03-17,Kedah,0
2,2020-03-17,Kelantan,0
3,2020-03-17,Melaka,0
4,2020-03-17,Negeri Sembilan,0


In [8]:
beds_df = pd.merge(beds_df, icu_df, on=['date','state'])
beds_df = beds_df.loc[:, beds_df.columns.intersection(['date','state', 'beds', 'beds_covid', 'beds_icu_covid'])]
beds_df['date'] = pd.to_datetime(beds_df['date'])
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_icu_covid
0,2020-03-24,Johor,1440,434,10
1,2020-03-24,Kedah,1218,143,16
2,2020-03-24,Kelantan,1463,280,20
3,2020-03-24,Melaka,1091,82,12
4,2020-03-24,Negeri Sembilan,1223,253,10


# Combine datasets

In [9]:
from datetime import date, timedelta, datetime
# format: y/m/d
sdate = date(2020,1,1)
edate = date(2021,10,10)

states = list(cases_df['state'].unique())

new_df=pd.DataFrame(columns=['date','state', 'cases_new', 'deaths_new', 'beds', 'beds_covid', 'beds_icu_covid'])
row_count = 0
for date in pd.date_range(sdate,edate-timedelta(days=1),freq='d'):
    for state in states:
        row_count += 1
        new_df.loc[row_count, 'date'] = datetime.strftime(date, '%Y-%m-%d')
        new_df.loc[row_count, 'state'] = state
# new_df['date'] = pd.date_range(sdate,edate-timedelta(days=1),freq='d')
new_df.head(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
1,2020-01-01,Johor,,,,,
2,2020-01-01,Kedah,,,,,
3,2020-01-01,Kelantan,,,,,
4,2020-01-01,Melaka,,,,,
5,2020-01-01,Negeri Sembilan,,,,,
6,2020-01-01,Pahang,,,,,
7,2020-01-01,Perak,,,,,
8,2020-01-01,Perlis,,,,,
9,2020-01-01,Pulau Pinang,,,,,
10,2020-01-01,Sabah,,,,,


Take data from old datasets into new datasets.

In [10]:
import datetime
def transpose(new_df, old_df, cols):
    for index, row in new_df.iterrows():
        current_date = new_df.loc[index, 'date']
        current_state = new_df.loc[index, 'state']
        for index1, row1 in old_df.loc[old_df['date'] == current_date].iterrows():
            state = old_df.loc[index1, 'state']
            if current_state == state:
                for col in cols:
                    new_df.loc[index, col] = old_df.loc[index1, col]

transpose(new_df, cases_df, ['cases_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10333,2021-10-07,Terengganu,699,,,,
10334,2021-10-07,W.P. Kuala Lumpur,250,,,,
10335,2021-10-07,W.P. Labuan,0,,,,
10336,2021-10-07,W.P. Putrajaya,59,,,,
10337,2021-10-08,Johor,864,,,,
10338,2021-10-08,Kedah,578,,,,
10339,2021-10-08,Kelantan,1052,,,,
10340,2021-10-08,Melaka,244,,,,
10341,2021-10-08,Negeri Sembilan,243,,,,
10342,2021-10-08,Pahang,551,,,,


In [11]:
transpose(new_df, deaths_df, ['deaths_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10333,2021-10-07,Terengganu,699,6,,,
10334,2021-10-07,W.P. Kuala Lumpur,250,1,,,
10335,2021-10-07,W.P. Labuan,0,0,,,
10336,2021-10-07,W.P. Putrajaya,59,0,,,
10337,2021-10-08,Johor,864,13,,,
10338,2021-10-08,Kedah,578,4,,,
10339,2021-10-08,Kelantan,1052,8,,,
10340,2021-10-08,Melaka,244,1,,,
10341,2021-10-08,Negeri Sembilan,243,1,,,
10342,2021-10-08,Pahang,551,1,,,


In [12]:
transpose(new_df, beds_df, ['beds', 'beds_covid', 'beds_icu_covid'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10333,2021-10-07,Terengganu,699,6,1557,491,38
10334,2021-10-07,W.P. Kuala Lumpur,250,1,4201,786,63
10335,2021-10-07,W.P. Labuan,0,0,122,106,18
10336,2021-10-07,W.P. Putrajaya,59,0,609,84,6
10337,2021-10-08,Johor,864,13,5160,2753,110
10338,2021-10-08,Kedah,578,4,2664,1259,130
10339,2021-10-08,Kelantan,1052,8,2815,951,89
10340,2021-10-08,Melaka,244,1,1317,651,88
10341,2021-10-08,Negeri Sembilan,243,1,1749,1084,88
10342,2021-10-08,Pahang,551,1,2066,709,81


In [13]:
new_df.dtypes

date              object
state             object
cases_new         object
deaths_new        object
beds              object
beds_covid        object
beds_icu_covid    object
dtype: object

In [14]:
new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]] = new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]].apply(pd.to_numeric)
new_df.dtypes

date               object
state              object
cases_new         float64
deaths_new        float64
beds              float64
beds_covid        float64
beds_icu_covid    float64
dtype: object

In [15]:
new_df=new_df.sort_values(['date']).reset_index(drop=True)
new_df["cum_cases"]=new_df.groupby(['state'])['cases_new'].cumsum(axis=0)
new_df["cum_deaths"]=new_df.groupby(['state'])['deaths_new'].cumsum(axis=0)
new_df.sort_values(['date', 'state'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
10332,2021-10-07,Kelantan,1032.0,6.0,2815.0,951.0,89.0,122535.0,875.0
10333,2021-10-07,Kedah,503.0,3.0,2664.0,1259.0,130.0,139052.0,1891.0
10334,2021-10-07,Johor,993.0,15.0,5160.0,2753.0,110.0,202932.0,3400.0
10335,2021-10-07,Perlis,200.0,2.0,487.0,211.0,11.0,4422.0,84.0
10336,2021-10-08,W.P. Kuala Lumpur,313.0,2.0,4201.0,774.0,63.0,187864.0,2512.0
10337,2021-10-08,Terengganu,650.0,0.0,1557.0,491.0,38.0,61034.0,403.0
10338,2021-10-08,Selangor,1796.0,12.0,5902.0,1967.0,190.0,687966.0,9444.0
10339,2021-10-08,Sarawak,1339.0,8.0,4055.0,1377.0,152.0,222381.0,968.0
10340,2021-10-08,Sabah,705.0,12.0,4109.0,1797.0,227.0,204193.0,2326.0
10341,2021-10-08,Pulau Pinang,632.0,10.0,1985.0,826.0,93.0,135800.0,1460.0


In [16]:
new_df['state'] = new_df['state'].replace(['Pulau Pinang'],'Penang')
new_df['state'] = new_df['state'].replace(['W.P. Kuala Lumpur'],'Kuala Lumpur')
new_df['state'] = new_df['state'].replace(['W.P. Labuan'],'Labuan')
new_df['state'] = new_df['state'].replace(['W.P. Putrajaya'],'Putrajaya')
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
10332,2021-10-07,Kelantan,1032.0,6.0,2815.0,951.0,89.0,122535.0,875.0
10333,2021-10-07,Kedah,503.0,3.0,2664.0,1259.0,130.0,139052.0,1891.0
10334,2021-10-07,Johor,993.0,15.0,5160.0,2753.0,110.0,202932.0,3400.0
10335,2021-10-07,Perlis,200.0,2.0,487.0,211.0,11.0,4422.0,84.0
10336,2021-10-08,Kuala Lumpur,313.0,2.0,4201.0,774.0,63.0,187864.0,2512.0
10337,2021-10-08,Terengganu,650.0,0.0,1557.0,491.0,38.0,61034.0,403.0
10338,2021-10-08,Selangor,1796.0,12.0,5902.0,1967.0,190.0,687966.0,9444.0
10339,2021-10-08,Sarawak,1339.0,8.0,4055.0,1377.0,152.0,222381.0,968.0
10340,2021-10-08,Sabah,705.0,12.0,4109.0,1797.0,227.0,204193.0,2326.0
10341,2021-10-08,Penang,632.0,10.0,1985.0,826.0,93.0,135800.0,1460.0


In [17]:
new_df.to_csv("daily.csv", index=False)