# Import datasets taken from MOH

In [1]:
import pandas as pd

In [2]:
cases_df = pd.read_csv("cases_state.csv")
cases_df.head()

Unnamed: 0,date,state,cases_new,cases_import,cases_recovered,cases_active,cases_cluster,cases_pvax,cases_fvax,cases_child,cases_adolescent,cases_adult,cases_elderly
0,2020-01-25,Johor,4,4,0,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2020-01-25,Kedah,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-25,Kelantan,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-25,Melaka,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-25,Negeri Sembilan,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
deaths_df = pd.read_csv("deaths_state.csv")
deaths_df.head()

Unnamed: 0,date,state,deaths_new,deaths_bid,deaths_new_dod,deaths_bid_dod,deaths_pvax,deaths_fvax,deaths_tat
0,2020-03-17,Johor,1,0,1,0,0,0,0
1,2020-03-17,Kedah,0,0,0,0,0,0,0
2,2020-03-17,Kelantan,0,0,0,0,0,0,0
3,2020-03-17,Melaka,0,0,0,0,0,0,0
4,2020-03-17,Negeri Sembilan,0,0,0,0,0,0,0


In [4]:
beds_df = pd.read_csv("hospital.csv")
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_noncrit,admitted_pui,admitted_covid,admitted_total,discharged_pui,discharged_covid,discharged_total,hosp_covid,hosp_pui,hosp_noncovid
0,2020-03-24,Johor,1440,434,1408,0,0,0,0,0,0,18,9,1
1,2020-03-24,Kedah,1218,143,1183,0,0,0,0,0,0,46,5,12
2,2020-03-24,Kelantan,1463,280,1424,9,6,15,1,0,1,78,58,23
3,2020-03-24,Melaka,1091,82,1065,0,0,0,0,0,0,19,10,12
4,2020-03-24,Negeri Sembilan,1223,253,1205,0,0,0,0,0,0,90,13,8


In [5]:
icu_df = pd.read_csv("icu.csv")
icu_df.head()

Unnamed: 0,date,state,beds_icu,beds_icu_rep,beds_icu_total,beds_icu_covid,vent,vent_port,icu_covid,icu_pui,icu_noncovid,vent_covid,vent_pui,vent_noncovid,vent_used,vent_port_used
0,2020-03-24,Johor,10,22,32,10,11,2,0,0,1,0,0,0,8,0
1,2020-03-24,Kedah,35,0,35,16,34,0,2,1,12,1,1,0,19,0
2,2020-03-24,Kelantan,39,0,39,20,42,4,1,0,20,1,0,0,26,0
3,2020-03-24,Melaka,26,0,26,12,23,4,0,2,12,0,1,0,13,5
4,2020-03-24,Negeri Sembilan,18,0,18,10,14,9,0,0,8,0,0,0,1,6


Drop extra columns and convert date to datetime

In [6]:
cases_df = cases_df.loc[:, cases_df.columns.intersection(['date','state', 'cases_new'])]
cases_df['date'] = pd.to_datetime(cases_df['date'])
cases_df.head()

Unnamed: 0,date,state,cases_new
0,2020-01-25,Johor,4
1,2020-01-25,Kedah,0
2,2020-01-25,Kelantan,0
3,2020-01-25,Melaka,0
4,2020-01-25,Negeri Sembilan,0


In [7]:
deaths_df = deaths_df.loc[:, deaths_df.columns.intersection(['date','state', 'deaths_new'])]
deaths_df['date'] = pd.to_datetime(deaths_df['date'])
deaths_df.head()

Unnamed: 0,date,state,deaths_new
0,2020-03-17,Johor,1
1,2020-03-17,Kedah,0
2,2020-03-17,Kelantan,0
3,2020-03-17,Melaka,0
4,2020-03-17,Negeri Sembilan,0


In [8]:
beds_df = pd.merge(beds_df, icu_df, on=['date','state'])
beds_df = beds_df.loc[:, beds_df.columns.intersection(['date','state', 'beds', 'beds_covid', 'beds_icu_covid'])]
beds_df['date'] = pd.to_datetime(beds_df['date'])
beds_df.head()

Unnamed: 0,date,state,beds,beds_covid,beds_icu_covid
0,2020-03-24,Johor,1440,434,10
1,2020-03-24,Kedah,1218,143,16
2,2020-03-24,Kelantan,1463,280,20
3,2020-03-24,Melaka,1091,82,12
4,2020-03-24,Negeri Sembilan,1223,253,10


# Combine datasets

In [9]:
from datetime import date, timedelta, datetime
# format: y/m/d
sdate = date(2020,1,1)
edate = date(2021,11,4)

states = list(cases_df['state'].unique())

new_df=pd.DataFrame(columns=['date','state', 'cases_new', 'deaths_new', 'beds', 'beds_covid', 'beds_icu_covid'])
row_count = 0
for date in pd.date_range(sdate,edate-timedelta(days=1),freq='d'):
    for state in states:
        row_count += 1
        new_df.loc[row_count, 'date'] = datetime.strftime(date, '%Y-%m-%d')
        new_df.loc[row_count, 'state'] = state
# new_df['date'] = pd.date_range(sdate,edate-timedelta(days=1),freq='d')
new_df.head(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
1,2020-01-01,Johor,,,,,
2,2020-01-01,Kedah,,,,,
3,2020-01-01,Kelantan,,,,,
4,2020-01-01,Melaka,,,,,
5,2020-01-01,Negeri Sembilan,,,,,
6,2020-01-01,Pahang,,,,,
7,2020-01-01,Perak,,,,,
8,2020-01-01,Perlis,,,,,
9,2020-01-01,Pulau Pinang,,,,,
10,2020-01-01,Sabah,,,,,


Take data from old datasets into new datasets.

In [10]:
import datetime
def transpose(new_df, old_df, cols):
    for index, row in new_df.iterrows():
        current_date = new_df.loc[index, 'date']
        current_state = new_df.loc[index, 'state']
        for index1, row1 in old_df.loc[old_df['date'] == current_date].iterrows():
            state = old_df.loc[index1, 'state']
            if current_state == state:
                for col in cols:
                    new_df.loc[index, col] = old_df.loc[index1, col]

transpose(new_df, cases_df, ['cases_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10749,2021-11-02,Terengganu,217,,,,
10750,2021-11-02,W.P. Kuala Lumpur,259,,,,
10751,2021-11-02,W.P. Labuan,0,,,,
10752,2021-11-02,W.P. Putrajaya,33,,,,
10753,2021-11-03,Johor,441,,,,
10754,2021-11-03,Kedah,381,,,,
10755,2021-11-03,Kelantan,565,,,,
10756,2021-11-03,Melaka,154,,,,
10757,2021-11-03,Negeri Sembilan,156,,,,
10758,2021-11-03,Pahang,316,,,,


In [11]:
transpose(new_df, deaths_df, ['deaths_new'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10749,2021-11-02,Terengganu,217,1,,,
10750,2021-11-02,W.P. Kuala Lumpur,259,1,,,
10751,2021-11-02,W.P. Labuan,0,0,,,
10752,2021-11-02,W.P. Putrajaya,33,0,,,
10753,2021-11-03,Johor,441,8,,,
10754,2021-11-03,Kedah,381,3,,,
10755,2021-11-03,Kelantan,565,9,,,
10756,2021-11-03,Melaka,154,2,,,
10757,2021-11-03,Negeri Sembilan,156,1,,,
10758,2021-11-03,Pahang,316,0,,,


In [12]:
transpose(new_df, beds_df, ['beds', 'beds_covid', 'beds_icu_covid'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid
10749,2021-11-02,Terengganu,217,1,1557,491,37
10750,2021-11-02,W.P. Kuala Lumpur,259,1,3963,608,43
10751,2021-11-02,W.P. Labuan,0,0,122,12,7
10752,2021-11-02,W.P. Putrajaya,33,0,609,84,6
10753,2021-11-03,Johor,441,8,5160,2753,102
10754,2021-11-03,Kedah,381,3,2664,1259,130
10755,2021-11-03,Kelantan,565,9,2771,791,85
10756,2021-11-03,Melaka,154,2,1317,651,38
10757,2021-11-03,Negeri Sembilan,156,1,1749,1084,88
10758,2021-11-03,Pahang,316,0,2066,709,81


In [13]:
new_df.dtypes

date              object
state             object
cases_new         object
deaths_new        object
beds              object
beds_covid        object
beds_icu_covid    object
dtype: object

In [14]:
new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]] = new_df[["cases_new", "deaths_new", "beds", "beds_covid", "beds_icu_covid"]].apply(pd.to_numeric)
new_df.dtypes

date               object
state              object
cases_new         float64
deaths_new        float64
beds              float64
beds_covid        float64
beds_icu_covid    float64
dtype: object

In [15]:
new_df=new_df.sort_values(['date']).reset_index(drop=True)
new_df["cum_cases"]=new_df.groupby(['state'])['cases_new'].cumsum(axis=0)
new_df["cum_deaths"]=new_df.groupby(['state'])['deaths_new'].cumsum(axis=0)
new_df.sort_values(['date', 'state'])
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
10748,2021-11-02,Kelantan,688.0,11.0,2771.0,791.0,85.0,140218.0,1038.0
10749,2021-11-02,Kedah,347.0,6.0,2664.0,1259.0,130.0,149349.0,1989.0
10750,2021-11-02,Johor,419.0,4.0,5160.0,2753.0,102.0,218964.0,3663.0
10751,2021-11-02,Perlis,54.0,0.0,487.0,190.0,11.0,5531.0,115.0
10752,2021-11-03,W.P. Kuala Lumpur,199.0,1.0,3963.0,608.0,43.0,194380.0,2561.0
10753,2021-11-03,Terengganu,209.0,1.0,1557.0,491.0,37.0,70311.0,465.0
10754,2021-11-03,Selangor,1167.0,1.0,5338.0,1330.0,156.0,717629.0,9614.0
10755,2021-11-03,Sarawak,540.0,10.0,4055.0,1552.0,161.0,242592.0,1380.0
10756,2021-11-03,Sabah,675.0,4.0,4103.0,1689.0,229.0,219566.0,2537.0
10757,2021-11-03,Pulau Pinang,246.0,2.0,1985.0,638.0,75.0,145687.0,1609.0


In [16]:
new_df['state'] = new_df['state'].replace(['Pulau Pinang'],'Penang')
new_df['state'] = new_df['state'].replace(['W.P. Kuala Lumpur'],'Kuala Lumpur')
new_df['state'] = new_df['state'].replace(['W.P. Labuan'],'Labuan')
new_df['state'] = new_df['state'].replace(['W.P. Putrajaya'],'Putrajaya')
new_df['state'] = new_df['state'].replace(['Melaka'],'Malacca')
new_df.tail(20)

Unnamed: 0,date,state,cases_new,deaths_new,beds,beds_covid,beds_icu_covid,cum_cases,cum_deaths
10748,2021-11-02,Kelantan,688.0,11.0,2771.0,791.0,85.0,140218.0,1038.0
10749,2021-11-02,Kedah,347.0,6.0,2664.0,1259.0,130.0,149349.0,1989.0
10750,2021-11-02,Johor,419.0,4.0,5160.0,2753.0,102.0,218964.0,3663.0
10751,2021-11-02,Perlis,54.0,0.0,487.0,190.0,11.0,5531.0,115.0
10752,2021-11-03,Kuala Lumpur,199.0,1.0,3963.0,608.0,43.0,194380.0,2561.0
10753,2021-11-03,Terengganu,209.0,1.0,1557.0,491.0,37.0,70311.0,465.0
10754,2021-11-03,Selangor,1167.0,1.0,5338.0,1330.0,156.0,717629.0,9614.0
10755,2021-11-03,Sarawak,540.0,10.0,4055.0,1552.0,161.0,242592.0,1380.0
10756,2021-11-03,Sabah,675.0,4.0,4103.0,1689.0,229.0,219566.0,2537.0
10757,2021-11-03,Penang,246.0,2.0,1985.0,638.0,75.0,145687.0,1609.0


In [17]:
new_df.to_csv("daily.csv", index=False)