# pandemic

This is a simple Jupyter Notebook for Covid-19 Pandemic Analysis.

Load packages etc...

In [5]:
import pandas as pd
import numpy as np

Download data from `CSSEGISandData`:

In [6]:
from urllib import request
confirmed_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file = './data/raw/confirmed.csv'
request.urlretrieve(confirmed_url, confirmed_file)
deaths_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file = './data/raw/deaths.csv'
request.urlretrieve(deaths_url, deaths_file)

('./data/raw/deaths.csv', <http.client.HTTPMessage at 0x26938c80ac0>)

Load data into dataframes:

In [7]:
confirmed = pd.read_csv(confirmed_file, index_col=False)
deaths = pd.read_csv(deaths_file, index_col=False)

Show some info about data:

In [8]:
confirmed.info()
confirmed.describe()

deaths.info()
deaths.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Columns: 677 entries, Province/State to 11/24/21
dtypes: float64(2), int64(673), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Columns: 677 entries, Province/State to 11/24/21
dtypes: float64(2), int64(673), object(2)
memory usage: 1.4+ MB


Unnamed: 0,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,11/15/21,11/16/21,11/17/21,11/18/21,11/19/21,11/20/21,11/21/21,11/22/21,11/23/21,11/24/21
count,278.0,278.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,...,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0
mean,20.156042,21.788955,0.060714,0.064286,0.092857,0.15,0.2,0.292857,0.467857,0.475,...,18240.192857,18267.407143,18300.007143,18329.292857,18358.996429,18380.239286,18395.957143,18421.782143,18450.460714,18480.932143
std,25.283318,76.200169,1.015944,1.017487,1.436326,2.391517,3.109011,4.5429,7.470302,7.470809,...,72167.456781,72249.836109,72360.662293,72454.594348,72564.500849,72614.567863,72647.989368,72724.910851,72820.31755,72925.654515
min,-51.7963,-178.1165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.643279,-37.713675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21.75,21.75,21.75,21.75,21.75,21.75,21.75,21.75,21.75,22.0
50%,21.51717,20.921188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,663.0,665.0,665.0,666.0,666.0,666.0,666.5,669.0,671.5,674.5
75%,40.39335,84.992575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6736.25,6755.25,6773.5,6787.0,6799.5,6816.75,6824.25,6835.75,6845.75,6851.0
max,71.7069,178.065,17.0,17.0,24.0,40.0,52.0,76.0,125.0,125.0,...,764533.0,765811.0,767433.0,768695.0,770691.0,771013.0,771118.0,772344.0,773770.0,775403.0


Show some top data about `Poland`:

In [9]:
confirmed[confirmed['Country/Region'] == 'Poland'].head(10)
deaths[deaths['Country/Region'] == 'Poland'].head(10)


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/15/21,11/16/21,11/17/21,11/18/21,11/19/21,11/20/21,11/21/21,11/22/21,11/23/21,11/24/21
213,,Poland,51.9194,19.1451,0,0,0,0,0,0,...,78879,79161,79624,79994,80399,80781,80822,80830,81228,81688


Clean up data:

In [10]:
confirmed = confirmed[confirmed['Country/Region'] == 'Poland']
confirmed = confirmed.drop(columns=['Province/State','Lat','Long', 'Country/Region'])
deaths = deaths[deaths['Country/Region'] == 'Poland']
deaths = deaths.drop(columns=['Province/State','Lat','Long', 'Country/Region'])

Calculate differences:

In [11]:
confirmed_daily = confirmed.diff(periods=1, axis=1)
confirmed_daily = confirmed_daily.replace(np.nan, 0)
deaths_daily = deaths.diff(periods=1, axis=1)
deaths_daily = deaths_daily.replace(np.nan, 0)


Unpivot data:

In [12]:
confirmed_total = confirmed.melt(
        var_name="date",
        value_name="total_cases")

confirmed_daily = confirmed_daily.melt(
        var_name="date",
        value_name="confirmed_by_day")

deaths_total = deaths.melt(
        var_name="date",
        value_name="total_deaths")

deaths_daily = deaths_daily.melt(
        var_name="date",
        value_name="deaths_by_day")

Merge data:

In [13]:
confirmed_temp = pd.merge(confirmed_total, confirmed_daily, how='left', on='date')
deaths_temp = pd.merge(deaths_total, deaths_daily, how='left', on='date')
poland_cases = pd.merge(confirmed_temp, deaths_temp, how='left', on='date')
poland_cases.replace(np.nan, 0)
poland_cases.confirmed_by_day = poland_cases.confirmed_by_day.astype(int)
poland_cases.deaths_by_day = poland_cases.deaths_by_day.astype(int)

poland_cases.tail(50)


Unnamed: 0,date,total_cases,confirmed_by_day,total_deaths,deaths_by_day
623,10/6/21,2914962,2086,75774,33
624,10/7/21,2916969,2007,75803,29
625,10/8/21,2918863,1894,75834,31
626,10/9/21,2920874,2011,75864,30
627,10/10/21,2922401,1527,75869,5
628,10/11/21,2923304,903,75869,0
629,10/12/21,2925425,2121,75918,49
630,10/13/21,2928065,2640,75958,40
631,10/14/21,2931064,2999,76018,60
632,10/15/21,2933834,2770,76067,49
