# pandemic

This is a simple Jupyter Notebook for Covid-19 Pandemic Analysis.

Load packages etc...

In [1]:
import pandas as pd
import numpy as np

Download data from `CSSEGISandData`:

In [2]:
from urllib import request
confirmed_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
confirmed_file = './data/raw/confirmed.csv'
request.urlretrieve(confirmed_url, confirmed_file)
deaths_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
deaths_file = './data/raw/deaths.csv'
request.urlretrieve(deaths_url, deaths_file)

('./data/raw/deaths.csv', <http.client.HTTPMessage at 0x193793dd3a0>)

Load data into dataframes:

In [3]:
confirmed = pd.read_csv(confirmed_file, index_col=False)
deaths = pd.read_csv(deaths_file, index_col=False)

Show some info about data:

In [4]:
confirmed.info()
confirmed.describe()

deaths.info()
deaths.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Columns: 773 entries, Province/State to 2/28/22
dtypes: float64(2), int64(769), object(2)
memory usage: 1.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Columns: 773 entries, Province/State to 2/28/22
dtypes: float64(2), int64(769), object(2)
memory usage: 1.7+ MB


Unnamed: 0,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
count,282.0,282.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,...,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,20.106368,21.958718,0.059859,0.06338,0.091549,0.147887,0.197183,0.288732,0.461268,0.46831,...,20715.598592,20734.211268,20746.489437,20795.71831,20836.961268,20874.894366,20908.105634,20930.71831,20946.295775,20973.623239
std,25.841453,75.893366,1.008764,1.010299,1.426181,2.374622,3.087051,4.510813,7.417526,7.418036,...,82527.302768,82576.926519,82626.01719,82783.433092,82965.661312,83142.820644,83297.113893,83375.43663,83409.570745,83522.449513
min,-71.9499,-178.1165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.643279,-22.03655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,35.0,35.25,35.25,35.25,35.25,35.25,35.25,35.25,35.5,35.5
50%,21.607878,20.921188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,788.5,788.5,788.5,788.5,788.5,788.5,789.0,789.0,789.0,789.5
75%,40.950592,84.992575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7462.0,7472.25,7518.25,7578.5,7602.75,7629.25,7658.0,7674.75,7694.5,7707.75
max,71.7069,178.065,17.0,17.0,24.0,40.0,52.0,76.0,125.0,125.0,...,935725.0,936109.0,936957.0,939064.0,941889.0,944830.0,947417.0,948215.0,948397.0,950490.0


Show some top data about `Poland`:

In [5]:
confirmed[confirmed['Country/Region'] == 'Poland'].head(10)
deaths[deaths['Country/Region'] == 'Poland'].head(10)


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
214,,Poland,51.9194,19.1451,0,0,0,0,0,0,...,109792,109817,109833,110157,110517,110858,111056,111277,111316,111317


Clean up data:

In [6]:
confirmed = confirmed[confirmed['Country/Region'] == 'Poland']
confirmed = confirmed.drop(columns=['Province/State','Lat','Long', 'Country/Region'])
deaths = deaths[deaths['Country/Region'] == 'Poland']
deaths = deaths.drop(columns=['Province/State','Lat','Long', 'Country/Region'])

Calculate differences:

In [7]:
confirmed_daily = confirmed.diff(periods=1, axis=1)
confirmed_daily = confirmed_daily.replace(np.nan, 0)
deaths_daily = deaths.diff(periods=1, axis=1)
deaths_daily = deaths_daily.replace(np.nan, 0)


Unpivot data:

In [8]:
confirmed_total = confirmed.melt(
        var_name="date",
        value_name="total_cases")

confirmed_daily = confirmed_daily.melt(
        var_name="date",
        value_name="confirmed_by_day")

deaths_total = deaths.melt(
        var_name="date",
        value_name="total_deaths")

deaths_daily = deaths_daily.melt(
        var_name="date",
        value_name="deaths_by_day")

Merge data:

In [9]:
confirmed_temp = pd.merge(confirmed_total, confirmed_daily, how='left', on='date')
deaths_temp = pd.merge(deaths_total, deaths_daily, how='left', on='date')
poland_cases = pd.merge(confirmed_temp, deaths_temp, how='left', on='date')
poland_cases.replace(np.nan, 0)
poland_cases.confirmed_by_day = poland_cases.confirmed_by_day.astype(int)
poland_cases.deaths_by_day = poland_cases.deaths_by_day.astype(int)

#poland_cases.tail(50)

poland_cases_filtered = poland_cases[['date','confirmed_by_day','deaths_by_day']]
poland_cases_filtered.tail(100)


Unnamed: 0,date,confirmed_by_day,deaths_by_day
669,11/21/21,18924,41
670,11/22/21,12375,8
671,11/23/21,19935,398
672,11/24/21,28431,460
673,11/25/21,28143,498
...,...,...,...
764,2/24/22,18266,341
765,2/25/22,16700,198
766,2/26/22,13950,221
767,2/27/22,8897,39
