## In this notebook I will gather and prepare the datasets needed to be used to build the Covid-19 Dashboard

In [1]:
import pandas as pd


from create_new_directory import new_folder
from get_dataset import get_dataset
from reshape_dataset import reshape

## Gathering Data

#### Collect the daily covid-19 confirmed cases dataset

In [2]:
# Create a new directory in the current running directory if it does not exist 

folder_name = 'dataset'
new_folder(folder_name)

In [3]:
# Download the confirmed covid-19 cases from its source https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
file_name = 'covid19_confirmed_cases.csv'

# The functoin of get_dataset takes url and file name
data = get_dataset(URL,file_name)
confirmed_df = pd.read_csv('dataset'+ '/'+file_name)


In [4]:
confirmed_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/15/21,9/16/21,9/17/21,9/18/21,9/19/21,9/20/21,9/21/21,9/22/21,9/23/21,9/24/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,154283,154361,154487,154487,154487,154585,154712,154757,154800,154960
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,159423,160365,161324,162173,162953,163404,164276,165096,165864,166690
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,200770,200989,201224,201425,201600,201766,201948,202122,202283,202449
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,15108,15113,15124,15124,15124,15140,15140,15153,15156,15167
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,51407,51827,52208,52307,52307,52644,52968,53387,53840,54280


#### Collect the daily covid-19 death cases dataset

In [12]:
# Download the confirmed covid-19 cases from its source https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

URL = 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
file_name = 'covid19_death_cases.csv'

# The functoin of get_dataset takes url and file name
data = get_dataset(URL,file_name)
death_df = pd.read_csv('dataset'+ '/'+file_name)

death_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/15/21,9/16/21,9/17/21,9/18/21,9/19/21,9/20/21,9/21/21,9/22/21,9/23/21,9/24/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7174,7183,7186,7186,7186,7199,7199,7199,7199,7199
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2557,2563,2569,2574,2580,2587,2594,2601,2609,2619
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,5630,5651,5670,5681,5694,5709,5725,5739,5748,5758
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,130,130,130,130,130,130,130,130,130,130
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1360,1371,1378,1388,1388,1409,1414,1434,1460,1471


#### Collect the data of the daily recovery from covid19 

In [13]:
URL = 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
file_name = 'covid19_recovered_cases.csv'

# The functoin of get_dataset takes url and file name
data = get_dataset(URL,file_name)
recovery_df = pd.read_csv('dataset'+ '/'+file_name)

recovery_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/15/21,9/16/21,9/17/21,9/18/21,9/19/21,9/20/21,9/21/21,9/22/21,9/23/21,9/24/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Prepare the three datasets to be in an appreciate format and shape

In [15]:
confirmed = reshape(confirmed_df,
               'date',
               'deaths')
confirmed.shape

(170748, 6)

In [16]:
confirmed.sample(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,deaths
71357,,Philippines,12.879721,121.774017,10/3/20,319330
107402,Montserrat,United Kingdom,16.742498,-62.187366,2/9/21,18
24830,,Zimbabwe,-19.015438,29.154857,4/19/20,25
75078,,Bolivia,-16.2902,-63.5887,10/17/20,139710
77562,,Afghanistan,33.93911,67.709953,10/26/20,40833


In [17]:
death = reshape(death_df,
               'date',
               'deaths')
death.shape

(170748, 6)

In [18]:
death.sample(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,deaths
137206,,Russia,61.52401,105.318756,5/27/21,117990
85882,,Sierra Leone,8.460555,-11.779889,11/24/20,74
112311,,Italy,41.87194,12.56738,2/27/21,97507
34078,British Columbia,Canada,53.7267,-127.6476,5/23/20,157
40525,Hong Kong,China,22.3,114.2,6/15/20,4


In [19]:
recovery = reshape(recovery_df,
               'date',
               'deaths')
recovery.shape

(161568, 6)

In [20]:
recovery.sample(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,deaths
75902,,Iran,32.427908,53.688046,11/4/20,500400
154435,,Vietnam,14.058324,108.277199,8/28/21,0
140554,Guadeloupe,France,16.265,-61.551,7/7/21,2250
154126,,Sierra Leone,8.460555,-11.779889,8/27/21,0
82539,,Monaco,43.7333,7.4167,11/29/20,537
