## In this notebook I will gather and prepare the datasets needed to build the Covid-19 Dashboard

In [1]:
import pandas as pd


from create_new_directory import new_folder
from get_dataset import get_dataset
from reshape_dataset import reshape

## Gathering Data

#### Collect the daily covid-19 confirmed cases dataset

In [2]:
# Create the folder that holds the downloaded CSV files inside the current
# working directory. Per the original note, new_folder only creates it when it
# does not already exist (the helper lives in create_new_directory, not shown here).

folder_name = 'dataset'
new_folder(folder_name)

In [3]:
# Daily confirmed COVID-19 cases (global time series), taken from
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
file_name = 'covid19_confirmed_cases.csv'

# get_dataset takes the source URL and the file name; presumably it stores the
# download as dataset/<file_name>, which is the path read back below.
data = get_dataset(URL, file_name)

# Load the local copy of the CSV into a DataFrame.
confirmed_df = pd.read_csv(f'dataset/{file_name}')


In [4]:
# Preview the first rows of the wide-format table (one column per reporting date).
confirmed_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/15/21,9/16/21,9/17/21,9/18/21,9/19/21,9/20/21,9/21/21,9/22/21,9/23/21,9/24/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,154283,154361,154487,154487,154487,154585,154712,154757,154800,154960
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,159423,160365,161324,162173,162953,163404,164276,165096,165864,166690
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,200770,200989,201224,201425,201600,201766,201948,202122,202283,202449
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,15108,15113,15124,15124,15124,15140,15140,15153,15156,15167
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,51407,51827,52208,52307,52307,52644,52968,53387,53840,54280


#### Collect the daily covid-19 death cases dataset

In [5]:
# Download the daily COVID-19 death counts (global time series) from
# https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

# Use the raw.githubusercontent.com host, same as the confirmed-cases download,
# instead of the github.com/.../raw/... form, which only works through a redirect.
URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
file_name = 'covid19_death_cases.csv'

# get_dataset takes the source URL and the file name; presumably it stores the
# download as dataset/<file_name>, which is the path read back below.
data = get_dataset(URL, file_name)
death_df = pd.read_csv(f'dataset/{file_name}')

# Preview the first rows of the wide-format deaths table.
death_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/15/21,9/16/21,9/17/21,9/18/21,9/19/21,9/20/21,9/21/21,9/22/21,9/23/21,9/24/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7174,7183,7186,7186,7186,7199,7199,7199,7199,7199
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2557,2563,2569,2574,2580,2587,2594,2601,2609,2619
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,5630,5651,5670,5681,5694,5709,5725,5739,5748,5758
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,130,130,130,130,130,130,130,130,130,130
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1360,1371,1378,1388,1388,1409,1414,1434,1460,1471


## Prepare the three datasets to be in an appropriate format and shape

In [6]:
# Reshape the confirmed-cases table from wide to long: the first four columns
# are kept as identifiers, and every remaining column is unpivoted into a
# 'date' / 'confirmed' pair of columns.
confirmed = confirmed_df.melt(
    id_vars=confirmed_df.columns[:4],
    value_vars=confirmed_df.columns[4:],
    var_name='date',
    value_name='confirmed',
)

# Show (rows, columns) of the long-format result.
confirmed.shape

(170748, 6)

In [7]:
# Reshape the deaths table from wide to long with the project's reshape helper.
# 'date' and 'deaths' are presumably the names of the resulting variable and
# value columns (mirroring the melt of the confirmed table) — confirm against
# reshape_dataset.reshape.
# Fix: the comma after death_df was missing, which made this cell a SyntaxError.
death = reshape(death_df,
                'date',
                'deaths')
death