In [1]:
# Import neccasary libraries
import numpy as np
import pandas as pd 
import os
import urllib
# %matplotlib inline # Allows jyupter notebook to render (not used)



In [2]:
# Get the url
url = "https://covid19.who.int/WHO-COVID-19-global-data.csv"
# Store new path for download in folder data > covid
file_path = os.path.join("data", "covid")

In [3]:
# Make the directory using the file_path variable
os.makedirs(file_path, exist_ok=True)
csv_path = os.path.join(file_path, "WHO-COVID-19-global-data.csv")
urllib.request.urlretrieve(url, csv_path)   # save content from url to the path

('data\\covid\\WHO-COVID-19-global-data.csv',
 <http.client.HTTPMessage at 0x1871f7bf460>)

In [4]:
# Make our data frame
df = pd.read_csv(csv_path)


In [5]:
df.shape

(112812, 8)

In [6]:
df.head()

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
0,2020-01-03,AF,Afghanistan,EMRO,0,0,0,0
1,2020-01-04,AF,Afghanistan,EMRO,0,0,0,0
2,2020-01-05,AF,Afghanistan,EMRO,0,0,0,0
3,2020-01-06,AF,Afghanistan,EMRO,0,0,0,0
4,2020-01-07,AF,Afghanistan,EMRO,0,0,0,0


In [7]:
df.describe()

Unnamed: 0,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
count,112812.0,112812.0,112812.0,112812.0
mean,1271.546245,179103.4,27.051519,4404.729204
std,7952.6709,1167906.0,151.767327,24056.899783
min,-32952.0,0.0,-514.0,0.0
25%,0.0,13.0,0.0,0.0
50%,5.0,1633.0,0.0,27.0
75%,247.0,31408.25,4.0,556.0
max,402270.0,31466880.0,6409.0,563224.0


In [8]:
# Go through all columns and get rid of whitespaces in column names
df.columns = [col.strip() for col in df.columns]
df.columns

Index(['Date_reported', 'Country_code', 'Country', 'WHO_region', 'New_cases',
       'Cumulative_cases', 'New_deaths', 'Cumulative_deaths'],
      dtype='object')

In [9]:
df.loc[1:4, "Country"]  # Get entries 1-4 from country column

1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: Country, dtype: object

In [10]:
df.loc[1:8,["Country", "New_cases", "New_deaths"]]  # Get 3 columns at a time

Unnamed: 0,Country,New_cases,New_deaths
1,Afghanistan,0,0
2,Afghanistan,0,0
3,Afghanistan,0,0
4,Afghanistan,0,0
5,Afghanistan,0,0
6,Afghanistan,0,0
7,Afghanistan,0,0
8,Afghanistan,0,0


In [11]:
df[df.Country == "United States of America"]  # Get only USA data

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
107576,2020-01-03,US,United States of America,AMRO,0,0,0,0
107577,2020-01-04,US,United States of America,AMRO,0,0,0,0
107578,2020-01-05,US,United States of America,AMRO,0,0,0,0
107579,2020-01-06,US,United States of America,AMRO,0,0,0,0
107580,2020-01-07,US,United States of America,AMRO,0,0,0,0
...,...,...,...,...,...,...,...,...
108047,2021-04-18,US,United States of America,AMRO,73697,31250635,911,560858
108048,2021-04-19,US,United States of America,AMRO,61306,31311941,758,561616
108049,2021-04-20,US,United States of America,AMRO,38084,31350025,305,561921
108050,2021-04-21,US,United States of America,AMRO,57164,31407189,534,562455


In [12]:
# New deaths greater then or equal 1000, and show only 2 specific columns
df.loc[df.New_deaths >= 1000, ["New_deaths", "Country"]]

Unnamed: 0,New_deaths,Country
4082,3351,Argentina
11673,1610,Bolivia (Plurinational State of)
13467,1179,Brazil
13469,1188,Brazil
13470,1001,Brazil
...,...,...
108024,1293,United States of America
108025,1403,United States of America
108026,1280,United States of America
108031,1055,United States of America


In [13]:
# Chain conditions
df.loc[(df.New_deaths > 1000) & (df.Country_code == 'US'), ["Date_reported","Country","New_cases","New_deaths"]]

Unnamed: 0,Date_reported,Country,New_cases,New_deaths
107668,2020-04-04,United States of America,28103,1061
107669,2020-04-05,United States of America,32105,1166
107670,2020-04-06,United States of America,33510,1338
107671,2020-04-07,United States of America,26493,1201
107672,2020-04-08,United States of America,29510,1286
...,...,...,...,...
108024,2021-03-26,United States of America,65326,1293
108025,2021-03-27,United States of America,69589,1403
108026,2021-03-28,United States of America,71187,1280
108031,2021-04-02,United States of America,68409,1055


In [14]:
df.loc[df.Country_code == "US", ["New_cases"]].sum()

New_cases    31466876
dtype: int64

In [15]:
df[df.New_deaths < 0]  # Shows invalid data, good for data exploration

Unnamed: 0,Date_reported,Country_code,Country,WHO_region,New_cases,Cumulative_cases,New_deaths,Cumulative_deaths
1166,2020-08-04,DZ,Algeria,AFRO,507,31972,-2,1229
2656,2020-10-05,AO,Angola,AFRO,0,5370,-4,185
6904,2020-08-30,BS,Bahamas,AMRO,37,2057,-10,40
7121,2021-04-04,BS,Bahamas,AMRO,35,9234,-1,188
15424,2020-07-13,BF,Burkina Faso,AFRO,13,1033,-1,53
21578,2020-06-09,CG,Congo,AFRO,0,683,-2,20
24025,2020-08-15,CU,Cuba,AMRO,55,3229,-1,88
25412,2020-07-05,CZ,Czechia,EURO,121,12440,-1,351
25413,2020-07-06,CZ,Czechia,EURO,75,12515,-3,348
30826,2020-12-30,ER,Eritrea,AFRO,181,1220,-1,1


In [16]:
# Create new column to find percentage of cases today as a percentage of total
df["pct_cases"] = (df["New_cases"] / df["Cumulative_cases"]) * 100