In [1]:
#Import dependencies

import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the two source datasets.
# The paths are built from a single, relative data directory instead of one
# user's absolute home folder, so the notebook can run on any checkout.
# NOTE(review): assumes the notebook is executed from the folder that contains
# "Resources/" (the original absolute paths pointed at ".../Ahmad/Resources/") — confirm.
DATA_DIR = Path("Resources")

vaccination_data_to_load = DATA_DIR / "vaccination-data.csv"
manufacturer_data_to_load = DATA_DIR / "vaccinations-by-manufacturer.csv"

# Read each CSV into its own DataFrame; later cells derive cleaned copies from these.
vaccination_data = pd.read_csv(vaccination_data_to_load)
manufacturer_data = pd.read_csv(manufacturer_data_to_load)

In [3]:
# Preview the first five rows of the raw vaccination dataset
vaccination_data.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,


In [4]:
# Preview the first five rows of the raw manufacturer dataset
manufacturer_data.head(n=5)

Unnamed: 0,location,date,vaccine,total_vaccinations
0,Argentina,2020-12-29,Oxford/AstraZeneca,1
1,Argentina,2020-12-29,Sinopharm/Beijing,1
2,Argentina,2020-12-29,Sputnik V,20493
3,Argentina,2020-12-30,Sputnik V,40595
4,Argentina,2020-12-31,Sputnik V,43401


In [5]:
# Keep only the location/date identifiers and the vaccination columns;
# every other column of the first dataset is dropped.
columns_to_keep = [
    "continent",
    "location",
    "date",
    "total_vaccinations",
    "people_vaccinated",
    "people_fully_vaccinated",
]
new_vaccine_df = vaccination_data.loc[:, columns_to_keep]
new_vaccine_df.head()

Unnamed: 0,continent,location,date,total_vaccinations,people_vaccinated,people_fully_vaccinated
0,Asia,Afghanistan,2020-02-24,,,
1,Asia,Afghanistan,2020-02-25,,,
2,Asia,Afghanistan,2020-02-26,,,
3,Asia,Afghanistan,2020-02-27,,,
4,Asia,Afghanistan,2020-02-28,,,


In [6]:
# Rename the vaccination columns to readable, presentation-ready labels
vaccination_column_names = {
    "total_vaccinations": "Total Vaccinations per Location",
    "people_vaccinated": "People Vaccinated",
    "people_fully_vaccinated": "People Fully Vaccinated",
}
vaccination_cleaned_df = new_vaccine_df.rename(columns=vaccination_column_names)

# Label the manufacturer totals so they are distinguishable from the per-location totals
manufacturer_column_names = {
    "total_vaccinations": "Total Vaccinations per Manufacturer",
}
manufacturer_data_cleaned = manufacturer_data.rename(columns=manufacturer_column_names)

In [7]:
# Check the renamed columns on the first five rows of the vaccination data
vaccination_cleaned_df.head(5)

Unnamed: 0,continent,location,date,Total Vaccinations per Location,People Vaccinated,People Fully Vaccinated
0,Asia,Afghanistan,2020-02-24,,,
1,Asia,Afghanistan,2020-02-25,,,
2,Asia,Afghanistan,2020-02-26,,,
3,Asia,Afghanistan,2020-02-27,,,
4,Asia,Afghanistan,2020-02-28,,,


In [8]:
# Check the renamed column on the first five rows of the manufacturer data
manufacturer_data_cleaned.head(5)

Unnamed: 0,location,date,vaccine,Total Vaccinations per Manufacturer
0,Argentina,2020-12-29,Oxford/AstraZeneca,1
1,Argentina,2020-12-29,Sinopharm/Beijing,1
2,Argentina,2020-12-29,Sputnik V,20493
3,Argentina,2020-12-30,Sputnik V,40595
4,Argentina,2020-12-31,Sputnik V,43401


In [9]:
# List every distinct vaccine in the manufacturer dataset so a teammate
# knows which stock data to collect
vaccine_column = manufacturer_data_cleaned["vaccine"]
unique_manufacturers = vaccine_column.unique()
print(unique_manufacturers)

['Oxford/AstraZeneca' 'Sinopharm/Beijing' 'Sputnik V' 'Pfizer/BioNTech'
 'Moderna' 'CanSino' 'Sputnik Light' 'Johnson&Johnson' 'Novavax'
 'Sanofi/GSK' 'Valneva' 'Medicago' 'Sinovac' 'Covaxin' 'SKYCovione']


In [10]:
# List every distinct location in the manufacturer dataset to work out how to
# group them by continent; grouping by continent is preferred for visualization
# because plotting each location separately would be too crowded
location_column = manufacturer_data_cleaned["location"]
unique_countries = location_column.unique()
unique_countries

array(['Argentina', 'Austria', 'Belgium', 'Bulgaria', 'Canada', 'Chile',
       'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Ecuador', 'Estonia',
       'Finland', 'France', 'Germany', 'Hong Kong', 'Hungary', 'Iceland',
       'Ireland', 'Italy', 'Japan', 'Latvia', 'Liechtenstein',
       'Lithuania', 'Luxembourg', 'Malta', 'Nepal', 'Netherlands',
       'Norway', 'Peru', 'Poland', 'Portugal', 'Romania', 'Slovakia',
       'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sweden',
       'Switzerland', 'Ukraine', 'United States', 'Uruguay',
       'European Union'], dtype=object)

In [11]:
# Group locations in the manufacturer dataset by continent in order to
# facilitate subsequent visualizations.
#
# The mapping is written continent-first (easier to review and extend) and then
# inverted into the location -> continent lookup that Series.map needs.
countries_by_continent = {
    "South America": ["Argentina", "Chile", "Ecuador", "Peru", "Uruguay"],
    "North America": ["Canada", "United States"],
    "Europe": [
        "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia",
        "Denmark", "Estonia", "Finland", "France", "Germany", "Hungary",
        "Iceland", "Ireland", "Italy", "Latvia", "Liechtenstein", "Lithuania",
        "Luxembourg", "Malta", "Netherlands", "Norway", "Poland", "Portugal",
        "Romania", "Slovakia", "Slovenia", "Spain", "Sweden", "Switzerland",
        "Ukraine",
        # NOTE(review): "European Union" looks like an aggregate of member
        # states that are also listed individually; summing by continent may
        # double-count those rows — confirm before aggregating.
        "European Union",
    ],
    "Asia": ["Hong Kong", "Japan", "Nepal", "South Korea"],
    "Africa": ["South Africa"],
}

# Flat location -> continent lookup (same keys, values and order as before)
continents = {
    country: continent
    for continent, countries in countries_by_continent.items()
    for country in countries
}

location_continent = manufacturer_data_cleaned["location"].map(continents)

# Series.map yields NaN for any location missing from the dict, which would
# silently drop those rows from continent-level group-bys. Fail loudly instead
# so a refreshed dataset with new locations is noticed immediately.
unmapped_locations = manufacturer_data_cleaned.loc[location_continent.isna(), "location"].unique()
assert len(unmapped_locations) == 0, (
    f"Locations without a continent mapping: {list(unmapped_locations)}"
)

manufacturer_data_cleaned["continent"] = location_continent
manufacturer_data_cleaned.head()

Unnamed: 0,location,date,vaccine,Total Vaccinations per Manufacturer,continent
0,Argentina,2020-12-29,Oxford/AstraZeneca,1,South America
1,Argentina,2020-12-29,Sinopharm/Beijing,1,South America
2,Argentina,2020-12-29,Sputnik V,20493,South America
3,Argentina,2020-12-30,Sputnik V,40595,South America
4,Argentina,2020-12-31,Sputnik V,43401,South America


In [13]:
# Count the distinct locations present in the trimmed vaccination dataset
location_names = new_vaccine_df["location"]
ok = location_names.nunique()
ok


230