Introduction (What we want to do and what data we are looking at):
A novel corona virus (COVID 19) was identified in 2019 in Wuhan China. It has spread rapidly worldwide and was officially declared to be a pandemic by the WHO. To better understand the data available about it, we will be doing exploratory data analysis of the available COVID 19 data. The goal of the project is to study the impact of COVID 19 across the world using Python, Pandas and Matplotlib and present visualizations to show our analysis.

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json
import time
from scipy.stats import linregress
from datetime import datetime, timedelta
import plotly
import plotly.express as px
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.io as pio
pio.renderers.default = "notebook_connected"


In [3]:
# Loading the data
file = "Data/owid-covid-data.csv"

# Reading the data into a pandas dataframe
data = pd.read_csv(file)

In [4]:
# Get all the columns in the dataframe to look for the data needed
data.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [5]:
# Change the date format to datetime
#data['date'] = pd.to_datetime(data['date'],errors='ignore')

# Trend of Covid related total deaths by country - Vasanta

In [6]:
# Create a new dataframe with only the columns we want
covid_df = data[['iso_code','continent','location','date','people_fully_vaccinated','people_fully_vaccinated_per_hundred','total_deaths', 'new_deaths']]
covid_df

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths
0,AFG,Asia,Afghanistan,2020-02-24,,,,
1,AFG,Asia,Afghanistan,2020-02-25,,,,
2,AFG,Asia,Afghanistan,2020-02-26,,,,
3,AFG,Asia,Afghanistan,2020-02-27,,,,
4,AFG,Asia,Afghanistan,2020-02-28,,,,
...,...,...,...,...,...,...,...,...
181458,ZWE,Africa,Zimbabwe,2022-04-20,,,5466.0,2.0
181459,ZWE,Africa,Zimbabwe,2022-04-21,3624003.0,24.01,5467.0,1.0
181460,ZWE,Africa,Zimbabwe,2022-04-22,3630122.0,24.05,5468.0,1.0
181461,ZWE,Africa,Zimbabwe,2022-04-23,3636597.0,24.10,5468.0,0.0


## Clean the data

In [7]:
# Drop the rows from continents rows with no data
covid_df_continent_clean = covid_df.dropna(subset=['continent'])
covid_df_continent_clean

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths
0,AFG,Asia,Afghanistan,2020-02-24,,,,
1,AFG,Asia,Afghanistan,2020-02-25,,,,
2,AFG,Asia,Afghanistan,2020-02-26,,,,
3,AFG,Asia,Afghanistan,2020-02-27,,,,
4,AFG,Asia,Afghanistan,2020-02-28,,,,
...,...,...,...,...,...,...,...,...
181458,ZWE,Africa,Zimbabwe,2022-04-20,,,5466.0,2.0
181459,ZWE,Africa,Zimbabwe,2022-04-21,3624003.0,24.01,5467.0,1.0
181460,ZWE,Africa,Zimbabwe,2022-04-22,3630122.0,24.05,5468.0,1.0
181461,ZWE,Africa,Zimbabwe,2022-04-23,3636597.0,24.10,5468.0,0.0


### Cleaned data in clean covid dataframe

In [8]:
# Fill the missing values in the columns with 0
clean_covid_df = covid_df_continent_clean.fillna(value=0)
clean_covid_df

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths
0,AFG,Asia,Afghanistan,2020-02-24,0.0,0.00,0.0,0.0
1,AFG,Asia,Afghanistan,2020-02-25,0.0,0.00,0.0,0.0
2,AFG,Asia,Afghanistan,2020-02-26,0.0,0.00,0.0,0.0
3,AFG,Asia,Afghanistan,2020-02-27,0.0,0.00,0.0,0.0
4,AFG,Asia,Afghanistan,2020-02-28,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...
181458,ZWE,Africa,Zimbabwe,2022-04-20,0.0,0.00,5466.0,2.0
181459,ZWE,Africa,Zimbabwe,2022-04-21,3624003.0,24.01,5467.0,1.0
181460,ZWE,Africa,Zimbabwe,2022-04-22,3630122.0,24.05,5468.0,1.0
181461,ZWE,Africa,Zimbabwe,2022-04-23,3636597.0,24.10,5468.0,0.0


In [9]:
# Add a column for the max of total deaths to clean covid dataframe as it is cumulative
clean_covid_df['max_total_deaths'] = clean_covid_df.groupby('location')['total_deaths'].transform('max')
sorted_max_10deaths_df = clean_covid_df.sort_values(by='max_total_deaths', ascending=False).drop_duplicates(subset='max_total_deaths').nlargest(10, 'max_total_deaths')
sorted_max_50deaths_df = clean_covid_df.sort_values(by='max_total_deaths', ascending=False).drop_duplicates(subset='max_total_deaths').nlargest(50, 'max_total_deaths')
#sorted_max_deaths_df.reset_index(inplace=True)
#sorted_max_deaths_df1
sorted_max_10deaths_df

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths,max_total_deaths
171362,USA,North America,United States,2021-03-10,39781670.0,11.98,527025.0,1540.0,991254.0
23320,BRA,South America,Brazil,2021-09-14,75814267.0,35.43,588078.0,678.0,662891.0
75228,IND,Asia,India,2020-02-11,0.0,0.0,0.0,0.0,522223.0
134818,RUS,Europe,Russia,2020-02-19,0.0,0.0,0.0,0.0,367366.0
106282,MEX,North America,Mexico,2021-07-25,23955609.0,18.39,238424.0,108.0,324129.0
129721,PER,South America,Peru,2021-08-15,7239915.0,21.7,197393.0,53.0,212742.0
170208,GBR,Europe,United Kingdom,2020-04-14,0.0,0.0,14128.0,1077.0,173518.0
81635,ITA,Europe,Italy,2020-04-17,0.0,0.0,22745.0,575.0,162688.0
76566,IDN,Asia,Indonesia,2021-08-18,29403345.0,10.64,121141.0,1128.0,156100.0
57731,FRA,Europe,France,2020-08-08,0.0,0.0,30329.0,1.0,145129.0


In [10]:
#scatter plot of Top 50 countries with the highest number of deaths till that date
fig = px.scatter(sorted_max_50deaths_df, x='date',y='max_total_deaths', color='location',title='Total Deaths')
fig.show()

In [11]:
#scatter plot of Top 50 countries with the highest number of deaths
fig = px.scatter(sorted_max_50deaths_df, x='location',y='max_total_deaths', color='location',title='Top 50 countries with the highest number of deaths')
fig.show()

In [12]:
# pie chart showing a closer look of the top 10 countries with the highest number of deaths  
fig = px.pie(sorted_max_10deaths_df, values='max_total_deaths', names='location',labels='location', title='Top 10 Countries with the Most Deaths',opacity=0.9)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# Progress of fully vaccinated people by country -Vasanta

In [13]:
# Top 10 fully vaccinated countries
sorted_max_10vaccinated_df = clean_covid_df.sort_values(by='people_fully_vaccinated', ascending=False).drop_duplicates(subset='location').nlargest(10, 'people_fully_vaccinated')
#sorted_max_10vacci_df = clean_covid_df.groupby('location')
#sorted_max_10vacci_df = sorted_max_10vacci_df.sort_values(ascending=False).drop_duplicates(subset='people_fully_vaccinated').nlargest(10, 'people_fully_vaccinated')
##sorted_max_10vacci_df = clean_covid_df.sort_values(by='total_fully_vaccinated', ascending=False).drop_duplicates(subset='total_fully_vaccinated').nlargest(10, 'total_fully_vaccinated')
#sorted_max_50vacci_df = clean_covid_df.sort_values(by='total_fully_vaccinated', ascending=False).drop_duplicates(subset='total_fully_vaccinated').nlargest(50, 'total_fully_vaccinated')
#sorted_max_10vaccinated_df.drop(columns=['iso_code','total_deaths','new_deaths','total_fully_vaccinated','max_total_deaths'], inplace=True)
sorted_max_10vaccinated_df

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths,max_total_deaths
34495,CHN,Asia,China,2022-04-18,1246769000.0,86.33,4648.0,10.0,4725.0
76031,IND,Asia,India,2022-04-24,850179600.0,61.01,522223.0,30.0,522223.0
171771,USA,North America,United States,2022-04-23,219338300.0,66.06,991231.0,62.0,991254.0
76815,IDN,Asia,Indonesia,2022-04-24,163954000.0,59.33,156100.0,33.0,156100.0
23542,BRA,South America,Brazil,2022-04-24,163322600.0,76.32,662891.0,36.0,662891.0
125843,PAK,Asia,Pakistan,2022-04-23,121234300.0,53.83,30369.0,0.0,30369.0
14237,BGD,Asia,Bangladesh,2022-04-20,115817400.0,69.64,29127.0,1.0,29127.0
83971,JPN,Asia,Japan,2022-04-24,101354900.0,80.41,29305.0,15.0,29305.0
106548,MEX,North America,Mexico,2022-04-17,79945000.0,61.37,323944.0,6.0,324129.0
177772,VNM,Asia,Vietnam,2022-03-22,77754110.0,79.2,42014.0,65.0,43013.0


In [14]:
# find sorted top 10 countries with the highest number of fully vaccinated people
top_10_vacci = sorted_max_10vaccinated_df['location'].tolist()
top_10_vacci_df = clean_covid_df[clean_covid_df['location'].isin(top_10_vacci)]
top_10_vacci_df

Unnamed: 0,iso_code,continent,location,date,people_fully_vaccinated,people_fully_vaccinated_per_hundred,total_deaths,new_deaths,max_total_deaths
13464,BGD,Asia,Bangladesh,2020-03-08,0.0,0.0,0.0,0.0,29127.0
13465,BGD,Asia,Bangladesh,2020-03-09,0.0,0.0,0.0,0.0,29127.0
13466,BGD,Asia,Bangladesh,2020-03-10,0.0,0.0,0.0,0.0,29127.0
13467,BGD,Asia,Bangladesh,2020-03-11,0.0,0.0,0.0,0.0,29127.0
13468,BGD,Asia,Bangladesh,2020-03-12,0.0,0.0,0.0,0.0,29127.0
...,...,...,...,...,...,...,...,...,...
177801,VNM,Asia,Vietnam,2022-04-20,0.0,0.0,42982.0,7.0,43013.0
177802,VNM,Asia,Vietnam,2022-04-21,0.0,0.0,42991.0,9.0,43013.0
177803,VNM,Asia,Vietnam,2022-04-22,0.0,0.0,42998.0,7.0,43013.0
177804,VNM,Asia,Vietnam,2022-04-23,0.0,0.0,43004.0,6.0,43013.0


In [15]:
# line plot of Top 10 countries with the highest number of fully vaccinated people
fig = px.line(top_10_vacci_df, x='date',y='people_fully_vaccinated_per_hundred', color='location',title='Top 10 Countries with the Most Fully Vaccinated')
fig.show()


In [16]:
fig = px.choropleth(top_10_vacci_df, locations='location',locationmode='country names', color='people_fully_vaccinated_per_hundred',
                           color_continuous_scale="Purpor"
                          )
fig.update_layout(title="Top 10 Countries with the Most Fully Vaccinated",
                  titlefont={'size': 20},
                  paper_bgcolor='aliceblue'        
                  )
fig.show()

In [17]:
fig = px.choropleth(clean_covid_df, locations='location',locationmode='country names', hover_name='location', color='people_fully_vaccinated_per_hundred', animation_frame='date',
                           color_continuous_scale="Purpor",projection="orthographic"
                          )
fig.update_layout(title="Top 10 Countries with the most fully vaccinated people",
                  titlefont={'size': 20},
                  paper_bgcolor='aliceblue'        
                  )
fig.show()