# **Analyzing COVID-19 data from January 2020 - December 2021**
[data citation](https://www.kaggle.com/georgesaavedra/covid19-dataset)

In [147]:
import pandas as pd
import numpy as np
import datetime
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [148]:
# Load the COVID-19 CSV (`owid-covid-data.csv`, expected next to this notebook) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./owid-covid-data.csv')

In [149]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [150]:
# Bare expression: the notebook renders the frame's truncated repr (first and last rows, elided middle).
data

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164249,ZWE,Africa,Zimbabwe,2022-02-20,233352.0,128.0,281.571,5386.0,0.0,1.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
164250,ZWE,Africa,Zimbabwe,2022-02-21,233571.0,219.0,281.143,5386.0,0.0,1.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
164251,ZWE,Africa,Zimbabwe,2022-02-22,233980.0,409.0,339.571,5388.0,2.0,2.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
164252,ZWE,Africa,Zimbabwe,2022-02-23,234589.0,609.0,339.429,5388.0,0.0,1.286,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


#### **Trim the columns to just those of interest**

In [151]:
# Show every column name as a plain Python list so the ones of interest can be picked out.
data.columns.tolist()

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'total_boosters',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'peo

In [152]:
# Narrow the frame to the columns of interest, grouped here by theme for readability.
dataTest = data[[
    # identifiers
    'continent', 'location', 'date',
    # case and death counts
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    # per-location indicators
    'hospital_beds_per_thousand', 'life_expectancy', 'gdp_per_capita',
    'reproduction_rate', 'extreme_poverty', 'population', 'median_age',
    # vaccination counts
    'people_vaccinated', 'people_fully_vaccinated',
    # health indicators
    'cardiovasc_death_rate', 'diabetes_prevalence',
]]

In [153]:
# Preview the first five rows of the trimmed frame.
dataTest.head(n=5)

Unnamed: 0,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,hospital_beds_per_thousand,life_expectancy,gdp_per_capita,reproduction_rate,extreme_poverty,population,median_age,people_vaccinated,people_fully_vaccinated,cardiovasc_death_rate,diabetes_prevalence
0,Asia,Afghanistan,2020-02-24,5.0,5.0,,,0.5,64.83,1803.987,,,39835428.0,18.6,,,597.029,9.59
1,Asia,Afghanistan,2020-02-25,5.0,0.0,,,0.5,64.83,1803.987,,,39835428.0,18.6,,,597.029,9.59
2,Asia,Afghanistan,2020-02-26,5.0,0.0,,,0.5,64.83,1803.987,,,39835428.0,18.6,,,597.029,9.59
3,Asia,Afghanistan,2020-02-27,5.0,0.0,,,0.5,64.83,1803.987,,,39835428.0,18.6,,,597.029,9.59
4,Asia,Afghanistan,2020-02-28,5.0,0.0,,,0.5,64.83,1803.987,,,39835428.0,18.6,,,597.029,9.59


#### **Find the all unique dates**

In [154]:
# List each distinct value of the 'date' column, in order of first appearance.
pd.unique(dataTest['date'])

array(['2020-02-24', '2020-02-25', '2020-02-26', '2020-02-27',
       '2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02',
       '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06',
       '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10',
       '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14',
       '2020-03-15', '2020-03-16', '2020-03-17', '2020-03-18',
       '2020-03-19', '2020-03-20', '2020-03-21', '2020-03-22',
       '2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26',
       '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30',
       '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03',
       '2020-04-04', '2020-04-05', '2020-04-06', '2020-04-07',
       '2020-04-08', '2020-04-09', '2020-04-10', '2020-04-11',
       '2020-04-12', '2020-04-13', '2020-04-14', '2020-04-15',
       '2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19',
       '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23',
       '2020-04-24', '2020-04-25', '2020-04-26', '2020-

In [155]:
# Summarise the 'date' column: row count, number of distinct dates, and the most frequent date.
dataTest.loc[:, 'date'].describe()

count         164254
unique           786
top       2021-08-27
freq             238
Name: date, dtype: object

#### **Split up the date column**

In [156]:
# Break each 'YYYY-MM-DD' string on its hyphens; expand=True returns one column per piece
# (labelled 0, 1, 2). The pieces are still strings, not integers.
df = dataTest['date'].str.split(pat='-', expand=True)

In [157]:
# Preview the first five rows of the split date pieces.
df.head(n=5)

Unnamed: 0,0,1,2
0,2020,2,24
1,2020,2,25
2,2020,2,26
3,2020,2,27
4,2020,2,28


#### **Rename the dataframe columns**

In [158]:
# Give the positional column labels produced by the split meaningful names.
df1 = df.rename(
    {
        0: 'year',
        1: 'month',
        2: 'day',
    },
    axis='columns',
)

In [159]:
# Preview the first five rows to confirm the new column names.
df1.head(n=5)

Unnamed: 0,year,month,day
0,2020,2,24
1,2020,2,25
2,2020,2,26
3,2020,2,27
4,2020,2,28


#### **Bring back into the original dataset**

In [160]:
# Place the year/month/day columns alongside the trimmed frame, aligning rows on the index.
MDYdf = pd.concat(objs=[dataTest, df1], axis='columns')

In [161]:
# Preview the first five rows of the combined frame.
MDYdf.head(n=5)

Unnamed: 0,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,hospital_beds_per_thousand,life_expectancy,gdp_per_capita,...,extreme_poverty,population,median_age,people_vaccinated,people_fully_vaccinated,cardiovasc_death_rate,diabetes_prevalence,year,month,day
0,Asia,Afghanistan,2020-02-24,5.0,5.0,,,0.5,64.83,1803.987,...,,39835428.0,18.6,,,597.029,9.59,2020,2,24
1,Asia,Afghanistan,2020-02-25,5.0,0.0,,,0.5,64.83,1803.987,...,,39835428.0,18.6,,,597.029,9.59,2020,2,25
2,Asia,Afghanistan,2020-02-26,5.0,0.0,,,0.5,64.83,1803.987,...,,39835428.0,18.6,,,597.029,9.59,2020,2,26
3,Asia,Afghanistan,2020-02-27,5.0,0.0,,,0.5,64.83,1803.987,...,,39835428.0,18.6,,,597.029,9.59,2020,2,27
4,Asia,Afghanistan,2020-02-28,5.0,0.0,,,0.5,64.83,1803.987,...,,39835428.0,18.6,,,597.029,9.59,2020,2,28


#### **Group by the geographic area**

In [162]:
# Collapse the daily rows to one row per location, taking the median of each numeric column.
# numeric_only=True is spelled out because the frame also holds string columns
# (continent, date, year, month, day): pandas >= 2.0 raises TypeError on those instead of
# silently dropping them as older releases did.
MDYdf1 = MDYdf.groupby('location').median(numeric_only=True)

In [163]:
# Preview the first five locations of the per-location medians.
MDYdf1.head(n=5)

Unnamed: 0_level_0,total_cases,new_cases,total_deaths,new_deaths,hospital_beds_per_thousand,life_expectancy,gdp_per_capita,reproduction_rate,extreme_poverty,population,median_age,people_vaccinated,people_fully_vaccinated,cardiovasc_death_rate,diabetes_prevalence
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Afghanistan,55655.0,77.0,2451.0,4.0,0.5,64.83,1803.987,1.02,,39835430.0,18.6,735213.0,324951.5,597.029,9.59
Africa,3804761.0,11302.0,104384.0,274.0,,,,,,1373486000.0,,47272050.0,28638137.0,,
Albania,109248.5,183.5,1866.0,4.0,2.89,78.57,11803.431,1.04,1.1,2872934.0,38.0,804881.0,652115.5,304.195,10.08
Algeria,112461.0,240.0,3002.0,8.0,1.9,76.88,13913.839,1.02,0.5,44616630.0,29.1,6409018.0,5114493.5,278.364,6.73
Andorra,10849.0,12.0,112.0,0.0,,83.73,,0.99,,77354.0,,32590.5,26811.5,109.135,7.97
