# Overview

* We will analyze two datasets related to Covid-19, Covid-19 World Vaccination Progress by country and Covid-19 deaths/cases data by country, to try to relate vaccination progress to case counts.
* We will perform a hypothesis test to infer whether increased Covid-19 vaccination results in lower Covid-19 case counts.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Load both CSV files into DataFrames.
covid_cases_df = pd.read_csv('Resource/covid-data.csv')
covid_vacc_df = pd.read_csv('Resource/country_vaccinations.csv')


# Keep only the columns needed for the analysis.
# .copy() makes each subset an independent DataFrame, so the column
# assignments below (and in later cells) do not write to a slice of the
# original frame, which would raise SettingWithCopyWarning.
covid_vacc_df = covid_vacc_df[['country', 'date', 'iso_code', 'people_vaccinated',
                               'daily_vaccinations', 'people_vaccinated_per_hundred']].copy()

# Convert the date strings to datetime values.
covid_vacc_df['date'] = pd.to_datetime(covid_vacc_df['date'])

covid_cases_df = covid_cases_df[['location', 'date', 'iso_code', 'total_cases', 'new_cases',
                                 'total_deaths', 'new_deaths', 'population', 'total_cases_per_million',
                                 'new_cases_per_million']].copy()

# Same conversion for the cases data, so both frames share a 'date' dtype.
covid_cases_df['date'] = pd.to_datetime(covid_cases_df['date'])


In [None]:
# Drop the rows that contain world or continental data (their iso_code
# contains 'OWID'), then renumber the index.
# reset_index returns a new DataFrame, so the column added below is written
# to a standalone frame rather than to a filtered slice.
covid_cases_df = covid_cases_df[~covid_cases_df['iso_code'].str.contains('OWID')]
covid_cases_df = covid_cases_df.reset_index(drop=True)

# Add the total cases per hundred people column.
# total_cases / population is a per-person fraction; multiply by 100 so the
# values actually are "per hundred", as the column name states.
covid_cases_df['total_cases_per_hundred'] = (
    covid_cases_df['total_cases'] / covid_cases_df['population'] * 100
)

In [None]:
# Remove the world/continent rows, i.e. those whose iso_code contains 'OWID'.
owid_aggregate_rows = covid_vacc_df['iso_code'].str.contains('OWID')
covid_vacc_df = covid_vacc_df.loc[~owid_aggregate_rows]

In [None]:
# Per-country (iso_code) column maxima of the vaccination data, ordered from
# the highest to the lowest people_vaccinated_per_hundred.
max_covid_vacc_df = (
    covid_vacc_df
    .groupby(['iso_code'], as_index=False)
    .max()
    .sort_values(by='people_vaccinated_per_hundred', ascending=False)
    .reset_index(drop=True)
)
max_covid_vacc_df

In [None]:
# Display the cases DataFrame (rows whose iso_code contains 'OWID' removed).
covid_cases_df

In [None]:
# Display the vaccination DataFrame (rows whose iso_code contains 'OWID' removed).
covid_vacc_df

In [None]:
# Per-country (iso_code) column maxima of the cases data.
max_covid_case_df = covid_cases_df.groupby(['iso_code'], as_index=False).max()

# Both rankings below start from the same frame without the daily columns.
daily_columns = ['new_cases', 'new_deaths', 'new_cases_per_million']
country_totals_df = max_covid_case_df.drop(columns=daily_columns)

# Ranking by total cases per hundred: sort descending, drop the rows with a
# NaN total_cases, then renumber the index.
sort_covid_case_per_hund_df = (
    country_totals_df
    .sort_values(by='total_cases_per_hundred', ascending=False)
    .dropna(subset=['total_cases'])
    .reset_index(drop=True)
)

# Ranking by absolute total cases, using the same steps.
sort_most_covid_case_df = (
    country_totals_df
    .sort_values(by='total_cases', ascending=False)
    .dropna(subset=['total_cases'])
    .reset_index(drop=True)
)

In [None]:
# This df shows the most affected countries, ranked by the
# total_cases_per_hundred column (highest first).
sort_covid_case_per_hund_df

In [None]:
# This df shows the most affected countries, ranked by the total number of
# cases (highest first).
sort_most_covid_case_df

In [None]:
# Merge the cases and vaccination data on 'iso_code' and 'date'.
# An outer merge keeps rows that appear in only one of the two frames.
merge_df = covid_cases_df.merge(
    covid_vacc_df,
    how='outer',
    on=['iso_code', 'date'],
)

In [None]:
# Verify the merged df by inspecting one country/date combination.

merge_df.loc[merge_df['iso_code'].eq('AFG') & merge_df['date'].eq('2022-03-06')]

In [None]:
# Dataframes available to use

#1. covid_vacc_df  ---- raw df with required columns for vaccination record
#2. covid_cases_df ---- raw df with required columns for cases record with added column for "total cases per hundred"
                        # plus data for world and continents are removed from it
    
#3. max_covid_vacc_df ---- data is grouped on country and is sorted with the countries with most people vaccinated
#4. sort_covid_case_per_hund_df ---- data is grouped on country and is sorted with the countries with most cases per hundred
#5. sort_most_covid_case_df ---- data is grouped on country and is sorted with the countries with most cases
#6. merge_df ---- a merged df (on=['iso_code', 'date']) containing the vaccination and cases records.




# sample plot

In [None]:
# Rows of the merged data whose iso_code is 'ISR'.
isreal_df = merge_df[merge_df['iso_code'].eq('ISR')]

In [None]:


# Overlay daily vaccinations (blue) and new cases (red) by date for the
# 'ISR' rows. The series are labelled and the chart gets a legend, axis
# labels and a title, because colour alone does not tell the reader which
# series is which.
plt.bar(isreal_df['date'], isreal_df['daily_vaccinations'], color='b', alpha=0.5,
        label='Daily vaccinations')
plt.bar(isreal_df['date'], isreal_df['new_cases'], color='r', alpha=0.5,
        label='New cases')
# Limit the x-axis to the date range present in the data.
plt.xlim(isreal_df['date'].min(), isreal_df['date'].max())
plt.xticks(rotation=90, fontweight='light', fontsize='small')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Israel: daily vaccinations vs. new Covid-19 cases')
plt.legend()
plt.show()

In [None]:
# Display the merged rows selected for iso_code 'ISR'.
isreal_df

In [None]:
# Row(s) on which new_cases reaches its maximum within isreal_df.
isreal_df[isreal_df['new_cases'].eq(isreal_df['new_cases'].max())]