# Overview

* We will analyze two datasets related to Covid-19: Covid-19 World Vaccination Progress by country, and Covid-19 death/case data by country.
* We will perform a hypothesis test to infer whether increased Covid-19 vaccination results in lower Covid-19 case counts.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Load both CSV files into DataFrames.
covid_cases_df = pd.read_csv('Resource/covid-data.csv')
covid_vacc_df = pd.read_csv('Resource/country_vaccinations.csv')


# Keep only the columns needed for the analysis.
# `.copy()` makes the subset an independent DataFrame, so the `date`
# assignment below writes to it directly instead of to a slice of the raw
# frame (which is what triggers pandas' SettingWithCopyWarning).
covid_vacc_df = covid_vacc_df[['country', 'date', 'iso_code', 'people_vaccinated',
                               'daily_vaccinations', 'people_vaccinated_per_hundred']].copy()

# Convert the date strings to pandas 'datetime64[ns]' values.
covid_vacc_df['date'] = pd.to_datetime(covid_vacc_df['date'])

# Same treatment for the case data: select columns, copy, then parse dates.
covid_cases_df = covid_cases_df[['location', 'date', 'iso_code', 'total_cases', 'new_cases',
                                 'total_deaths', 'new_deaths', 'population', 'total_cases_per_million',
                                 'new_cases_per_million']].copy()

# Convert the date strings to pandas 'datetime64[ns]' values.
covid_cases_df['date'] = pd.to_datetime(covid_cases_df['date'])

In [None]:
# Derive cumulative cases as a percentage of population
# (total_cases / population, scaled to "per hundred people").
covid_cases_df['total_cases_per_hundred'] = (
    covid_cases_df['total_cases']
    .div(covid_cases_df['population'])
    .mul(100)
)

In [None]:
# Drop the rows that contain world or continental data: those rows carry an
# iso_code containing 'OWID'.
# `na=False` makes a missing iso_code count as "no match", so the boolean mask
# never contains NaN and can always be inverted with `~`.
# NOTE(review): any country whose iso_code also contains 'OWID' is dropped by
# this filter as well — confirm that is acceptable.
covid_cases_df = covid_cases_df[~covid_cases_df['iso_code'].str.contains('OWID', na=False)]

# Renumber the rows 0..n-1 after the filter.
covid_cases_df = covid_cases_df.reset_index(drop=True)

In [None]:
# Drop the rows that contain world or continental data: those rows carry an
# iso_code containing 'OWID'.
# `na=False` makes a missing iso_code count as "no match", so the boolean mask
# never contains NaN and can always be inverted with `~`.
# The index is intentionally left as-is here (no reset_index).
covid_vacc_df = covid_vacc_df[~covid_vacc_df['iso_code'].str.contains('OWID', na=False)]

In [None]:
# One row per country (iso_code) holding each column's maximum value, ordered
# from the highest to the lowest share of people vaccinated per hundred, with
# a fresh 0..n-1 index.
max_covid_vacc_df = (
    covid_vacc_df
    .groupby(['iso_code'], as_index=False)
    .max()
    .sort_values(by='people_vaccinated_per_hundred', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Display the vaccination DataFrame as it stands after the OWID filter.
# NOTE(review): the preceding cell builds max_covid_vacc_df — confirm which
# frame was meant to be inspected here.
covid_vacc_df

In [None]:
# Per-country maxima of the case data: one row per iso_code.
max_covid_case_df = covid_cases_df.groupby(['iso_code'], as_index=False).max()

# Countries ranked by cumulative cases per hundred people. The daily "new_*"
# columns are removed, rows without a total_cases value are discarded, and the
# index is renumbered.
sort_covid_case_per_hund_df = (
    max_covid_case_df
    .drop(columns=['new_cases', 'new_deaths', 'new_cases_per_million'])
    .sort_values(by='total_cases_per_hundred', ascending=False)
    .dropna(subset=['total_cases'])
    .reset_index(drop=True)
)

# The same table, ranked by absolute number of total cases instead.
sort_most_covid_case_df = (
    max_covid_case_df
    .drop(columns=['new_cases', 'new_deaths', 'new_cases_per_million'])
    .sort_values(by='total_cases', ascending=False)
    .dropna(subset=['total_cases'])
    .reset_index(drop=True)
)

In [None]:
# This df shows the most affected countries based on cases per hundred.


In [None]:
# This df shows the most affected countries based on the highest number of total cases.


In [None]:
# Combine the case and vaccination data on country code and date.
# An outer join keeps rows that appear in only one of the two sources.
merge_df = pd.merge(
    left=covid_cases_df,
    right=covid_vacc_df,
    how='outer',
    on=['iso_code', 'date'],
)


In [None]:
# Spot-check the merge: show the merged row(s) for Afghanistan ('AFG') on
# 2022-03-22.
merge_df.loc[merge_df['iso_code'].eq('AFG') & merge_df['date'].eq('2022-03-22')]

# Relationship Between Total Covid-19 Vaccinations vs Covid-19 cases for the entire dataset

In [None]:
# Stackplots layer the y-axis values, which is good for data with multiple y features. In this case, we have 233.
# Lowering the opacity (alpha) helps visualize how this layering occurs. The y-axis is scaled by 1e7, i.e. in units of ten million.

In [None]:
# Overlay daily vaccinations (blue) and new reported cases (red) over time.
# NOTE(review): merge_df appears to hold one row per country per date, so each
# call below receives every country's rows rather than a per-date worldwide
# total — confirm whether aggregating by date first was intended.
plt.stackplot(merge_df['date'],
              merge_df['daily_vaccinations'],
              color='b', alpha=0.2)

plt.stackplot(merge_df['date'],
              merge_df['new_cases'],
              color='r', alpha=0.5)

# Span the x-axis over the full date range present in the merged data.
plt.xlim(merge_df['date'].min(), merge_df['date'].max())
plt.xticks(rotation=45, fontweight='light',  fontsize='small')
# NOTE(review): this date range is hard-coded; keep it in sync with the data.
plt.xlabel('Jan 1,2020 - Mar 29,2022')
plt.ylabel('Daily Vaccinations vs New Reported Cases')
# The title names both plotted series (it previously mentioned cases twice and
# omitted the vaccinations).
plt.title('Worldwide Daily Vaccinations plotted Alongside New Reported Covid Cases', pad=20)
# Legend entries are listed in plotting order: vaccinations first, then cases.
plt.legend(['Daily Vaccinations','New Cases'], loc='upper left')
plt.show()

In [None]:
# I set the alpha to 0.2 to better visualize the peaks and slopes that were potentially being obscured by
# countries that reported cases poorly, resulting in vertical lines between data points.
# An important distinction is that this plot is scaled by 1e6 (millions), while the previous plot is scaled by 1e7 (ten millions).


In [None]:
# Plot new reported cases on their own (red, low alpha so overlapping areas
# remain visible).
plt.stackplot(merge_df['date'], merge_df['new_cases'], color='r',alpha=0.2)
# Span the x-axis over the full date range present in the merged data.
plt.xlim(merge_df['date'].min(), merge_df['date'].max())
plt.xticks(rotation=45, fontweight='light',  fontsize='small')
# NOTE(review): this date range is hard-coded; keep it in sync with the data.
plt.xlabel('Jan 1,2020 - Mar 29,2022')
plt.ylabel('New Reported Cases')
plt.title('Worldwide Reported Covid Cases', pad=20)
# Only one series is drawn in this figure, so the legend carries a single
# label; with two labels the first one ('Daily Vaccinations') was attached to
# the new-cases area.
plt.legend(['New Cases'], loc='upper left')
plt.show()

Overwhelmingly, on a global scale, the population of those vaccinated far exceeded the population of recorded cases. Overall, the population of the world far exceeds the population of those vaccinated. Notably, all of this relies on trusting the data reported by each country. Evidence of poor record-keeping might even be visible in these plots.

In [None]:
# #stackplot showing just daily vaccinations. 

# plt.stackplot(total_relationship_df_dates['date'], 
#               total_relationship_df_dates['daily_vaccinations'], 
#               color='b', alpha=0.5)
# plt.xlim(total_relationship_df_dates['date'].min(), 
#          total_relationship_df_dates['date'].max())
# plt.xticks(rotation=90, fontweight='light',  fontsize='small')
# plt.show()

# Experimental/Testing 

The code/plots/dataframes here are the additional testing and analysis I was/am doing. Not strictly relevant to the research question. 

In [None]:
# #To create a plot that shows a scaled in version of the above plot focused on when vaccinations became available
# #starting 13 days before. The plot is coded directly below.

# interest_dates_total_rel = (merge_df_total_relationship_df['date'] > '2020-12-1') & \
# (merge_df_total_relationship_df['date'] <= '2022-03-8')

# total_relationship_df_dates = merge_df_total_relationship_df.loc[interest_dates_total_rel]

In [None]:
#plt.stackplot(total_relationship_df_dates['date'], 
#              total_relationship_df_dates['daily_vaccinations'], 
#              color='b', alpha=0.5)

#plt.stackplot(total_relationship_df_dates['date'], 
#              total_relationship_df_dates['new_cases'], 
#              color='r', alpha=0.5)

# plt.xlim(total_relationship_df_dates['date'].min(), total_relationship_df_dates['date'].max())
# plt.xticks(rotation=90, fontweight='light',  fontsize='small')
# plt.show()

In [None]:
# #Dataframe creation for exclusively cases before vaccination introduction. 

# interest_dates_cases_before = (covid_cases_df['date'] > covid_cases_df['date'].min()) & \
# (covid_cases_df['date'] <= covid_cases_df['date'].min())

# cases_before_df = covid_cases_df.loc[interest_dates_cases_before]

In [None]:
# #Plot of the case data before vaccination introduction. As a barplot, it might take up to 5 minutes to render. 
# #Also has no labels/title.

# plt.bar(relationship_df_cases['date'], relationship_df_cases['new_cases'], color='r', alpha=0.5)
# plt.xlim(relationship_df_cases['date'].min(), relationship_df_cases['date'].max())
# plt.xticks(rotation=90, fontweight='light',  fontsize='small')
# plt.show()

In [None]:
# #Grouped collection of code in progress for creating a fully labeled stackplot with each country labeled. 

# country_daily_list = merge_df.groupby(['iso_code'])['daily_vaccinations'].apply(list)
# country_daily_dict = country_daily_list.to_dict()

# max_key, max_value = max(country_daily_dict.items(), key = lambda a: len(set(a[1])))
# min_key, min_value = min(country_daily_dict.items(), key = lambda b: len(set(b[1])))
# print(max_key, len(max_value))
# print(min_key, len(min_value))

# country_daily_dict_equal = {}
# for k,v in country_daily_dict.items():
#     if len(v) < len(max_value):
#         zeroes = [0]*(len(max_value)-len(v))
#         country_daily_dict_equal[k] = zeroes+v
#     else:
#         country_daily_dict_equal[k] = v

# max_key_v2, max_value_v2 = max(country_daily_dict_equal.items(), key = lambda c: len(set(c[1])))
# min_key_v2, min_value_v2 = min(country_daily_dict_equal.items(), key = lambda d: len(set(d[1])))
# print(min_key_v2, len(min_value_v2))
# print(max_key_v2, len(max_value_v2))

In [None]:
#Collection of code in progress for summary table dataframe

# country_daily_list = merge_df.groupby(['iso_code'])['daily_vaccinations'].apply(list)
# country_daily_dict = country_daily_list.to_dict()

# populations = []
# [populations.append(n) for n in merge_df["population"] if n not in populations]
# populations_clean = [n for n in populations if str(n) != 'nan']
# #len(populations_clean)

# first_date = min(merge_df['date'])
# first_date_clean = first_date.strftime('%m/%d/%Y')

# last_date = max(merge_df['date'])
# last_date_clean = last_date.strftime('%m/%d/%Y')
# time_delta = last_date - first_date

In [None]:
# #For finding the total cases. Also helped find which countries did not record any cases. 

# cases_max_df =  pd.DataFrame(covid_cases_df.groupby(['iso_code','location'])['total_cases'].max())
# cases_max_df["total_cases"] = cases_max_df["total_cases"].fillna(0)
# zero_cases = cases_max_df.loc[cases_max_df["total_cases"] == 0]

In [None]:
# #For finding the date and location(s) where the first case was recorded

# case_dates_sorted_df = covid_cases_df.sort_values('date')
# earliest_case = case_dates_sorted_df.loc[(case_dates_sorted_df['total_cases'] ==1)] 
# earliest_case = earliest_case.reset_index(drop=True)
# same_date = earliest_case.loc[earliest_case['date'] == earliest_case['date'].iloc[0]]
# case_date = earliest_case['date'].iloc[0]
# case_date = case_date.strftime('%m/%d/%Y')

# case_date_str = (f"{vacc_date} [\
# {same_date['location'].iloc[0]},\
#  {same_date['location'].iloc[1]},\
#  {same_date['location'].iloc[2]},\
#  {same_date['location'].iloc[3]}]")

In [None]:
# # for finding the total vaccinations recorded in the dataframe across all countries. 

#vacc_sum_df =  pd.DataFrame(covid_vacc_df.groupby(['iso_code','country'])['daily_vaccinations'].sum())

In [None]:
# #For finding the earliest date that daily vaccinations began.

# vacc_dates = covid_vacc_df.sort_values('date')
# earliest_vacc = vacc_dates.loc[(vacc_dates['daily_vaccinations'] ==1)] 
# earliest_vacc = earliest_vacc.reset_index(drop=True)
# vacc_date = earliest_vacc['date'].iloc[0]
# vacc_date = vacc_date.strftime('%m/%d/%Y')

In [None]:
#Summary Datatable

# world_summary_df = pd.DataFrame(
#     {"Total Countries":[len(country_daily_dict.keys())],
#      "Total Population":[sum(populations_clean)],
#      "Entire Timespan":[(f"{first_date_clean}-{last_date_clean}")],
#      "Total Days": [time_delta.days],
#      "Total Cases":[cases_max_df["total_cases"].sum()],
#      "Total Vaccinations":[vacc_sum_df['daily_vaccinations'].sum()],
#      "Date of First Recorded Case":[case_date],
#      "Date of First Daily Vaccination":[vacc_date],
#      "Most Cases in Single Day": [""],    
#      "Most Vaccinations in Single Day":[""],
#     }) 