In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Columns each later cell relies on, in the order they should appear in the frames
VACC_COLUMNS = ['country', 'date', 'iso_code', 'people_vaccinated',
                'daily_vaccinations', 'people_vaccinated_per_hundred']
CASES_COLUMNS = ['location', 'date', 'iso_code', 'total_cases', 'new_cases',
                 'total_deaths', 'new_deaths', 'population', 'total_cases_per_million',
                 'new_cases_per_million']

# Creating df from both the csv files, keeping only the columns needed for analysis.
# `usecols` avoids loading unused columns; selecting with the list afterwards fixes the
# column order, and `.copy()` makes each frame independent of the parsed table so that
# assigning columns to it does not emit SettingWithCopyWarning.
covid_cases_df = pd.read_csv('Resource/covid-data.csv', usecols=CASES_COLUMNS)[CASES_COLUMNS].copy()
covid_vacc_df = pd.read_csv('Resource/country_vaccinations.csv', usecols=VACC_COLUMNS)[VACC_COLUMNS].copy()

# converting date object to pandas 'datetime64[ns]' format
covid_vacc_df['date'] = pd.to_datetime(covid_vacc_df['date'])
covid_cases_df['date'] = pd.to_datetime(covid_cases_df['date'])

In [None]:
# Adding total cases per hundred column (cumulative cases as a percentage of population).
# `assign` returns a new frame instead of writing a column into the existing one, which
# avoids the SettingWithCopyWarning an in-place write on a column-subset frame produces.
covid_cases_df = (
    covid_cases_df
    .assign(total_cases_per_hundred=lambda df: (df['total_cases'] / df['population']) * 100)
    .reset_index(drop=True)
)

In [None]:
# Per-iso_code maxima of the vaccination frame (each column's maximum is taken
# independently), ordered from the highest people_vaccinated_per_hundred downwards.
max_covid_vacc_df = (
    covid_vacc_df
    .groupby('iso_code', as_index=False)
    .max()
    .sort_values('people_vaccinated_per_hundred', ascending=False)
    .reset_index(drop=True)
)
max_covid_vacc_df.head()

In [None]:
# Preview the first rows of the cases frame
covid_cases_df.head()

In [None]:
# Grouped max covid cases DF on country (each column's maximum is taken independently)
max_covid_case_df = covid_cases_df.groupby(['iso_code'], as_index=False).max()


def _rank_country_totals(country_max_df, sort_column):
    """Return the per-country maxima ranked by ``sort_column``, largest first.

    The daily columns ('new_cases', 'new_deaths', 'new_cases_per_million') are
    dropped, the frame is sorted in descending order of ``sort_column``, rows
    with NaN in 'total_cases' are removed and the index is reset.

    Parameters
    ----------
    country_max_df : pandas.DataFrame
        Frame with one row per iso_code, as produced by the groupby above.
    sort_column : str
        Name of the column to rank by.

    Returns
    -------
    pandas.DataFrame
        A new, re-indexed frame; ``country_max_df`` itself is not modified.
    """
    return (
        country_max_df
        .drop(['new_cases', 'new_deaths', 'new_cases_per_million'], axis=1)
        .sort_values(by=[sort_column], ascending=False)
        .dropna(subset=['total_cases'])
        .reset_index(drop=True)
    )


# Sorting the DF based on total cases per hundred
sort_covid_case_per_hund_df = _rank_country_totals(max_covid_case_df, 'total_cases_per_hundred')

# Sorting the DF based on total cases
sort_most_covid_case_df = _rank_country_totals(max_covid_case_df, 'total_cases')

In [None]:
# This df shows the most affected countries based on cases per hundred.
sort_covid_case_per_hund_df.head()

In [None]:
# Countries ranked by total number of cases, most affected first.
# Rows whose iso_code contains 'OWID' (world and continental data) are left out.
sort_most_covid_case_df = sort_most_covid_case_df.loc[
    lambda df: ~df['iso_code'].str.contains('OWID')
]
sort_most_covid_case_df

In [None]:
# Outer-join the cases and vaccination frames on 'iso_code' and 'date', so rows that
# appear in only one of the two frames are kept.
merge_df = covid_cases_df.merge(covid_vacc_df, how='outer', on=['iso_code', 'date'])

In [None]:
# Verifying the merged df: show the row(s) for Afghanistan on 2022-03-22
merge_df.loc[lambda df: (df['iso_code'] == 'AFG') & (df['date'] == '2022-03-22')]

In [None]:
# Population above which a country counts as "large" in this analysis.
# NOTE(review): presumably picked so that roughly the 30 most populous countries pass;
# the nunique() call at the end of the cell shows how many actually do.
LARGE_COUNTRY_MIN_POPULATION = 47_000_000

# creating dataframe to show top 30 largest countries
# na=False treats a missing iso_code as "not an OWID aggregate" instead of letting NaN
# leak into the boolean mask (negating a mask that contains NaN raises TypeError).
is_owid_aggregate = merge_df['iso_code'].str.contains('OWID', na=False)
large_countries_df = merge_df.loc[
    (merge_df['population'] > LARGE_COUNTRY_MIN_POPULATION) & ~is_owid_aggregate, :
]

large_countries_df['location'].nunique()

In [None]:
# Rows of the cases frame that carry the world-level figures (iso_code 'OWID_WRL')
world_cases_df = covid_cases_df[covid_cases_df['iso_code'].eq('OWID_WRL')]
world_cases_df.head()

In [None]:
# Vaccination figures summed over all rows sharing a date.
# numeric_only=True restricts the sum to the numeric columns. Without it, pandas >= 2.0
# also tries to add up the text columns (country, iso_code): string concatenation, or a
# TypeError when values are missing. That also leaves an iso_code column that collides
# with the one in the world cases frame when the two are merged on date.
# NOTE(review): people_vaccinated_per_hundred is summed across countries as well; that
# total is not a world-level rate — confirm before using it.
world_covid_vacc = covid_vacc_df.groupby(['date'], as_index=False).sum(numeric_only=True)

In [None]:
# World case figures joined with the per-date vaccination sums; the outer join keeps
# dates that appear in only one of the two frames.
world_df = world_cases_df.merge(world_covid_vacc, how='outer', on='date')
world_df

In [None]:
# Dataframes available to use

#1. covid_vacc_df  ---- raw df with required columns for vaccination record
#2. covid_cases_df ---- raw df with required columns for cases record with added column for "total cases per hundred"
                        # (still contains the world and continent rows, i.e. iso_code containing 'OWID'; filter them out where needed)
    
#3. max_covid_vacc_df ---- per-column maximum for each iso_code, sorted by people vaccinated per hundred (highest first)
#4. sort_covid_case_per_hund_df ---- data is grouped on country and is sorted with the countries with most cases per hundred
#5. sort_most_covid_case_df ---- data is grouped on country and is sorted with the countries with most cases
#6. merge_df ---- an outer merge on ['iso_code', 'date'] containing the vaccination and cases records.
#7. large_countries_df ---- rows of merge_df for countries with population above 47,000,000 (intended as the ~30 most populous), OWID aggregates excluded
#8. world_df ---- dataframe contains daily cases and daily vaccinations for the world
