# Library

In [None]:
import warnings
# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas' SettingWithCopyWarning and library deprecation notices.
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import geopandas as gpd
import numpy as np

# Fuzzy string matching, used to align country names with the world map.
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

# Show floats with three decimal places when pandas objects are displayed.
pd.set_option("display.precision", 3)

# Load data

In [None]:
# Load the CSV into the notebook-wide frame `df` and preview the first rows.
csv_path = '../data/2022-03-27-clean.csv'
df = pd.read_csv(csv_path)
df.head()

# Z-score

In [None]:
def z_score(x):
    """Standardize `x`: subtract its mean, then divide by its standard deviation.

    `x` is any object exposing `.mean()` and `.std()` and supporting
    element-wise arithmetic (e.g. a pandas Series or a NumPy array). The
    standard deviation is whatever `x.std()` returns for that type.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Top 10 Countries With The Most Covid-19 Cases 

In [None]:
# Ten rows with the largest TotalCases, in descending order.
# `nlargest` does not rank missing values at the top, whereas reversing a
# NumPy `argsort` (which places NaN last) would put NaN rows first.
top10_total = df.nlargest(10, 'TotalCases')

# Horizontal bar chart: one bar per country, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=top10_total, x='TotalCases', y='Country,Other', color="salmon", ax=ax)
ax.set_title("Top 10 Countries With The Most Covid-19 Cases")
ax.set_ylabel("Country")
ax.set_xlabel("Total cases")

fig.savefig('../img/fig1.png')
plt.show()

# Top 10 Countries With The Highest Population-to-Case Ratio

In [None]:
# Population divided by total cases, i.e. how many inhabitants there are per
# confirmed case in each row of `df`.
df_rate_case_pop = df['Population'] / df['TotalCases']

# Attach the ratio as a new column on a *copy* of `df` and keep the ten largest.
# This avoids (a) writing a column onto an `iloc` slice (chained assignment) and
# (b) using positional `argsort` output as index labels, which only works when
# `df` has a default RangeIndex.
top10_rate_case_pop = df.assign(Rate=df_rate_case_pop).nlargest(10, 'Rate')

fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=top10_rate_case_pop, y='Country,Other', x='Rate', color="salmon", ax=ax)
# The plotted value is a raw population/cases ratio, not a percentage, so the
# axis is labelled accordingly.
ax.set_xlabel("Population per case")
ax.set_ylabel("Country")
ax.set_title("Rate total population respected to total cases over countries")
fig.savefig("../img/fig2.png")
plt.show()

# Create a dataframe grouped by continent

In [None]:
# Per-continent totals. `numeric_only=True` restricts the sum to numeric
# columns; without it, recent pandas versions try to "sum" text columns such as
# 'Country,Other' (string concatenation) or raise.
# `as_index=False` keeps 'Continent' as a regular column, which is what
# `reset_index()` did before.
# NOTE(review): per-capita columns (e.g. 'Deaths/1M pop') are summed too; those
# sums are not meaningful continent-level rates.
df_continent = df.groupby("Continent", as_index=False).sum(numeric_only=True)
df_continent.head()

# Comparing The Number Of Covid-19 Cases Between Continents

In [None]:
# Not used in this cell; kept because other cells may reference the name.
colors = sns.color_palette('pastel')
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

# Tests performed per inhabitant, expressed as a percentage so that the values
# match the "Rate (%)" axis label below.
rate_test_pop_continent = 100 * df_continent['TotalTests'] / df_continent['Population']

# Build the sorted frame on a copy (`assign`) instead of writing a column onto
# an `iloc` slice, which is chained assignment and may not stick.
df_rate_test_pop_continent = (
    df_continent
    .assign(**{'Test/pop': rate_test_pop_continent})
    .sort_values('Test/pop', ascending=False)
)

# Left panel: share of total cases per continent.
ax[0].pie(x=df_rate_test_pop_continent['TotalCases'],
          labels=df_rate_test_pop_continent['Continent'],
          autopct='%.0f%%',
          wedgeprops={'edgecolor': 'k', 'linestyle': 'dashed'},
          colors=len(df_rate_test_pop_continent['TotalCases']) * ["salmon"])
ax[0].set_title("Total case over continents")

# Right panel: tests relative to population, highest first.
sns.barplot(ax=ax[1],
            y=df_rate_test_pop_continent['Test/pop'],
            x=df_rate_test_pop_continent['Continent'],
            color="salmon")
ax[1].set_title("Rate test respected to population over continents")
ax[1].set_xlabel("Continent")
ax[1].set_ylabel("Rate (%)")

fig.savefig('../img/fig3.png')
plt.show()

# Pairplot for checking relationship

In [None]:
# Columns whose pairwise scatter plots are drawn (lower triangle only).
pair_columns = [
    'TotalCases',
    'NewCases',
    'TotalDeaths',
    'NewDeaths',
    'TotalRecovered',
    'NewRecovered',
    'ActiveCases',
    'Serious,Critical',
]
sns.pairplot(data=df, vars=pair_columns, corner=True)
plt.show()

# Pearson's correlation

In [None]:
# Pairwise Pearson correlation of the numeric (and boolean) columns.
# Selecting them explicitly is required on pandas >= 2.0, where `DataFrame.corr`
# no longer drops text columns such as 'Country,Other' silently and raises
# instead; `select_dtypes` works on older pandas versions as well.
corr = df.select_dtypes(include=['number', 'bool']).corr()
corr.style.background_gradient()

# Heatmap of correlation

In [None]:
# Greyscale heatmap of the correlation matrix on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Pearson's correlation")
sns.heatmap(corr, cmap='Greys', ax=ax)
plt.show()

# Correlations with total cases, sorted

In [None]:
# Correlation of every column with TotalCases: sorted ascending, then reversed
# so the strongest positive correlations come first.
corr_total_cases = corr['TotalCases'].sort_values().iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=corr_total_cases, y=corr_total_cases.index, color='salmon', ax=ax)
plt.show()

# Distribution of deaths per 1M population across countries

In [None]:
# Deaths per million inhabitants; missing values are dropped so the histogram
# receives only real observations (mean/median already skip NaN in pandas).
deaths_per_million = df['Deaths/1M pop'].dropna()
mean = deaths_per_million.mean()
median = deaths_per_million.median()

plt.figure(figsize=(10, 8))
# Roughly one bin per four rows; at least one bin so tiny frames do not fail.
plt.hist(deaths_per_million, bins=max(1, len(df) // 4), color='salmon')

# Reference lines are drawn in black: in the bars' own colour they would be
# indistinguishable from the histogram.
plt.axvline(mean, color='black', linestyle='--', label="Mean")
plt.axvline(median, color='black', linestyle='-', label="Median")

plt.legend()
plt.xlabel('Deaths/1M population Histogram')
plt.savefig('../img/fig4.png')
plt.show()

# Log-log relationship between deaths and serious cases

In [None]:
# Standardized log-counts, stored on `df` because the OLS fit in the next cell
# reads these two columns.
# log(0) is -inf, and a single -inf makes the column mean -inf and the standard
# deviation NaN, so every z-score would become NaN. Non-positive counts are
# therefore masked to NaN first; pandas' mean/std then skip them.
serious = df['Serious,Critical']
total_deaths = df['TotalDeaths']
df['Serious_Log_Std'] = z_score(np.log(serious.where(serious > 0)))
df['TotalDeaths_Log_Std'] = z_score(np.log(total_deaths.where(total_deaths > 0)))

fig, ax = plt.subplots(figsize=(10, 8))

# Scatter plot with a fitted regression line.
sns.regplot(data=df, x='Serious_Log_Std', y='TotalDeaths_Log_Std', color='salmon', ax=ax)
ax.set_xlabel("Log(Serious)")
ax.set_ylabel("Log(Death)")
ax.set_title("Relationship between log-log of death and serious case")

fig.savefig("../img/fig5.png")
plt.show()

In [None]:
# Ordinary least squares: standardized log-deaths explained by standardized
# log-serious cases. The fitted model is kept in `result`.
formula = ' Q("TotalDeaths_Log_Std") ~ Q("Serious_Log_Std")'
model = smf.ols(formula, data=df)
result = model.fit()
result.summary()

# Death and recovery rates

In [None]:
# Share of total cases that ended in death / recovery, per country.
# Multiplied by 100 so the numbers really are percentages, matching the
# 'Rate (%)' column name used on the plot axis.
df_rate_wide = pd.DataFrame({
    'Country,Other': df['Country,Other'],
    'Dead rate': 100 * df['TotalDeaths'] / df['TotalCases'],
    'Recovered rate': 100 * df['TotalRecovered'] / df['TotalCases'],
})

# Long format: one row per (country, rate type), as the swarm plot expects.
df_rate = df_rate_wide.melt('Country,Other', var_name='Rate type', value_name='Rate (%)')

fig, ax = plt.subplots(figsize=(10, 8))
sns.swarmplot(data=df_rate, y='Rate (%)', x='Rate type', color='salmon', ax=ax)
ax.set_title("Dead and recovered rate")
fig.savefig('../img/fig6.png')
plt.show()

In [None]:
# Add a z-scored copy (suffix '_Std') of each count column to `df`.
for column in ('TotalCases', 'TotalDeaths', 'TotalRecovered'):
    df[column + '_Std'] = z_score(df[column])

# Regress standardized total cases on standardized recoveries and deaths.
model = smf.ols('TotalCases_Std ~ TotalRecovered_Std + TotalDeaths_Std', data=df)
result = model.fit()
result.summary()

# Total tests less than total cases

In [None]:
# Five rows with the smallest (tests - cases) difference.
# `assign` builds the 'Delta' column on a copy; binding `df` to a second name
# and assigning to it would add 'Delta' to the shared `df` itself.
df_delta_test_total = (
    df.assign(Delta=df['TotalTests'] - df['TotalCases'])
      .sort_values(by='Delta', ascending=True)
      .head(5)
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_delta_test_total, x='Country,Other', y='Delta', color='salmon', ax=ax)
# Zero line: bars below it have fewer recorded tests than cases.
ax.axhline(0, color='black')

ax.set_xlabel("Country")
ax.set_ylabel("(Total test - Total case)")
ax.set_title("Difference between total test and total case over countries")
fig.savefig("../img/fig7.png")
plt.show()

# Define world map

In [None]:
# Country polygons used as the base map; later cells rely on its 'name' column.
# The bundled `gpd.datasets` module was deprecated in GeoPandas 0.14 and removed
# in 1.0, where this lookup raises AttributeError. In that case fall back to the
# Natural Earth 110m countries archive (needs network access) and expose its
# 'NAME' column under the lowercase 'name' the rest of the notebook uses.
try:
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    world = gpd.read_file(
        'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
    ).rename(columns={'NAME': 'name'})
world.head()

# Cases on map over countries

In [None]:
def fuzzy_match(findStr, listAC, nElement=1):
    """Return the `nElement` best fuzzy matches of `findStr` among `listAC`.

    Thin wrapper around `fuzzywuzzy.process.extract` using the
    `token_sort_ratio` scorer. The result is a list of tuples whose first two
    items are the matched choice and its score.
    """
    return process.extract(findStr, listAC, limit=nElement, scorer=fuzz.token_sort_ratio)


# Abbreviations that are mapped by hand to the names used on the world map.
COUNTRY_ALIASES = {
    "USA": "United States of America",
    "UAE": "United Arab Emirates",
    "UK": "United Kingdom",
}

# Resolve each country name against the map's names. The names are collected in
# a new list instead of being written back into `df['Country,Other']`, so the
# source column stays untouched and the cell gives the same result when re-run.
# It also avoids label-based `countries[i]` writes, which are only correct for a
# default RangeIndex.
resolved_names = []
for country in df['Country,Other']:
    candidates = fuzzy_match(country, world['name'], 1)
    best_name, best_score = candidates[0][0], candidates[0][1]
    # Accept close-but-not-exact matches only; exact matches (100) need no change.
    if 75 < best_score < 100:
        print(country, candidates)
        resolved_names.append(best_name)
    else:
        resolved_names.append(country)

countries = pd.Series(resolved_names, index=df.index, name='Country,Other').replace(COUNTRY_ALIASES)

df['Country'] = countries
df['Rate_case'] = df['Population'] / df['TotalCases']
# Left join keeps every map polygon; countries without data get NaN columns.
world_country = world.merge(df, how='left', left_on='name', right_on='Country')

In [None]:
# Choropleth of total cases per country, shaded with a dark-to-salmon colormap.
salmon_cmap = sns.color_palette("dark:salmon_r", as_cmap=True)

fig, ax = plt.subplots(1, 1, figsize=(15, 8))
world_country.plot(
    column='TotalCases',
    ax=ax,
    legend=True,
    cmap=salmon_cmap,
    edgecolors='grey',
)
ax.set_title("Total cases over countries")
fig.savefig("../img/fig8.png")
plt.show()