In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import os
import datetime as dt
import numpy as np
import copy 

# Setup

In [None]:
# Global plot styling: monospace text at size 15 and a 16x9 default figure size.
font = dict(family='monospace', weight='normal', size=15)

mpl.rc('font', **font)
plt.rcParams["figure.figsize"] = [16, 9]

# Load data

In [None]:
files = os.listdir("data")
# Sort dates
# Only entries named exactly `worldometers-YYYY-MM-DD.tsv` are considered. Stray
# entries (hidden files, checkpoint folders, differently named exports) used to
# crash the date extraction or yield dates with no matching snapshot file to load.
snapshot_name = re.compile(r'worldometers-(\d{4}-\d{2}-\d{2})\.tsv')
snapshot_matches = [snapshot_name.fullmatch(f) for f in files]
# Parse to datetime so malformed dates fail loudly and ordering is chronological.
dates = [dt.datetime.strptime(m.group(1), '%Y-%m-%d') for m in snapshot_matches if m]
dates = sorted(dates, reverse=False)
dates = [i.strftime('%Y-%m-%d') for i in dates]

In [None]:
# Read every daily snapshot (tab-separated) into a dict keyed by its date string.
dfs = {
    snapshot_date: pd.read_csv(f'data/worldometers-{snapshot_date}.tsv', sep="\t")
    for snapshot_date in dates
}

# Example data

In [None]:
# Preview the most recent snapshot: `dates` is sorted ascending, so the last entry is the newest.
dfs[dates[-1]]

# Cleaning
* Replace NaN values in numerical attributes with `0`
* Replace NaN values in categorical attributes with `Other`

In [None]:
# Fill missing values in place, selecting columns by position: the first column is
# left untouched, the last column gets 'Other', and every column in between gets 0.
for df in dfs.values():
    last_col = df.shape[1] - 1
    for col in range(1, last_col):
        filled_numeric = df.iloc[:, col].fillna(0)
        df.iloc[:, col] = filled_numeric
    filled_category = df.iloc[:, last_col].fillna('Other')
    df.iloc[:, last_col] = filled_category

# Analyzing
## Questions
1. What is the trend of NewCases?
2. What is the trend of NewDeaths?
3. What is the trend of NewRecovered?
4. Distribution of cases per 1 million people
5. Distribution of proportion of recovered cases per 1 million people
6. Distribution of proportion of deaths per 1 million people
7. Total cases in each continent?
8. Does Population affect NewCases?
9. Does Population affect NewDeaths?
10. Does the number of tests conducted affect the total cases?
11. Does the number of serious cases lead to a high number of death cases?

## New cases trend

In [None]:
# One value per snapshot date: the sum of that day's "NewCases" column.
NewCases = [np.sum(dfs[d]["NewCases"]) for d in dates]
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.plot(dates,NewCases)
# Rotation must be a number (or 'vertical'/'horizontal'); the numeric string '90'
# is not a documented value and is rejected by recent Matplotlib releases.
plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("New cases")
plt.title("World new cases")
plt.tight_layout()
plt.savefig("figures/WorldNewCases.png")

## New deaths trend

In [None]:
# One value per snapshot date: the sum of that day's "NewDeaths" column.
NewDeaths = [np.sum(dfs[d]["NewDeaths"]) for d in dates]
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.plot(dates,NewDeaths)
# Rotation must be a number (or 'vertical'/'horizontal'); the numeric string '90'
# is not a documented value and is rejected by recent Matplotlib releases.
plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("New deaths")
plt.title("World new deaths")
plt.tight_layout()
plt.savefig("figures/WorldNewDeaths.png")

## New recovered

In [None]:
# One value per snapshot date: the sum of that day's "NewRecovered" column.
# Stored under its own name; it previously overwrote `NewDeaths`, leaving that
# variable holding recovered counts after this cell ran.
NewRecovered = [np.sum(dfs[d]["NewRecovered"]) for d in dates]
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
plt.plot(dates,NewRecovered)
# Rotation must be a number (or 'vertical'/'horizontal'); the numeric string '90'
# is not a documented value and is rejected by recent Matplotlib releases.
plt.xticks(rotation=90)
plt.xlabel("Date")
plt.ylabel("New recovered")
plt.title("World new recovered")
plt.tight_layout()
plt.savefig("figures/WorldNewRecovered.png")

## Density of cases (cases per 1 million people) 
* Data is taken from the first 3 days of December 2021

In [None]:
# Snapshot dates used for the density histograms.
dates_t = [
    "2021-12-01",
    "2021-12-02",
    "2021-12-03",
]

In [None]:
def display_density(attr, date, xlabel="Cases/1M ppl", title=None, filename=None):
    """Plot a histogram of one column of a daily snapshot and save it as a PNG.

    Parameters
    ----------
    attr : str
        Column of ``dfs[date]`` to histogram.
    date : str
        Key into the module-level ``dfs`` dict selecting the snapshot.
    xlabel : str, optional
        X-axis label. Defaults to the original hard-coded "Cases/1M ppl".
    title : str, optional
        Figure title. Defaults to ``f"Total cases (1M Pop) {date}"``.
    filename : str, optional
        Output path. Defaults to ``f"figures/WorldTotalCase1MPop{date}.png"``.
        Pass a distinct path when plotting another attribute so the default
        file is not overwritten.
    """
    if title is None:
        title = f"Total cases (1M Pop) {date}"
    if filename is None:
        filename = f"figures/WorldTotalCase1MPop{date}.png"

    # savefig does not create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): this draws on the current pyplot figure, as before; several calls
    # within one cell will share that figure - confirm this is what callers want.
    plt.hist(dfs[date][attr])
    # Rotation must be a number (or 'vertical'/'horizontal'); the numeric string '90'
    # is not a documented value and is rejected by recent Matplotlib releases.
    plt.xticks(rotation=90)
    plt.xlabel(xlabel)
    plt.ylabel("")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename)