In [None]:
# importing the necessary packages for the project
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from google.colab import files
import numpy as np

# setting the float format for the entire program
# (every float shown in a DataFrame display is rendered with two decimals)
pd.options.display.float_format = "{:.2f}".format

# specifying the colors to reuse for plotting each of the attributes below
# each attribute has up to three spellings of its colour:
#   *_color_hex   - hex digits of the colour
#   *_color_title - colour name, passed to the plotly charts further down
#   *_color_map   - colormap name, passed as `cmap` to the styled totals table
# NOTE(review): the *_color_hex values carry no leading "#" and are not referenced
# in the cells visible here; presumably a "#" has to be prepended before they can
# be used as a colour -- TODO confirm
vaccination_color_hex = "00FF00"
vaccination_color_title = "green"
vaccination_color_map = "Greens_r"

cases_color_hex = "6a0dad"
cases_color_title = "purple"
cases_color_map = "Purples"

deaths_color_hex = "FF0000"
deaths_color_title = "red"
deaths_color_map = "Reds_r"

active_cases_color_hex = "FFA500"
active_cases_color_title = "orange"
active_cases_color_map = "Oranges_r"

# the mortality rate has no hex variant
mortality_rate_color_map = "Blues_r"
mortality_rate_color_title = "blue"

# a method to transpose matrices with date columns into matrices with date rows
def modify(df, cases_or_deaths):
    """Reshape a wide frame (one column per date) into long form.

    The first column of ``df`` is taken as the country label and every
    remaining column as one date. The result has one row per
    (date, country) pair with the columns ["Date", "Country", cases_or_deaths].
    The frame passed in is left untouched.
    """
    # dates become the row index, the original rows become columns
    transposed = df.T
    # the first transposed row holds the country labels: use it as the header
    transposed.columns = transposed.iloc[0]
    values_by_date = transposed.iloc[1:, :]
    # stack folds the country columns into a second index level
    long_form = values_by_date.stack().to_frame().reset_index()
    long_form.columns = ["Date", "Country", cases_or_deaths]
    return long_form

In [None]:
# importing datasets
# every source is a remote CSV; the addresses are collected in one place
dataset_urls = {
    "confirmed_global": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
    "deaths_global": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    "coordinates": "https://raw.githubusercontent.com/albertyw/avenews/master/old/data/average-latitude-longitude-countries.csv",
    "daily_vaccinations": "https://raw.githubusercontent.com/govex/COVID-19/master/data_tables/vaccine_data/global_data/time_series_covid19_vaccine_global.csv",
    "gdp_per_capita": "https://raw.githubusercontent.com/nchichilidze/gdppercapitacsv/main/GDP.csv",
}

# each frame is downloaded with pandas' default CSV parsing
confirmed_global = pd.read_csv(dataset_urls["confirmed_global"])
deaths_global = pd.read_csv(dataset_urls["deaths_global"])
coordinates = pd.read_csv(dataset_urls["coordinates"])
daily_vaccinations = pd.read_csv(dataset_urls["daily_vaccinations"])
gdp_per_capita = pd.read_csv(dataset_urls["gdp_per_capita"])

In [None]:
def _cumulative_by_country(raw_time_series, value_name):
    """Collapse a province-level time series to one row per (date, country).

    Sums the date columns over all provinces/regions of each country, drops the
    summed Lat/Long columns (universal coordinates are merged in below from a
    separate dataset) and reshapes the result with ``modify`` into the columns
    ["Date", "Country", value_name].
    """
    per_country = (
        raw_time_series.groupby("Country/Region")
        # numeric_only keeps any non-numeric column (e.g. a province name) out of
        # the sum instead of relying on pandas silently discarding it
        .sum(numeric_only=True)
        .reset_index()
        # columns= is used because the positional axis argument of drop() is no
        # longer accepted by recent pandas releases
        .drop(columns=["Lat", "Long"])
    )
    return modify(per_country, value_name)


# grouping all provinces/regions of each country together and summing the number
# of cases / deaths, then displaying one row per date and country
confirmed = _cumulative_by_country(confirmed_global, "Cases")
deaths = _cumulative_by_country(deaths_global, "Deaths")

# creating a collective dataframe that contains both death and case numbers
# joining on the keys (rather than copying the column by row position) keeps every
# death count next to its own date/country even if the two sources ever differ
collective_df = confirmed.merge(
    deaths, on=["Date", "Country"], how="left", validate="one_to_one"
)

# keeping only the needed columns from the coordinates dataframe
# selecting by name (instead of cutting off the first column by position) means
# re-running this cell does not remove one more column each time
coordinates = coordinates[["Country", "Latitude", "Longitude"]].copy()
coordinates["Country"] = coordinates["Country"].str.replace(
    "United States", "US", regex=False
)

# adding the coordinates info to the main dataframe
# countries without a match keep empty Latitude/Longitude values
collective_df = pd.merge(collective_df, coordinates, on="Country", how="left")


def _daily_increase(cumulative):
    """Absolute change of a cumulative series from one row to the next.

    The first row has no predecessor and is compared against 0.
    """
    # NOTE(review): abs() turns a downward correction of the cumulative count into
    # a positive increase -- confirm this is the intended treatment
    return cumulative.sub(cumulative.shift().fillna(0)).abs()


# since the main dataframe contains accumulative numbers for cases and deaths,
# I turned them into incremental values to see how many new cases/deaths there are daily
collective_df["New_Cases"] = collective_df.groupby("Country")["Cases"].transform(
    _daily_increase
)
collective_df["New_Deaths"] = collective_df.groupby("Country")["Deaths"].transform(
    _daily_increase
)

# formatting the date string in the dataframe as day/month/year
collective_df["Date"] = pd.to_datetime(collective_df["Date"]).dt.strftime("%d/%m/%Y")

In [None]:
# previewing the combined cases/deaths dataframe with coordinates and daily increments
collective_df

Unnamed: 0,Date,Country,Cases,Deaths,Latitude,Longitude,New_Cases,New_Deaths
0,22/01/2020,Afghanistan,0,0,33.00,65.00,0,0
1,22/01/2020,Albania,0,0,41.00,20.00,0,0
2,22/01/2020,Algeria,0,0,28.00,3.00,0,0
3,22/01/2020,Andorra,0,0,42.50,1.50,0,0
4,22/01/2020,Angola,0,0,-12.50,18.50,0,0
...,...,...,...,...,...,...,...,...
132687,28/11/2021,Vietnam,1210340,24882,16.00,106.00,12936,190
132688,28/11/2021,West Bank and Gaza,459479,4789,,,0,0
132689,28/11/2021,Yemen,9987,1946,15.00,48.00,6,1
132690,28/11/2021,Zambia,210143,3667,-15.00,30.00,5,0


In [None]:
# drop the province from the vaccines dataset since it's not relevant in this case
# dropping UID and doses_admin since I am not going to use them
# (columns= is used because the positional axis argument of drop() is no longer
# accepted by recent pandas releases)
daily_vaccinations = daily_vaccinations.drop(
    columns=["Province_State", "UID", "Doses_admin"]
)

# parsing the dates first so that the grouped rows are ordered chronologically,
# which the row-to-row differences below depend on
daily_vaccinations["Date"] = pd.to_datetime(daily_vaccinations["Date"])

# group vaccinations by the same DATE and COUNTRY to ensure that the province data does not get lost
# numeric_only keeps any remaining text column out of the sum
daily_vaccinations = (
    daily_vaccinations.groupby(["Date", "Country_Region"])
    .sum(numeric_only=True)
    .reset_index()
)


def _increase_since_previous_report(cumulative):
    """Change of a cumulative series from one row to the next.

    The first row has no predecessor and is compared against 0.
    """
    # NOTE(review): abs() is applied to the previous value only, so a downward
    # correction gives a negative increment here, whereas New_Cases/New_Deaths take
    # abs() of the difference itself -- confirm which behaviour is intended
    return cumulative.sub(cumulative.shift().fillna(0).abs())


# calculating incremental values for vaccination doses since the dataset only provides cumulative numbers
daily_vaccinations["New_people_partially_vaccinated"] = daily_vaccinations.groupby(
    "Country_Region"
)["People_partially_vaccinated"].transform(_increase_since_previous_report)

daily_vaccinations["New_people_fully_vaccinated"] = daily_vaccinations.groupby(
    "Country_Region"
)["People_fully_vaccinated"].transform(_increase_since_previous_report)

# getting rid of the global data
daily_vaccinations = daily_vaccinations[
    daily_vaccinations["Country_Region"] != "World"
]

# renaming the columns to match collective_df
daily_vaccinations = daily_vaccinations.rename(
    columns={
        "Country_Region": "Country",
        "People_partially_vaccinated": "Part_vac",
        "People_fully_vaccinated": "Full_vac",
        "New_people_partially_vaccinated": "New_part_vac",
        "New_people_fully_vaccinated": "New_full_vac",
    }
)

# formatting the date string to match collective_df (day/month/year)
daily_vaccinations["Date"] = daily_vaccinations["Date"].dt.strftime("%d/%m/%Y")

# merging the vaccination data with collective_df on same date & country
# rows without vaccination data keep empty vaccination columns
collective_df = collective_df.merge(
    daily_vaccinations, how="left", on=["Date", "Country"]
)

# re-format the date field as month/day/year (need it later for mapping cases on the world map)
# the strings are day/month/year at this point, so the format is spelled out:
# without it, dates whose day is 12 or lower can be read month-first, which
# silently swaps day and month (e.g. 01/04/2020 would become January 4th)
collective_df["Date"] = pd.to_datetime(
    collective_df["Date"], format="%d/%m/%Y"
).dt.strftime("%m/%d/%Y")

# casting the numbers in the dataframe to floats so we can plot them later
# float64 is used on purpose: downcasting gives 32-bit floats, which cannot hold
# counts in the tens of millions exactly and makes the derived totals drift
numeric_columns = [
    "Cases",
    "Deaths",
    "New_Deaths",
    "New_Cases",
    "New_part_vac",
    "New_full_vac",
]
for column in numeric_columns:
    collective_df[column] = pd.to_numeric(collective_df[column]).astype("float64")

In [None]:
# getting a look at the state of the collective df at the end:
# the vaccination columns are merged in and the count columns are cast to floats
collective_df

Unnamed: 0,Date,Country,Cases,Deaths,Latitude,Longitude,New_Cases,New_Deaths,Part_vac,Full_vac,New_part_vac,New_full_vac
0,01/22/2020,Afghanistan,0.00,0.00,33.00,65.00,0.00,0.00,,,,
1,01/22/2020,Albania,0.00,0.00,41.00,20.00,0.00,0.00,,,,
2,01/22/2020,Algeria,0.00,0.00,28.00,3.00,0.00,0.00,,,,
3,01/22/2020,Andorra,0.00,0.00,42.50,1.50,0.00,0.00,,,,
4,01/22/2020,Angola,0.00,0.00,-12.50,18.50,0.00,0.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
132687,11/28/2021,Vietnam,1210340.00,24882.00,16.00,106.00,12936.00,190.00,69747303.00,49021083.00,1923298.00,3900454.00
132688,11/28/2021,West Bank and Gaza,459479.00,4789.00,,,0.00,0.00,0.00,0.00,0.00,0.00
132689,11/28/2021,Yemen,9987.00,1946.00,15.00,48.00,6.00,1.00,537397.00,353822.00,0.00,0.00
132690,11/28/2021,Zambia,210143.00,3667.00,-15.00,30.00,5.00,0.00,806611.00,671006.00,0.00,3573.00



**Plot 1: COVID 19 Heat Map**

Questions Answered: How does the spread of Covid 19 look geographically? In what direction did it spread over time? How have the cases grown? 

1. Simplify: The earth is depicted with a simple graphic and the cases are mapped to a single (central) point in each country. Only mapping the spread of covid and nothing else. 
2. Understand Magnitudes: The range of the colors on the map is set by taking into consideration the natural spread of covid and the number of cases. 
3. Use Color: The heatmap naturally depicts the magnitude of covid cases using a gradient color.
4. Use Structure: The map allows us to use structure to convey the correlation between longitude, latitude and the number of covid cases. The range slider below allows the user to interact with the dataset and move through time. 





In [None]:
# animated density map of the cumulative case count, one frame per Date value
heatmap_options = dict(
    lat="Latitude",
    lon="Longitude",
    z="Cases",
    radius=40,
    zoom=1,
    hover_data=["Country", "Deaths"],
    mapbox_style="carto-positron",
    animation_frame="Date",
    # fixed colour range so that all frames share the same scale
    range_color=[0, 1000000],
    title="The spread of C19",
)
fig = px.density_mapbox(collective_df, **heatmap_options)
# leave room at the top only, for the title
fig.update_layout(margin=dict(r=0, t=30, l=0, b=0))
fig.show()

**Plot 2: Covid-19 Infographic for each country**

Questions Answered: What is the current total number of cases, deaths, vaccinations, etc. for each country in the world? How do they compare to one another? 

1. Simplify: The plot only depicts the significant numbers associated with covid19. 
2. Understand Magnitudes: The numbers depicted in each column are comparable to one another.
3. Use Color: The predefined color palette was used for each attribute, as well as a gradient color scheme to differentiate  values within each column.
4.   Use Structure: The dataset is visualized in an extremely straightforward way using rows and columns. 


In [None]:
# let's add up the number of cases, deaths and vaccinations per country
# drop unnecessary columns
# (columns= is used because the positional axis argument of drop() is no longer
# accepted by recent pandas releases)
totals_per_country = collective_df.drop(
    columns=[
        "Date",
        "Latitude",
        "Longitude",
        "New_Cases",
        "New_Deaths",
        "New_part_vac",
        "New_full_vac",
    ]
)
# since the data in deaths, cases and vaccinations are accumulative .max() will return the latest value
totals_per_country = totals_per_country.groupby("Country").max()

# calculating the mortality rate per 100 cases.
totals_per_country["Mortality Rate (per 100)"] = np.round(
    100 * totals_per_country["Deaths"] / totals_per_country["Cases"], 2
)
# calculating the active cases in each country
# NOTE(review): this is cases minus deaths only; recoveries are not subtracted
# (no recovery data is loaded in this notebook)
totals_per_country["Active Cases"] = (
    totals_per_country["Cases"] - totals_per_country["Deaths"]
)
# countries without vaccination data show 0 instead of an empty cell
totals_per_country = totals_per_country.fillna(0)

# plot all the numbers for each country, largest case count first
# each column is shaded with the colormap reserved for its attribute
column_colormaps = {
    "Cases": cases_color_map,
    "Deaths": deaths_color_map,
    "Part_vac": vaccination_color_map,
    "Full_vac": vaccination_color_map,
    "Active Cases": active_cases_color_map,
    "Mortality Rate (per 100)": mortality_rate_color_map,
}
count_columns = ["Cases", "Deaths", "Part_vac", "Full_vac", "Active Cases"]

styled_totals = totals_per_country.sort_values("Cases", ascending=False).style
for column, colormap in column_colormaps.items():
    styled_totals = styled_totals.background_gradient(cmap=colormap, subset=[column])

# two decimals by default, whole numbers for the count columns;
# the styled table is the last expression of the cell, so it is what gets displayed
styled_totals.format("{:.2f}").format("{:.0f}", subset=count_columns)

Unnamed: 0_level_0,Cases,Deaths,Part_vac,Full_vac,Mortality Rate (per 100),Active Cases
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
US,48229208,776639,231367686,196398948,1.61,47452568
India,34580832,468790,784647922,438546779,1.36,34112040
Brazil,22080906,614278,162436791,128481994,2.78,21466628
United Kingdom,10202370,145218,50917949,46309909,1.42,10057152
Russia,9403480,267527,66201198,56115075,2.84,9135953
Turkey,8748025,76446,56201040,50354168,0.87,8671579
France,7723032,119875,51843873,46883650,1.55,7603157
Iran,6108882,129629,57332853,46328337,2.12,5979253
Germany,5804139,100960,59221785,56911547,1.74,5703179
Argentina,5326448,116529,36527835,29320900,2.19,5209919


In [None]:
# displaying the unstyled per-country totals, including the derived
# mortality rate and active cases columns, in their original (alphabetical) order
totals_per_country

Unnamed: 0_level_0,Cases,Deaths,Part_vac,Full_vac,Mortality Rate (per 100),Active Cases
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,157218.00,7365.00,4285440.00,3454113.00,4.68,149853.00
Albania,199555.00,3089.00,1067769.00,953489.00,1.55,196466.00
Algeria,210152.00,6058.00,6703364.00,5314016.00,2.88,204094.00
Andorra,16712.00,131.00,54999.00,49535.00,0.78,16581.00
Angola,65144.00,1733.00,6266044.00,2760266.00,2.66,63411.00
...,...,...,...,...,...,...
Vietnam,1210340.00,24882.00,69747303.00,49021083.00,2.06,1185458.00
West Bank and Gaza,459479.00,4789.00,0.00,0.00,1.04,454690.00
Yemen,9987.00,1946.00,537397.00,353822.00,19.49,8041.00
Zambia,210143.00,3667.00,806611.00,671006.00,1.75,206476.00


In [None]:
# let's add up the numbers per date to get daily data
# only the daily increment columns are summed: the cumulative, coordinate and text
# columns are not needed here, so they are left out up front instead of being
# summed and dropped afterwards
daily_columns = ["New_Cases", "New_Deaths", "New_part_vac", "New_full_vac"]
totals_per_date = collective_df.groupby("Date")[daily_columns].sum().reset_index()
# the Date strings are month/day/year; spelling the format out rules out any
# day/month guessing while parsing
totals_per_date["Date"] = pd.to_datetime(totals_per_date["Date"], format="%m/%d/%Y")
# the groups come back ordered by the date *string*, so sort chronologically
totals_per_date = totals_per_date.sort_values(by="Date")
# add up the daily incremental numbers to get cumulative data
# (restricted to the numeric columns so the Date column is not accumulated)
cumulative = totals_per_date[daily_columns].cumsum()
# add cumulative data to main dataframe
totals_per_date["Total_Cases"] = cumulative["New_Cases"]
totals_per_date["Total_Deaths"] = cumulative["New_Deaths"]
totals_per_date["Total_Part_Vacs"] = cumulative["New_part_vac"]
totals_per_date["Total_Full_Vacs"] = cumulative["New_full_vac"]
# add active case number (cumulative cases minus cumulative deaths)
totals_per_date["Total_Active"] = (
    totals_per_date["Total_Cases"] - totals_per_date["Total_Deaths"]
)

In [None]:
# displaying the per-date totals: the daily New_* sums next to the running Total_* columns
totals_per_date

Unnamed: 0,Date,New_Cases,New_Deaths,New_part_vac,New_full_vac,Total_Cases,Total_Deaths,Total_Part_Vacs,Total_Full_Vacs,Total_Active
1,2020-01-02,2111.00,46.00,0.00,0.00,2111.00,46.00,0.00,0.00,2065.00
3,2020-01-03,2378.00,54.00,0.00,0.00,4489.00,100.00,0.00,0.00,4389.00
5,2020-01-04,82914.00,6014.00,0.00,0.00,87403.00,6114.00,0.00,0.00,81289.00
7,2020-01-05,88597.00,5483.00,0.00,0.00,176000.00,11597.00,0.00,0.00,164403.00
9,2020-01-06,95705.00,3525.00,0.00,0.00,271705.00,15122.00,0.00,0.00,256583.00
...,...,...,...,...,...,...,...,...,...,...
648,2021-12-07,436692.00,8058.00,374149.00,263464.00,260594992.00,5177200.00,4187183360.00,3304204032.00,255417792.00
650,2021-12-08,710787.00,10675.00,10660362.00,11272986.00,261305776.00,5187875.00,4197843712.00,3315476992.00,256117904.00
652,2021-12-09,368246.00,5522.00,10662654.00,9567662.00,261674016.00,5193397.00,4208506368.00,3325044736.00,256480624.00
654,2021-12-10,433308.00,8136.00,514083.00,499358.00,262107328.00,5201533.00,4209020416.00,3325544192.00,256905792.00


**Plot 3: Plotting the rise in total numbers over time**

Questions Answered: How did the numbers in each case rise over time? Have we flattened the curve? How has the vaccination process progressed over time? 

1. Simplify: Each sub-plot in the graph depicts the growth of a single metric (cases, deaths, vaccinations or active cases).
2. Understand Magnitudes: I decided to map these numbers separately due to the fact that they each convey important information.
3. Use Color: The colors used in each case were pre-defined and reused throughout all the plots. 
4.    Use Structure: I put the plots in a 2x2 matrix for it to have a concise and tidy structure. You can easily compare these numbers with one another. You are also able to select a range by dragging your cursor over the graphic for each of the plots to get more detailed information. 



In [None]:
from plotly.subplots import make_subplots

# building up each plot separately: one bar chart per cumulative series,
# drawn in the colour reserved for that attribute
panel_specs = [
    ("Total_Cases", cases_color_title),
    ("Total_Deaths", deaths_color_title),
    ("Total_Part_Vacs", vaccination_color_title),
    ("Total_Active", active_cases_color_title),
]
fig, fig1, fig2, fig3 = [
    px.bar(
        totals_per_date,
        x="Date",
        y=series,
        color_discrete_sequence=[colour],
    )
    for series, colour in panel_specs
]

# Placing each subplot into the final plot (a 2x2 grid)
fi = make_subplots(
    rows=2,
    cols=2,
    horizontal_spacing=0.1,
    vertical_spacing=0.3,
    subplot_titles=["Cases", "Deaths", "Vaccination Doses", "Active Cases"],
)

# arranging the subplots on the plane, filling the grid row by row;
# only the first trace of each bar chart is copied over
grid_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]
for panel, (grid_row, grid_col) in zip((fig, fig1, fig2, fig3), grid_cells):
    fi.add_trace(panel["data"][0], row=grid_row, col=grid_col)
fi.update_layout(height=600)
fi.show()

**Plot 4: The daily numbers over time for Covid-19 cases, deaths and mortality.  [Interactive]**

Questions Answered: Are people dying more or less of covid? Are more people getting covid daily? Has the mortality rate of covid become smaller over time? What have the waves looked like?

1. Simplify: data of comparable nature are visualized in the same graph.
2. Understand Magnitudes: dates are plotted in the date axis and the numbers are plotted on y. The buttons that interact with the range of time allow for seamless change of range. 
3. Use Color: The colors used in the line plots match with the predefined color scheme. 
4.   Use Structure: The viewer is able to interact with the plot by dragging the range picker at the bottom. They can also click the buttons at the top to view specific time periods.




In [None]:
df = totals_per_date

# one line per daily series, each in its predefined colour
# the figure is started empty so that every series is drawn exactly once
# (seeding it with a line chart of New_Deaths would draw the deaths twice)
fig = go.Figure()
fig.add_scatter(
    x=df["Date"],
    y=df["New_Cases"],
    mode="lines",
    name="Cases",
    line=dict(color=cases_color_title),
)
fig.add_scatter(
    x=df["Date"],
    y=df["New_Deaths"],
    mode="lines",
    name="Deaths",
    line=dict(color=deaths_color_title),
)
# the title names only the series that are actually plotted
fig.update_layout(
    title="Time Series visualization of daily COVID cases and deaths",
    xaxis_title="Date",
    yaxis_title="Daily count",
)

# range slider below the chart plus quick-select buttons for common time windows;
# update_xaxes returns the figure, so as the last expression it is displayed
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=[
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all"),
        ]
    ),
)

In [None]:
# df is another name for totals_per_date, so the new column lands on that frame too
df = totals_per_date
# cumulative deaths per 100 cumulative cases, rounded to two decimals
deaths_per_100_cases = (
    100 * totals_per_date["Total_Deaths"] / totals_per_date["Total_Cases"]
)
df["mortality_rate"] = np.round(deaths_per_100_cases, 2)

fig1 = px.line(
    df,
    x="Date",
    y="mortality_rate",
    title="Time Series visualization of daily COVID daily mortality rate",
)

# quick-select buttons for common time windows
range_buttons = [
    {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
    {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
    {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
    {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
    {"step": "all"},
]

# update_xaxes returns the figure, so as the last expression it is displayed
fig1.update_xaxes(
    rangeslider_visible=True,
    rangeselector={"buttons": range_buttons},
)

**Plot 5: Plotting the correlation between GDP per capita and mortality rate**

Questions Answered: Do countries with higher GDP per capita have a smaller mortality rate? Are wealthier countries dealing with Covid-19 better? 

1. Simplify: I have only plotted mortality rate : GDP per capita and not any other information.
2. Understand Magnitudes: Countries with a higher mortality rate are depicted as larger circles.
3. Use Color: Each country is depicted in a different color for the viewer to tell them apart
4.   Use Structure: The scatter plot is a visual way to structure the correlation of different data.


In [None]:
# reformatting the gdp dataframe so its country column is named like ours
gdp_per_capita = gdp_per_capita.rename(columns={"Country Name": "Country"})
# merging the gdp dataframe to the per-country totals (countries without a GDP
# entry are kept) and shortening the mortality column name for better visualization
totals_with_gdp = pd.merge(
    totals_per_country, gdp_per_capita, on="Country", how="left"
).rename(columns={"Mortality Rate (per 100)": "Mortality"})

# plotting the scatter plot for the 50 rows with the highest GDP value
top_50_by_gdp = totals_with_gdp.sort_values("GDP", ascending=False).iloc[:50, :]
fig = px.scatter(
    top_50_by_gdp,
    x="GDP",
    y="Mortality",
    color="Country",
    # the marker size encodes the same value as the y axis
    size="Mortality",
    height=700,
    text="Country",
    log_x=False,
    log_y=False,
    title="GDP Per Capita to Mortality Ratio",
)
fig.update_traces(textposition="top center")
# the country labels sit next to the markers, so the legend is hidden
fig.update_layout(showlegend=False, xaxis_rangeslider_visible=True)
fig.show()