# Advanced Plotting in Seaborn and How to Use It in EDA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the per-country COVID totals CSV straight from the book's GitHub repository.
COVID_URL = (
    'https://raw.githubusercontent.com/PacktPublishing/'
    'Python-Data-Cleaning-Cookbook/master/Chapter05/data/covidtotals.csv'
)
covid = pd.read_csv(COVID_URL)

### Initial inspection

`covid` is a DataFrame with 209 rows. Each row represents a location (country). It contains the following columns:

- iso_code - a unique three-letter code identifying each country
- lastdate - the date on which the report was last updated
- location - full name of the country
- total_cases - the cumulative number of confirmed COVID-19 cases in that location
- total_deaths - the cumulative number of confirmed COVID-19 deaths in that location
- total_cases_pm - the cumulative number of confirmed COVID-19 cases per million in that location
- total_deaths_pm - the cumulative number of confirmed COVID-19 deaths per million in that location
- population - population size
- pop_density - number of people per square kilometer
- median_age - median age of the population 
- gdp_per_capita - The GDP per capita (economic indicator of wealth per person)
- hosp_beds - The number of hospital beds per 1,000 people (an indicator of healthcare capacity)
- region - broader region 

### Preprocessing

In [None]:
# Check for na values


In [None]:
# Summarize number of na's by column
# isna() flags each missing cell as True; summing down each column counts them.
nas = covid.isna().sum()
nas

In [None]:
# drop columns with na values


In [None]:
# (rows, columns) of `covid` at this point in the notebook
covid.shape

In [None]:
# drop rows with na values


In [None]:
# (rows, columns) of `covid` at this point in the notebook
covid.shape

In [None]:
# check if there are any duplicate rows


In [None]:
# check the data types to make sure they are appropriate


In [None]:
# read_csv was called without parse_dates, so lastdate has not been parsed as a
# date yet; convert it to a datetime column, then re-check every column's dtype.
covid['lastdate'] = pd.to_datetime(covid['lastdate'])
covid.dtypes

### Summary statistics and Exploration

In [None]:
# begin exploring the dataset
# describe() returns count, mean, std, min, quartiles and max per numeric column.
covid_desc = covid.describe()
covid_desc

In [None]:
# What is the average death rate by region?


In [None]:
# Which country has the highest death counts


In [None]:
# Which country has the highest death rate


### Visualization

In [None]:
# heatmaps to look at correlations


Median age and GDP per capita are both correlated with deaths per million, which is interesting and worth a closer look.

In [None]:
# Let's look at the relationship with gdp


In [None]:
# Where would the regression line be?
# regplot draws the scatter plus a fitted linear regression line.
sns.regplot(data = covid, x = 'gdp_per_capita',y = 'total_deaths_pm');

In [None]:
# Look at the relationships and distributions together
# jointplot combines the bivariate plot with each variable's marginal distribution.
sns.jointplot(data = covid, x = 'gdp_per_capita',y = 'total_deaths_pm');

In [None]:
# boxplots to look at deaths by region
plt.figure(figsize=(4,6))
# One horizontal box per region, drawn on the figure created above.
sns.boxplot(data = covid, y = 'region',x='total_deaths_pm')
plt.show()

There are a few regions we might want to zoom in on...

In [None]:
# TODO: replace the `...` placeholder with the subset of `covid` rows for the
# regions to zoom in on. As written, `...` is Python's Ellipsis object, so the
# `.head()` call below raises AttributeError until the filter is filled in.
high_covid_regions = ...
high_covid_regions.head()

In [None]:
# Let's redo the boxplot with these points
# Same boxplot as before, restricted to the rows kept in high_covid_regions.
sns.boxplot(data = high_covid_regions, y = 'region',x='total_deaths_pm');

In [None]:
# To get more information about the distributions
# A violin plot adds a density estimate on top of the boxplot-style summary.
sns.violinplot(data = high_covid_regions, y = 'region',x='total_deaths_pm');

In [None]:
# Show all the points in the distributions with a swarm plot
sns.swarmplot(data = high_covid_regions, y = 'region',x='total_deaths_pm');

In [None]:
# Notice a swarm plot is different from a scatter plot
# scatterplot places every point directly on its region's line, so points can overlap.
sns.scatterplot(data = high_covid_regions, y = 'region',x='total_deaths_pm');

In [None]:
# Another way to compare categories (best if this is an ordinal variable)
# pointplot shows one aggregated estimate per category and joins them with a line,
# which is why it reads best when the categories have a natural order.
sns.pointplot(data = high_covid_regions, x = 'region',y='total_deaths_pm');

In [None]:
# Let's look at that gdp relationship again
# Axis limits are passed to jointplot itself so they are applied to the joint
# axes explicitly, instead of relying on which axes pyplot considers "current".
sns.jointplot(
    data = high_covid_regions,
    x = 'gdp_per_capita',
    y = 'total_deaths_pm',
    hue = 'region',
    xlim = (0, 120000),
    ylim = (-20, 1000),
);

South America's distribution is almost bimodal.

In [None]:
# Focus on these distributions more (messy!)
# One overlaid density curve per region; data is passed by keyword because older
# seaborn releases treat the first positional argument as x rather than data.
sns.kdeplot(data = high_covid_regions, x = 'total_deaths_pm', hue = 'region');

In [None]:
# Better
# Same densities with the area under each curve filled; data is passed by keyword
# because older seaborn releases treat the first positional argument as x.
sns.kdeplot(data = high_covid_regions, x = 'total_deaths_pm', hue = 'region', fill=True);

In [None]:
# Even better
# One panel per region (col), each coloured by region (hue), with a filled KDE
# of total_deaths_pm mapped onto every panel. map() returns the grid itself.
g = sns.FacetGrid(
    data=high_covid_regions,
    col='region',
    hue='region',
).map(sns.kdeplot, 'total_deaths_pm', fill=True)
plt.tight_layout();

In [None]:
# Keep only the rows whose region is North America, then display them.
na = covid[covid['region'] == 'North America']
na

In [None]:
# A teaser for interactive plots

# plotly is only used by this cell.
import plotly.express as px

# Interactive scatter of deaths per million against GDP per capita; hovering
# over a point shows the value of the 'location' column.
fig = px.scatter(
    covid,
    x='gdp_per_capita',
    y='total_deaths_pm',
    hover_name='location',
)

# Set figure size (update_layout returns the figure, so the cell displays it)
fig.update_layout(
    width=600,
    height=600,
)