In [None]:
from __future__ import print_function, division

In [None]:
# Suppress warnings.
# NOTE: "ignore" with no category silences every warning for the rest of the
# session, including deprecation notices raised by the plotting libraries.
import warnings

warnings.simplefilter("ignore")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image

%matplotlib inline

### EDA Books

* [Exploratory Data Analysis](http://www.amazon.com/Exploratory-Data-Analysis-John-Tukey/dp/0201076160) by John Tukey

Other Authors on EDA:

* William Cleveland
* Edward Tufte

In [None]:
# Let's do a histogram with defaults

In [None]:
# Draw the sample from a fixed-seed generator so the histogram shows the same
# data on every run; the trailing ";" hides the (counts, bins, patches) repr.
plt.hist(np.random.RandomState(42).randn(100));

### Better Defaults

In [None]:
import seaborn as sns

# Since seaborn 0.8 importing the library no longer restyles matplotlib;
# apply the seaborn defaults explicitly so later plots pick up the new look.
sns.set()

In [None]:
# Draw the sample from a fixed-seed generator so the histogram shows the same
# data on every run; the trailing ";" hides the (counts, bins, patches) repr.
plt.hist(np.random.RandomState(42).randn(100));

* What do you see different between the two plots?

## Seaborn

Seaborn is a Python visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics. 

Source: [Seaborn Docs](http://stanford.edu/~mwaskom/software/seaborn/)

Seaborn comes with some sample datasets that we will use to explore (https://github.com/mwaskom/seaborn-data)

In [None]:
# Load the Titanic passengers table from seaborn's sample datasets
titanic = sns.load_dataset("titanic")

In [None]:
# The result is our familiar pandas DataFrame;
# info() summarises its columns, dtypes and non-null counts
titanic.info()

In [None]:
# Keep only the passengers whose age is recorded (drop rows where "age" is missing)
titanic = titanic.dropna(subset=["age"])

### Histogram (Distributions)

In [None]:
# We can plot distribution of Age by just using Pandas
# We don't need Seaborn for that
# Plot an histogram of age

In [None]:
# But Seaborn adds some additional ways to look at distributions,
# e.g. a smooth kernel density estimate (KDE) of age
sns.kdeplot(titanic.age)

In [None]:
# Cumulative plot: the same density estimate for age, accumulated (cumulative=True)
sns.kdeplot(titanic.age, cumulative=True)

In [None]:
# Overlay a histogram and a density curve of age in one plot
# NOTE(review): distplot is deprecated in recent seaborn releases (0.11+);
# consider sns.histplot(titanic.age, kde=True) once the target version is confirmed
sns.distplot(titanic.age)

### Box Plots

In [None]:
# Do a Boxplot of "age" with Pandas 
# Your code here

In [None]:
# Box plot "age" by "sex"
# Your code here

In [None]:
# Horizontal box plot of age. Pass the variable by keyword: the meaning of the
# first positional argument differs between seaborn versions (x vs. data).
sns.boxplot(x=titanic.age)

In [None]:
# Box plot of age (x axis) for each sex (y axis). x/y are passed by keyword
# because newer seaborn releases reject more than one positional argument.
sns.boxplot(x=titanic.age, y=titanic.sex)

### Violin Plot

In [None]:
# Violin plot of age. Pass the variable by keyword: the meaning of the first
# positional argument differs between seaborn versions (x vs. data).
sns.violinplot(x=titanic.age)

In [None]:
# Violin plot of age (x axis) for each sex (y axis). x/y are passed by keyword
# because newer seaborn releases reject more than one positional argument.
sns.violinplot(x=titanic.age, y=titanic.sex)

### Plot by Group

In [None]:
# Pandas Histogram of "age"
# Your code here

In [None]:
# Pandas histogram of "age" by "sex"

In [None]:
# Pandas histogram of "age" by "sex" and "survived"

In [None]:
# Distribution of age split by sex (rows) and survived (columns):
# build the facet grid, then draw a histogram of "age" in every panel
g = sns.FacetGrid(titanic, row='sex', col='survived', sharex=True, sharey=True)
g.map(plt.hist, "age")

In [None]:
# Age against the "survived" flag, one panel per sex (rows) and passenger
# class (columns). The regression fit is switched off and the x values are
# jittered, so each panel is a plain scatter of individual passengers.
grid_plot = sns.FacetGrid(titanic, row="sex", col="pclass")
grid_plot.map(
    sns.regplot,
    "survived",
    "age",
    color=".3",
    fit_reg=False,
    x_jitter=0.1,
)

### Let's look at a IRIS Dataset

In [None]:
# Load the iris table from seaborn's sample datasets
iris = sns.load_dataset("iris")

In [None]:
# Peek at the first few rows to see which columns are available
iris.head()

### Scatter Matrix

In [None]:
# Do a Scatter Plot with Pandas on two columns - "petal_length", "petal_width"

In [None]:
# Look at pair wise comparison (scatter matrix) between all numerical variables

In [None]:
# Blow it up a little bit with a larger figure.
# The top-level pd.scatter_matrix alias was removed from pandas; the function
# lives in pandas.plotting. The trailing ";" hides the array-of-axes repr.
pd.plotting.scatter_matrix(iris, figsize=(12, 8));

In [None]:
# Seaborn does this pairwise comparison better
sns.pairplot(iris)

In [None]:
# Same pairwise grid, with the points coloured by the "species" column
sns.pairplot(iris, hue="species")

### Joint Plot

* Multiple plots at once

In [None]:
# Scatter of petal width vs. petal length with a regression fit, plus the
# marginal distribution of each variable on the sides.
# "reg" is the documented kind name ("regplot" was only accepted by old
# prefix matching), and x/y are passed by keyword as newer seaborn requires.
sns.jointplot(x="petal_width", y="petal_length", kind="reg", data=iris)