# Guided Exercise Week 3: EDA, Visualization

## Data Exploration and Visualization with the Iris Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the data.
# `names=` supplies the column labels; because no `header=` is given alongside it,
# pandas reads the first line of the file as data rather than as a header.
# NOTE(review): assumes iris.csv has no header row -- confirm against the file.
iris = pd.read_csv(
    'iris.csv',
    names=[
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "species",
    ],
)
iris.head()  # preview the first 5 rows of the data set

In [None]:
# Distinct species labels found in the data, in order of first appearance
all_species = iris["species"].unique()
all_species

In [None]:
# Number of rows recorded for each species
iris.species.value_counts()

In [None]:
# Count the missing (null) entries in each column of the dataset
iris.isna().sum()

In [None]:
# Statistical summary of the dataset
iris.describe()

### Pandas wraps `matplotlib`, so DataFrames and Series can be plotted natively.

[Visualization in pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html)

In [None]:
# Box plot of the attributes, pooled across all species in the dataset
iris.plot.box()
plt.show()

In [None]:
# Histograms of the attribute distributions over the whole dataset.
# Note that the petal measurements seem to fall into groups.
iris.hist()
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots give a visual feel for the correlation between attributes
scatter_matrix(iris)
plt.show()

## Another visualization module you might want to consider is [Seaborn](https://seaborn.pydata.org/).  

Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.

In [None]:
# We'll use seaborn's FacetGrid to color the scatterplot by species.
# Note that versicolor and virginica show a lot of overlap with these two features,
# but setosa stands out.
(
    sns.FacetGrid(iris, hue="species", height=5)
    .map(plt.scatter, "sepal_length", "sepal_width")
    .add_legend()
)
plt.show()  # render the figure instead of echoing the FacetGrid repr as cell output

In [None]:
# The "Seaborn" plotting package provides some nicer visuals.
# The pair plot reuses the `iris` frame already loaded from iris.csv: it carries the
# measurement columns and the "species" column that pairplot needs, so there is no
# reason to fetch a second copy with sns.load_dataset (which needs network access)
# and rebind `iris` halfway through the notebook.
sns.set(style="ticks")
sns.pairplot(iris, hue="species", palette="bright")
plt.show()

In [None]:
# Reshape to long form: one row per (species, measurement, value), so that every
# attribute can be drawn on a shared categorical axis.
piris = iris.melt(id_vars="species", var_name="measurement")

# Bar plot of each measurement, split by species
sns.catplot(x="measurement", y="value", hue="species", data=piris, height=7, kind="bar", palette="bright")
plt.show()

# Preview the long-form frame as the cell's last expression so it gets the rich
# notebook table rendering rather than plain print() text.
piris.head()

We've looked at box plots of the attributes, but now that we have decided that petal length and petal width are the most informative, let's revisit them with box and/or violin plots.

In [None]:
# We will plot length and width in two side-by-side subplots.
# A violin plot is more informative than a box plot for this purpose:
# denser regions of the data are fatter, and sparser regions thinner.
#
# Each plot is bound to its axes explicitly with `ax=` rather than relying on the
# implicit "current axes". The former `size=10` argument is dropped: `size` is not
# a violinplot parameter, so it either had no effect or was rejected, depending on
# the seaborn version. The figure size is set through `figsize` instead.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sns.violinplot(x="species", y="petal_length", data=iris, ax=ax1)
sns.violinplot(x="species", y="petal_width", data=iris, ax=ax2)
fig.tight_layout()  # keep the second panel's y-label clear of the first panel
plt.show()

In [None]:
# Another multivariate visualization technique pandas has is parallel_coordinates.
# Parallel coordinates plots each feature on a separate column and then draws lines
# connecting the features for each data sample.
from pandas.plotting import parallel_coordinates

parallel_coordinates(iris, "species")
plt.show()  # render the figure instead of echoing the Axes repr as cell output

In [None]:
# The RadViz plot provides a radial depiction of the attributes
from pandas.plotting import radviz

radviz(iris, "species")
plt.show()  # render the figure instead of echoing the Axes repr as cell output