# Lab | Matplotlib & Seaborn

#### Import all the libraries that are necessary.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# Challenge 1

#### Data

In [None]:
# Sample data shared by the Challenge 1 plots:
# x holds the integers 0..99, y is the linear series 2x, z is the quadratic series x^2.
x = np.arange(100)
y = 2 * x
z = np.square(x)


#### Plot (x,y) and (x,z) on the axes.
There are 2 ways of doing this. Do it both ways.
Hint: Check out the nrows, ncols, and index arguments of subplots.

Also, play around with the linewidth and style. Use the ones you're most happy with.

In [None]:
# OPTION 1
# Create a figure of a fixed size
plt.figure(figsize=(10, 4))

# First plot: the linear series y = 2x (dashed so it stays distinguishable)
plt.plot(x, y, label='First Plot', linewidth=2, linestyle='--')

# Second plot: the quadratic series z = x^2
plt.plot(x, z, label='Second Plot', linewidth=2, linestyle='-')

# Show the plot
plt.title("First Plot vs Second Plot")
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()


#### Use plt.subplots(nrows=1, ncols=2) to create the plot below

In [None]:
# OPTION 2

# One row with two side-by-side axes (default figure size)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

# Each panel is described as (axes, y-values, title); both share the same axis labels
panels = (
    (ax1, x*2, 'FIRST PLOT'),
    (ax2, x**2, 'SECOND PLOT'),
)
for axis, values, heading in panels:
    axis.plot(x, values)
    axis.set_title(heading)
    axis.set_xlabel('x label')
    axis.set_ylabel('y label')

# Show the plot
plt.show()

#### Augmenting your previous code, resize your previous plot.
Hint: Add the figsize argument in plt.subplots()

In [None]:
# ENHANCED VERSION to see more matplotlib features

# Variables (shared styling for both panels)
fs = 16 # fontsize
fw = 700 # fontweight
lw = 3 # linewidth
y_rot = 0 # ylabel rotation

# Create a figure of a fixed size and axes
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(20, 4))

# Left panel: linear series. The line width comes from the `lw` variable above
# so that changing it in one place restyles both panels.
ax1.plot(x, x*2, lw=lw)
ax1.set_title('FIRST PLOT', fontsize=fs, fontweight=fw)
ax1.set_xlabel('x label', fontsize=fs, fontweight=fw)
ax1.set_ylabel('y label', fontsize=fs, fontweight=fw, rotation=y_rot)

# Right panel: quadratic series
ax2.plot(x, x**2, lw=lw)
ax2.set_title('SECOND PLOT', fontsize=fs, fontweight=fw)
ax2.set_xlabel('x label', fontsize=fs, fontweight=fw)
ax2.set_ylabel('y label', fontsize=fs, fontweight=fw, rotation=y_rot)

# Show the plot
plt.show()



#### Plot both y=x^2 and y=exp(x) in the same plot using normal and logarithmic scale.
Hint: Use set_xscale and set_yscale

In [None]:
# Create a figure of a fixed size and axes
fig, axes = plt.subplots(1, 2, figsize=(20, 5))

# NOTE: the power operator in Python is `**`; `x^2` is a bitwise XOR and
# would plot the wrong curve.

# First plot: y = x^2 and y = exp(x) on a linear y-axis
axes[0].plot(x, x**2, x, np.exp(x))
axes[0].set_title("Normal scale")

# Second plot: the same two curves on a logarithmic y-axis
axes[1].plot(x, x**2, x, np.exp(x))
axes[1].set_yscale("log")
axes[1].set_title("Logarithmic scale")

# Show the plot
plt.show()







In [None]:
# ALTERNATIVE WITH LEGEND

fig, axes = plt.subplots(1, 2, figsize=(20, 5))

# (y-axis scale, panel title) for the left and right panel
panel_settings = (
    ('linear', "Normal scale"),
    ('log', "Logarithmic scale"),
)

for ax, (y_scale, panel_title) in zip(axes, panel_settings):
    # One plot call per curve so each gets its own legend entry; a single
    # plot(x, a, x, b, label=...) call would give both lines the same label.
    # `**` is the power operator (`^` is bitwise XOR).
    ax.plot(x, x**2, label='y = x^2')
    ax.plot(x, np.exp(x), label='y = exp(x)')
    ax.set_yscale(y_scale)
    ax.set_title(panel_title)
    ax.set_xlabel('x label')
    ax.set_ylabel('y label')
    # plt.legend() would only annotate the last axes, so add the legend per panel
    ax.legend()

# Show the plot
plt.show()

# Challenge 2

Import the Fitbit2.csv file and name your dataset fitbit. Download the data from [here](https://drive.google.com/file/d/17TW-w-izKuR7bLoa0Mnyp9YN9zEQNXq-/view?usp=sharing) and place it in the data folder.

In [None]:
# Load the Fitbit export from the local data folder and preview the first rows
fitbit_csv_path = 'data/Fitbit2.csv'
fitbit = pd.read_csv(fitbit_csv_path)
fitbit.head()

#### From the Fitbit data, we want to visually understand:

How does the typical number of steps change by month? Use the appropriate visualization to show the median steps by month.
Is the Fitbit user more active on weekends or on workdays?
All plots must be in the same Jupyter notebook cell.

Hints:

- Use the Months_encoded and Work_or_Weekend columns.
- Use the matplotlib.pyplot object-oriented API.
- Set your size figure to 12,4
- Explore plt.sca
- Explore plt.xticks
- Save your figures

In [None]:

# MEDIAN STEPS BY MONTH_ENCODED

# Find the median steps for each month
median_steps_by_month = fitbit.groupby('Months_encoded')['Steps'].median()

# MEDIAN STEPS BY WORK_OR_WEEKEND

# Find the median steps for workdays and weekends
median_steps_by_day_type = fitbit.groupby('Work_or_Weekend')['Steps'].median()

# NOTE(review): the tick labels below assume Work_or_Weekend is encoded as
# 0 = weekend and 1 = workday -- confirm against the dataset. Any other code
# is displayed as-is.
day_type_labels = {0: 'Weekend', 1: 'Workday'}

# Create a figure of a fixed size with one axes per question.
# `fitbit` itself is left untouched so re-running this cell is idempotent.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Set the current axes instance to ax0 - now plt functions will affect ax0
plt.sca(ax0)
median_steps_by_month.plot.bar(ax=ax0)

# Add labels, title, etc
plt.title('MEDIAN STEPS BY MONTH')
plt.xlabel('Month (encoded)')
plt.ylabel('Median steps')
plt.xticks(rotation=0)

# Set the current axes instance to ax1 - now plt functions will affect ax1
plt.sca(ax1)
median_steps_by_day_type.plot.bar(ax=ax1)

# Add labels, title, etc
plt.title('MEDIAN STEPS: WORKDAYS VS WEEKEND')
plt.xlabel('Day type')
plt.ylabel('Median steps')
# pandas bar plots place the bars at positions 0..n-1, one per index value
plt.xticks(
    ticks=list(range(len(median_steps_by_day_type))),
    labels=[day_type_labels.get(code, str(code)) for code in median_steps_by_day_type.index],
    rotation=0,
)

# Save the figure (written to the notebook's working directory)
fig.tight_layout()
fig.savefig('fitbit_median_steps.png')

# Show the plot
plt.show()


#### Write a loop to plot 3 scatter plots of the following features:

Minutes Lightly Active vs Steps
Minutes Very Active vs Steps
Minutes Sedentary vs Steps

In [None]:
# ALTERNATIVE 1
# Create a df with the columns we are interested in
activity_features = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']
fitbitnew = fitbit[activity_features + ['Steps']]

# Create a figure of a fixed size and axes (one axes per feature)
fig, axes = plt.subplots(1, 3, figsize=(19, 6))

# Iterate to draw each scatter plot: feature on the x-axis, Steps on the y-axis
for ax, feature in zip(axes, activity_features):
    ax.scatter(fitbitnew[feature], fitbitnew['Steps'])
    ax.set_title(feature + ' vs Steps')
    ax.set_xlabel(feature)
    ax.set_ylabel('Steps')

plt.show()

In [None]:
# ALTERNATIVE 2 - same three scatter plots, drawn with the pandas plotting API
# Create a df with the columns we are interested in
activity_columns = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']
fitbit_activity = fitbit[activity_columns + ['Steps']]

# Create a figure of a fixed size and axes
fig, axes = plt.subplots(1, 3, figsize=(19, 6))

# Iterate to draw each scatter plot; pandas labels both axes from the column names.
# `ax` is passed explicitly because DataFrame plots otherwise open a new figure.
for ax, column in zip(axes, activity_columns):
    fitbit_activity.plot.scatter(x=column, y='Steps', ax=ax)

plt.show()


# Challenge 3

#### Open the titanic file. Name your dataset titanic.

Download the dataset from [here](https://drive.google.com/file/d/1jkkOcWm9aEF8gb0r2SsarF5Qdoaaw1do/view?usp=sharing) and place it in the data folder

In [None]:
# Load the Titanic passenger data from the local data folder and preview the first 20 rows
titanic_csv_path = 'data/titanic.csv'
titanic = pd.read_csv(titanic_csv_path)
titanic.head(n=20)

#### Explore the titanic dataset using Pandas dtypes.

In [None]:
# Show the dtype pandas assigned to every column of the titanic frame
titanic.dtypes

#### What are your numerical variables? What are your categorical variables?
Hint: Use Pandas select_dtypes

In [None]:
# Numerical variables
# 'number' selects every numeric dtype (integers and floats); filtering on
# 'float64' alone would silently drop the integer columns.
titanic.select_dtypes(include='number')


In [None]:
# Categorical variables
# Columns read from a CSV are never given the pandas 'category' dtype, so
# include='category' would return an empty frame. Selecting everything that is
# not numeric returns the text-valued columns instead.
titanic.select_dtypes(exclude='number')

#### Set the plot style to classic and the figure size to (12,6).
Hint: To set the style you can use matplotlib.pyplot functions or seaborn 

In [None]:
# Plot style change
plt.style.use('classic')

# Figure size change
# Set the default size through rcParams so every following figure uses it;
# plt.figure(figsize=...) would only create one stray empty figure in this cell.
# This must come after plt.style.use, which applies the style's own figure size.
plt.rcParams['figure.figsize'] = (12, 6)

#### Use the right visualization to show the distribution of the column Age.

In [None]:
# Pandas plots are based on matplotlib

# Plot the histogram of column Age (pandas returns the matplotlib Axes it drew on)
ax = titanic["Age"].plot.hist(bins=50)

# Set titles, labels, etc
ax.set_title("Distribution of passenger age")
ax.set_xlabel("Age")
ax.set_ylabel("Number of passengers")

# Show the plot
plt.show()


#### Use subplots and plot the distribution of the Age variable with bins equal to 10, 20 and 50.

In [None]:
# Variables
bin_counts = (10, 20, 50)
# Drop missing ages up front so the histograms are built from real values only
ages = titanic['Age'].dropna()

# Create a figure of a fixed size and axes (one axes per bin count)
fig, axes = plt.subplots(1, len(bin_counts), figsize=(19, 6))

# Plot the histogram using a different number of bins
for ax, n_bins in zip(axes, bin_counts):
    ax.hist(ages, bins=n_bins)
    ax.set_title('Age distribution (' + str(n_bins) + ' bins)')
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of passengers')

# Show the plot
plt.show()

#### How does the bin size affect your plot? Comment.

~~~~
Using a low number of bins (wider bins) reduces noise on the distribution estimation while using a 
high number of bins (narrower bins) gives greater precision to the distribution estimation (and more noise). 
~~~~

#### Use seaborn to show the distribution of the column Age.

In [None]:
# Seaborn is a Python library based on matplotlib

# Plot of the age histogram using seaborn.
# histplot replaces the deprecated distplot; its defaults (counts, no KDE curve)
# match the previous distplot(hist=True, kde=False) call.
ax = sns.histplot(data=titanic, x='Age', bins=50)
ax.set_title('Distribution of passenger age')
ax.set_ylabel('Number of passengers')

# Show the plot
plt.show()


#### Use the right plot to visualize the column Gender. There are 2 ways of doing it. Do it both ways.
Hint: Use matplotlib and seaborn

In [None]:
# OPTION 1 - Matplotlib

# Count passengers per gender, then draw the counts as bars with horizontal tick labels
gender_counts = titanic['Gender'].value_counts()
gender_ax = gender_counts.plot.bar(rot=0)

# Title and axis labels, set on the Axes returned by pandas
gender_ax.set_title("Titanic Passengers")
gender_ax.set_xlabel("Gender")
gender_ax.set_ylabel("Number of passengers")

# Show the plot
plt.show()

In [None]:
# OPTION 2 - Seaborn

# Seaborn Countplot
# Name the column through x= and pass the frame as data=; recent seaborn
# releases treat the first positional argument as `data`, so passing the
# Series positionally no longer plots the per-gender counts.
ax = sns.countplot(x='Gender', data=titanic)
ax.set_title('Titanic Passengers')
ax.set_ylabel('Number of passengers')

# Show the plot
plt.show()

#### Use the right plot to visualize the column Pclass.

In [None]:
# Bar plot of Pclass
# value_counts() orders by frequency; sort by the class value instead so the
# bars appear in class order along the x-axis.
titanic['Pclass'].value_counts().sort_index().plot.bar()

# Add labels to the plot and change xticks rotation
plt.title("Passenger Class")
plt.xlabel("Type of Class")
plt.ylabel("Number of passengers")
plt.xticks(rotation = 0)

# Show the plot
plt.show()

#### We would like to have in one single plot the summary statistics of the feature Age. What kind of plot would you use?

In [None]:
# Box plot: a single figure summarising the Age column
# (median, quartiles, whiskers and outliers)
summary_column = "Age"
titanic.boxplot(column=summary_column)

# Render the figure
plt.show()

#### What does the last plot tell you about the feature Age? Comment.

~~~~
The statistics that you can get from the boxplot are the minimum, first quartile, median, 
third quartile, and maximum.

The red line shows us the median of Age. 
The blue box shows us the interquartile range (from Q1 to Q3).
It also shows us the outliers, which are out of the maximum and minimum range (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
~~~~

#### Now in addition to the summary statistics, we want to have in the SAME plot the distribution of Age. What kind of plot would you use?

In [None]:
# Set figure size
fig, ax = plt.subplots(figsize=(30,10))

# Violin plot
# The column is passed as x= because recent seaborn releases no longer accept
# it as a positional argument; ax= draws on the figure created above.
sns.violinplot(x="Age", data=titanic, ax=ax)

# Show the plot
plt.show()

#### What additional information does the last plot provide about the feature Age? Comment.

~~~~
This plot is a combination of a boxplot and a density plot. The violin plot features a kernel density estimation 
of the underlying distribution of the data.

The black central part of the plot is the same as a boxplot and the white dot is the median.
The blue part is the distribution of the data.
~~~~

#### We suspect that there is a linear relationship between Fare and Age. Use the right plot to show the relationship between these 2 features. There are 2 ways, please do it both ways.
Hint: One of the ways involves using Seaborn.

In [None]:
# OPTION 1

# Scatter plot (seaborn returns the matplotlib Axes it drew on)
ax = sns.scatterplot(x="Age", y="Fare", data=titanic)

# Add labels
ax.set_title("Fare vs Age")
ax.set_xlabel("Age")
ax.set_ylabel("Fare")

# Show the plot
plt.show()

In [None]:
# OPTION 2

# Jointplot: Age/Fare scatter in the centre with each variable's
# distribution drawn along the margins
sns.jointplot(data=titanic, x="Age", y="Fare")

# Render the figure
plt.show()

#### Using Seaborn plot the correlation matrix.

In [None]:
# Set figure size
plt.figure(figsize=(11,10))

# Correlation matrix
# Restrict to the numeric columns first: pandas >= 2.0 raises when corr() is
# called on a frame that still contains text columns.
correlation_matrix = titanic.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True)

# Add title
plt.title('Titanic Correlation Heatmap')

# Show plot
plt.show()


In [None]:
# Grid of pairwise plots for the columns of the titanic frame
sns.pairplot(data=titanic)

#### What are the most correlated features? Comment.

~~~~~
The most correlated features are Parch and SibSp.
~~~~~

#### Use the right plot to display the summary statistics of Age as a function of Pclass.

In [None]:
# Boxplot: one Age box per passenger class
sns.boxplot(data=titanic, x="Pclass", y="Age")

# Render the figure
plt.show()

#### Use seaborn to plot the distribution of Age based on Gender.
Hint: Use Facetgrid

In [None]:
# Create a grid with FacetGrid (one column of the grid per Gender value).
# The grid gets its own name: binding it to `x` would overwrite the numpy
# array `x` defined in Challenge 1 and break those cells if they are re-run.
age_by_gender_grid = sns.FacetGrid(titanic, col="Gender")

# Draw a plot on each facet: a histogram of Age for that gender
age_by_gender_grid = age_by_gender_grid.map(plt.hist, 'Age')

# Show the plot
plt.show()