# Lab | Matplotlib & Seaborn

#### Import all the libraries that are necessary.

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Challenge 1

#### Data

In [None]:
# Sample data: the integers 0..99, their doubles and their squares
x = np.arange(100)
y = 2 * x
z = np.square(x)

#### Plot (x,y) and (x,z) on the axes.
There are 2 ways of doing this. Do in both ways.
Hint: Check out the nrows and ncols arguments of plt.subplots (or the nrows, ncols, and index arguments of plt.subplot).

Also, play around with the linewidth and style. Use the ones you're most happy with.

In [None]:
# OPTION 1 - both curves share a single set of axes
# Figure of fixed size holding one axes
fig, ax = plt.subplots(figsize=(10, 5))

# Draw y (purple) and then z (green) against x with the same line width
for values, colour in ((y, 'purple'), (z, 'green')):
    ax.plot(x, values, color=colour, linewidth=3)

# Render the figure
plt.show()

#### Use plt.subplots(nrows=1, ncols=2) to create the plot below

In [None]:
# OPTION 2 - one panel per curve, side by side

# 1x2 grid of axes on a figure of fixed size
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Left panel shows y against x, right panel shows z against x
ax1.plot(x, y)
ax2.plot(x, z)

# Render the figure
plt.show()

#### Augmenting your previous code, resize your previous plot.
Hint: Add the figsize argument in plt.subplots()

In [None]:
# ENHANCED VERSION showing more matplotlib features

# Styling shared by both panels
fs = 16     # title font size
fw = 700    # title font weight
lw = 10     # line width
y_rot = 10  # rotation applied to the y-axis label

# 1x2 grid of axes on a figure of fixed size
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Each panel gets one curve plus a title and axis labels with identical styling
for panel, values, heading in ((ax1, y, 'PLOT_1'), (ax2, z, 'PLOT_2')):
    panel.plot(x, values, lw=lw)
    panel.set_title(heading, fontweight=fw, fontsize=fs)
    panel.set_xlabel('x label')
    panel.set_ylabel('y label', rotation=y_rot)

# Render the figure
plt.show()

#### Plot both y=x^2 and y=exp(x) in the same plot using normal and logarithmic scale.
Hint: Use set_xscale and set_yscale

In [None]:
# Single figure of fixed size; both of its axes are switched to log scale
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_yscale('log')
ax.set_xscale('log')

# x**2 and exp(x) are both drawn on the same log-log axes
ax.plot(x, x**2)
ax.plot(x, np.exp(x))

# Render the figure
plt.show()
#Paolo: here the idea was to plot both functions on logscale
# and on normal scale, check cell below

In [None]:
#Paolo: alternative to previous exercise
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Both panels show the same two curves: x**2 in red and exp(x) in green
for panel, heading in zip(ax, ("Normal scale", "Logarithmic scale (y)")):
    panel.plot(x, x**2, 'r')
    panel.plot(x, np.exp(x), 'g')
    panel.set_title(heading)

# Only the right-hand panel uses a logarithmic y-axis
ax[1].set_yscale("log")

# Render the figure
plt.show()

In [None]:
# ALTERNATIVE WITH LEGEND
#Paolo: ok but same comment as above, check cell below
# (the legend flagged as missing in review is now drawn on both panels)

# 1x2 grid of axes on a figure of fixed size
fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: x**2 on linear axes
ax1.plot(x, x**2, label='x**2')
ax1.set_title('normal_scale')
ax1.legend()  # needs the label passed to plot() above

# Right panel: exp(x) with both axes on a logarithmic scale
ax2.plot(x, np.exp(x), label='exp(x)')
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_title('logarithmic_scale')
ax2.legend()

# Render the figure
plt.show()


In [None]:
# Paolo ALTERNATIVE WITH LEGEND
# 1x2 grid of axes on a figure of fixed size
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Same two labelled curves on every panel; only the title differs here
for panel, heading in zip(ax, ("Normal scale", "Logarithmic scale (y)")):
    panel.plot(x, x**2, 'r', label='x**2')
    panel.plot(x, np.exp(x), 'g', label='exp(x)')
    panel.set_title(heading)
    panel.legend()

# Only the right-hand panel uses a logarithmic y-axis
ax[1].set_yscale("log")

# Render the figure
plt.show()

# Challenge 2

Import the Fitbit2.csv file and name your dataset fitbit. Download the data from [here](https://drive.google.com/file/d/17TW-w-izKuR7bLoa0Mnyp9YN9zEQNXq-/view?usp=sharing) and place it in the data folder.

In [None]:
# Load the Fitbit csv from the local data folder and preview its first rows
data = pd.read_csv(filepath_or_buffer='data/Fitbit2.csv')
data.head(n=5)

#### From the Fitbit data, we want to visually understand:

How the average number of steps changes by month. Use the appropriate visualization to show the median steps by month.
Is Fitbitter more active on weekend or workdays?
All plots must be in the same jupyter notebook cell.

Hints:

- Use Months_encoded and Work_or_Weekend columns.
- Use matplotlib.pyplot object oriented API.
- Set your size figure to 12,4
- Explore plt.sca
- Explore plt.xticks
- Save your figures

In [None]:
# MEDIAN STEPS BY MONTH_ENCODED AND BY WORK_OR_WEEKEND (one figure, two panels)


def save_panel(figure, panel, filename, pad=0.1):
    """Save only the part of `figure` occupied by `panel` to `filename`.

    The crop box is the panel's tight bounding box (the axes together with
    its tick labels, axis labels and title), converted from pixels to inches
    and widened by `pad` inches on every side.
    """
    renderer = figure.canvas.get_renderer()
    extent = panel.get_tightbbox(renderer).transformed(figure.dpi_scale_trans.inverted())
    figure.savefig(filename, bbox_inches=extent.padded(pad))


# Median steps for each encoded month
per_month = data.groupby('Months_encoded')['Steps'].median().reset_index()

# Median steps for each Work_or_Weekend value
per_week = data.groupby('Work_or_Weekend')['Steps'].median().reset_index()

# Create a figure of a fixed size with two side-by-side axes
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: line plot of the median steps per month
ax[0].plot(per_month['Months_encoded'], per_month['Steps'])
ax[0].set_title('Steps Per Month')
ax[0].set_xlabel('Months')
ax[0].set_ylabel('Median_Steps')

# Right panel: bar plot of the median steps per Work_or_Weekend value
ax[1].bar(per_week['Work_or_Weekend'], per_week['Steps'], color='green')
ax[1].set_title('Steps Per Week or Weekend')
ax[1].set_xlabel('Weekend=0_Weekday=1')
ax[1].set_ylabel('Median_Steps')
ax[1].set_xticks([0, 1])

# Keep the two panels' labels from running into each other before cropping
fig.tight_layout()

# Save each panel to its own file only once the whole figure is drawn, so
# neither file contains an empty or unrelated panel
save_panel(fig, ax[0], 'Steps_Per_Month.png')
save_panel(fig, ax[1], 'Steps_Per_Week.png')

plt.show()

#this works as well:
# ax = sns.boxplot(x="Months_encoded", y="Steps", hue='Work_or_Weekend', data=data)
# ax = ax.set_xticklabels(ax.get_xticklabels(),rotation=90)


#### Write a loop to plot 3 scatter plots of the following features:

Minutes Lightly Active vs Steps
Minutes Very Active vs Steps
Minutes Sedentary vs Steps

In [None]:
# ALTERNATIVE 1
# Keep only the three activity columns plus Steps
active_min = data[['Minutes Lightly Active','Minutes Very Active','Minutes Sedentary','Steps']]

# Create a figure of a fixed size with one panel per activity column
fig, axs = plt.subplots(1, 3, figsize=(20, 4))

# Pair each panel with one activity column (every column except the last,
# 'Steps'). Iterating this way avoids a manual counter named `x`, which
# would overwrite the `x` array defined in Challenge 1, and avoids
# rebinding the loop variable to scatter()'s return value.
for ax, column in zip(axs, active_min.columns[:-1]):
    ax.scatter(active_min['Steps'], active_min[column])
    ax.set_title(column)
    ax.set_xlabel('Steps')
    ax.set_ylabel('Minutes')

plt.show()


In [None]:
# ALTERNATIVE 2
# Keep only the three activity columns plus Steps
active_df = data[['Minutes Lightly Active','Minutes Very Active','Minutes Sedentary','Steps']]

# Create a figure of a fixed size with one panel per activity column
fig, axs = plt.subplots(1, 3, figsize=(20, 4))

# Draw each scatter plot from active_df (the frame built in this cell), so
# the cell no longer depends on a variable left over from another cell
axs[0].scatter(active_df['Steps'], active_df.iloc[:, 0])
axs[0].set_title(active_df.columns[0])
axs[0].set_xlabel('Steps')
axs[0].set_ylabel('Minutes')

axs[1].scatter(active_df['Steps'], active_df.iloc[:, 1])
axs[1].set_title(active_df.columns[1])
axs[1].set_xlabel('Steps')
axs[1].set_ylabel('Minutes')

axs[2].scatter(active_df['Steps'], active_df.iloc[:, 2])
axs[2].set_title(active_df.columns[2])
axs[2].set_xlabel('Steps')
axs[2].set_ylabel('Minutes')

plt.show()

#Paolo: great!  you could have used a for loop, usually the word 'iterate' is a hint to use a for loop, in this case it saves you many lines of code
#check cell below for alternative

In [None]:
# Keep only the three activity columns plus Steps
active_df = data[['Minutes Lightly Active','Minutes Very Active','Minutes Sedentary','Steps']]

# Create a figure of a fixed size with one panel per activity column
fig, axs = plt.subplots(1, 3, figsize=(20, 4))

# Plot from active_df (the frame built in this cell), so the cell no longer
# depends on a variable left over from another cell
for i in range(3):
    axs[i].scatter(active_df['Steps'], active_df.iloc[:, i])
    axs[i].set_title(active_df.columns[i])
    axs[i].set_xlabel('Steps')
    axs[i].set_ylabel('Minutes')

# Show the plot
plt.show()

# Challenge 3

#### Open the titanic file. Name your dataset titanic.

Download the dataset from [here](https://drive.google.com/file/d/1jkkOcWm9aEF8gb0r2SsarF5Qdoaaw1do/view?usp=sharing) and place it in the data folder

In [None]:
# Load the titanic csv from the local data folder and preview its first rows
titanic = pd.read_csv(filepath_or_buffer='data/titanic.csv')
titanic.head(n=5)

#### Explore the titanic dataset using Pandas dtypes.

In [None]:
# Show the dtype of every column in the titanic DataFrame
titanic.dtypes

#### What are your numerical variables? What are your categorical variables?
Hint: Use Pandas select_dtypes

In [None]:
# Numerical variables
# 'number' selects every numeric dtype regardless of bit width, whereas
# 'int'/'float' resolve to platform-dependent widths and can skip columns
titanic.select_dtypes(include='number').head()

In [None]:
# Categorical variables: the columns stored with the generic object dtype
titanic.select_dtypes(include='object').head(n=5)

#### Set the plot style to classic and the figure size to (12,6).
Hint: To set the style you can use matplotlib.pyplot functions or seaborn 

In [None]:
# Switch to matplotlib's 'classic' style sheet
plt.style.use('classic')

# Default size for the figures created from here on
figsize = (12, 6)
plt.rc('figure', figsize=figsize)


#### Use the right visualization to show the distribution of the column Age.

In [None]:
# Histogram of the Age column, drawn with matplotlib on an explicit axes
fig, ax = plt.subplots()
ax.hist(titanic['Age'])

# Title and axis labels
ax.set_title("Age on Titanic")
ax.set_xlabel('Age')
ax.set_ylabel('People on Titanic')

# Render the figure
plt.show()

#### Use subplots and plot the distribution of the Age variable with bins equal to 10, 20 and 50.

In [None]:
#Paolo: here the idea was to create 3 different plots each with different
# number of bins, check cell below
# Histogram of Age using explicit bin edges: 0-10, 10-30 and 30-80
age_axes = titanic['Age'].hist(bins=[0, 10, 30, 80])

# Title and axis labels
age_axes.set_title("Age on Titanic")
age_axes.set_xlabel('Age')
age_axes.set_ylabel('People on Titanic')

# Render the figure
plt.show()

In [None]:
# Bin counts to compare; named bin_counts so the built-in bin() is not shadowed
bin_counts = [10, 20, 50]

# One panel per bin count, side by side, as the exercise asks for subplots
fig, axs = plt.subplots(1, len(bin_counts), figsize=(18, 5))

for ax, n_bins in zip(axs, bin_counts):
    titanic['Age'].hist(bins=n_bins, ax=ax)
    # Put the bin count in the title so the panels can be told apart
    ax.set_title("Age on Titanic ({} bins)".format(n_bins))
    ax.set_xlabel('Age')
    ax.set_ylabel('People on Titanic')

# Show the plot
plt.show()

#### How does the bin size affect your plot? Comment.

~~~~
Using a low number of bins (wider bins) reduces noise on the distribution estimation while using a 
high number of bins (narrower bins) gives greater precision to the distribution estimation (and more noise). 
~~~~

#### Use seaborn to show the distribution of the column Age.

In [None]:
# Seaborn is built on top of matplotlib

# Distribution of the Age column drawn with seaborn's distplot
sns.distplot(a=titanic['Age'])

# Render the figure
plt.show()


#### Use the right plot to visualize the column Gender. There are 2 ways of doing it. Do it both ways.
Hint: Use matplotlib and seaborn

In [None]:
# OPTION 1 - Matplotlib (through the pandas plotting wrapper)

# Count the passengers per gender, then draw the counts as bars with
# unrotated tick labels
gender_counts = titanic['Gender'].value_counts()
ax = gender_counts.plot(kind='bar', rot=0)

# Title and axis labels
ax.set_title("Gender on Titanic")
ax.set_xlabel('Gender')
ax.set_ylabel('Value Count')

# Render the figure
plt.show()


In [None]:
# OPTION 2 - Seaborn

# Seaborn countplot of the Gender column.
# The column is passed through the x= and data= keywords: passing the data
# positionally is deprecated in seaborn 0.11 and is no longer interpreted as
# x in later releases.
sns.countplot(x='Gender', data=titanic)

# Show the plot
plt.show()


#### Use the right plot to visualize the column Pclass.

In [None]:
# Bar plot of Pclass
# value_counts() orders by frequency; sort_index() puts the bars back in
# class order so the ordinal variable reads left to right
titanic['Pclass'].value_counts().sort_index().plot.bar()

# Add labels to the plot and change xticks rotation
plt.title("Passenger Class on Titanic")
plt.xlabel('Passenger Class')
plt.ylabel('Value Count')
plt.xticks(rotation=0)

# Show the plot
plt.show()

#### We would like to have in one single plot the summary statistics of the feature Age. What kind of plot would you use?

In [None]:
# Box plot summarising the Age column
titanic.boxplot(column=["Age"])

# Render the figure
plt.show()

#### What does the last plot tell you about the feature Age? Comment.

~~~~
The statistics that you can get from the boxplot are the minimum, first quartile, median, 
third quartile, and maximum.

The red line shows us the median of Age. 
The blue box shows us the interquartile range (from Q1 to Q3).
It also shows us the outliers, which are out of the maximum and minimum range (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
~~~~

#### Now in addition to the summary statistics, we want to have in the SAME plot the distribution of Age. What kind of plot would you use?

In [None]:
# Set figure size
figsize = (8,4)
plt.rcParams["figure.figsize"] = figsize

# Violin plot of Age.
# The column is named through x=: a positional "Age" is deprecated in
# seaborn 0.11 and collides with data= in later releases.
sns.violinplot(x="Age", data=titanic)

# Show the plot
plt.show()

#### What additional information does the last plot provide about the feature Age? Comment.

~~~~
This plot is a combination of a boxplot and a density plot. The violin plot features a kernel density estimation 
of the underlying distribution of the data.

The black central part of the plot is the same as a boxplot and the white dot is the median.
The blue part is the distribution of the data.
~~~~

#### We suspect that there is a linear relationship between Fare and Age. Use the right plot to show the relationship between these 2 features. There are 2 ways, please do it both ways.
Hint: One of the ways involves using Seaborn.

In [None]:
# OPTION 1

# Scatter plot of Fare against Age
ax = sns.scatterplot(x="Age", y="Fare", data=titanic)

# Add labels (this step had been left empty)
ax.set_title("Fare vs Age on Titanic")
ax.set_xlabel("Age")
ax.set_ylabel("Fare")

# Show the plot
plt.show()

In [None]:
# OPTION 2

# Jointplot of Fare against Age
sns.jointplot(data=titanic, x="Age", y="Fare")

# Render the figure
plt.show()

#### Using Seaborn plot the correlation matrix.

In [None]:
# Set figure size
fig, ax = plt.subplots(figsize=(10, 8))

# Correlation matrix.
# Restrict to the numeric columns first: DataFrame.corr() raises on text
# columns in pandas 2.0+ (older versions silently dropped them).
corr = titanic.select_dtypes(include='number').corr()
sns.heatmap(corr, ax=ax)

# Add title
ax.set_title("Titanic Correlation Heatmap")

# Show plot
plt.show()

#### What are the most correlated features? Comment.

~~~~~
The most correlated features are Parch and SibSp.
~~~~~

#### Use the right plot to display the summary statistics of Age as a function of Pclass.

In [None]:
# Default figure size for this and later figures
figsize = (12, 4)
plt.rc('figure', figsize=figsize)

# Box plot of Age for each Pclass value
sns.boxplot(data=titanic, x="Pclass", y='Age')

# Render the figure
plt.show()

#### Use seaborn to plot the distribution of Age based on Gender.
Hint: Use Facetgrid

In [None]:
# Grid with one column of facets per Gender value
g = sns.FacetGrid(data=titanic, col="Gender")

# Histogram of Age inside every facet
g.map(plt.hist, "Age")

# Render the figure
plt.show()
