# Lab | Matplotlib & Seaborn

#### Import all the libraries that are necessary.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Challenge 1

#### Data

In [None]:
# Sample data: the integers 0..99, their doubles, and their squares
x = np.arange(100)
y = 2 * x
z = np.square(x)

#### Plot (x,y) and (x,z) on the axes.
There are 2 ways of doing this. Do it both ways.
Hint: Check out the nrows, ncols, and index arguments of subplots.

Also, play around with the linewidth and style. Use the ones you're most happy with.

In [None]:
# OPTION 1 - both curves on one shared set of axes
# Square 5x5 inch figure with a single axes
fig, ax = plt.subplots(figsize=(5, 5))

# Linear and quadratic series, each labelled for the legend
ax.plot(x, y, label='x-y linear')
ax.plot(x, z, label='x-z quadratic')

# Title and legend, then render
ax.set_title("Linear vs Quadratic")
ax.legend()
plt.show()


#### Use plt.subplots(nrows=1, ncols=2) to create the plot below

In [None]:
# OPTION 2 - one subplot per curve, side by side

# 1x2 grid of axes on a 10x5 inch figure
fig, (plot_a, plot_b) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: y against x
plot_a.plot(x, y)
plot_a.set(title='LINEAR PLOT', xlabel='x', ylabel='y')

# Right panel: z against x
plot_b.plot(x, z)
plot_b.set(title='QUADRATIC PLOT', xlabel='x', ylabel='z')

# Render the figure
plt.show()

#### Augmenting your previous code, resize your previous plot.
Hint: Add the figsize argument in plt.subplots()

In [None]:
# ENHANCED VERSION to see more matplotlib features

# Styling shared by BOTH panels so they look consistent
fs = 16    # axis-label font size
fw = 700   # title font weight
lw = 3     # line width
y_rot = 0  # y-label rotation in degrees (0 = horizontal text)

# Create a figure of a fixed size and a 1x2 grid of axes
fig, [plot_a, plot_b] = plt.subplots(1, 2, figsize=(10, 5))

# First plot: linear series with title and axis labels
plot_a.plot(x, y, linewidth=lw)
plot_a.set_title('LINEAR PLOT', fontweight=fw)
plot_a.set_xlabel('x', fontsize=fs)
plot_a.set_ylabel('y', fontsize=fs, rotation=y_rot)

# Second plot: quadratic series, styled the same way as the first
plot_b.plot(x, z, linewidth=lw)
plot_b.set_title('QUADRATIC PLOT', fontweight=fw)
plot_b.set_xlabel('x', fontsize=fs)
plot_b.set_ylabel('z', fontsize=fs, rotation=y_rot)

# Keep the larger labels from colliding with the neighbouring panel
fig.tight_layout()

# Show the plot
plt.show()

#### Plot both y=x^2 and y=exp(x) in the same plot using normal and logarithmic scale.
Hint: Use set_xscale and set_yscale

In [None]:
# Two panels, each showing BOTH curves: linear y-axis on the left, logarithmic on the right.
# (Calling fig.add_subplot() twice without a grid position stacks the axes on top of
# each other, so an explicit 1x2 grid is used instead.)
fig, (plot_a, plot_b) = plt.subplots(1, 2, figsize=(10, 5))

for axes in (plot_a, plot_b):
    axes.plot(x, x**2)
    axes.plot(x, np.exp(x))
    axes.set_xlabel('x')
    axes.set_ylabel('y')

# Normal plot
plot_a.set_title('Normal scale')

# Logarithmic scale (only the y-axis is rescaled)
plot_b.set_yscale('log')
plot_b.set_title('Logarithmic scale')

# Show plot
fig.tight_layout()
plt.show()

In [None]:
# ALTERNATIVE WITH LEGEND

# Two panels, each showing both curves: linear y-axis on the left, logarithmic on the right.
# An explicit 1x2 grid avoids the overlapping axes produced by repeated fig.add_subplot() calls.
fig, (plot_a, plot_b) = plt.subplots(1, 2, figsize=(10, 5))

for axes in (plot_a, plot_b):
    # Labels are required: legend() has nothing to show for unlabelled lines
    axes.plot(x, x**2, label='y = x^2')
    axes.plot(x, np.exp(x), label='y = exp(x)')
    axes.set_xlabel('x')
    axes.set_ylabel('y')
    axes.legend()

# Normal plot
plot_a.set_title('Normal scale')

# Logarithmic scale (only the y-axis is rescaled)
plot_b.set_yscale('log')
plot_b.set_title('Logarithmic scale')

# Show plot
fig.tight_layout()
plt.show()

# Challenge 2

Import the Fitbit2.csv file and name your dataset fitbit. Download the data from [here](https://drive.google.com/file/d/17TW-w-izKuR7bLoa0Mnyp9YN9zEQNXq-/view?usp=sharing) and place it in the data folder.

In [None]:
# Load the Fitbit CSV from the local data folder into a DataFrame
fitbit = pd.read_csv(filepath_or_buffer='data/Fitbit2.csv')

#### From the Fitbit data, we want to visually understand:

How the average number of steps changes by month. Use the appropriate visualization to show the median steps by month.
Is the Fitbitter more active on weekends or on workdays?
All plots must be in the same jupyter notebook cell.

Hints:

- Use the Months_encoded and Work_or_Weekend columns.
- Use the matplotlib.pyplot object-oriented API.
- Set your size figure to 12,4
- Explore plt.sca
- Explore plt.xticks
- Save your figures

In [None]:
# MEDIAN STEPS BY MONTH_ENCODED
# Summary statistics of Steps per month; the '50%' column is the median
describe_steps = fitbit.groupby('Months_encoded')['Steps'].describe()[['mean', '50%']]

# Create a figure of a fixed size and a 1x2 grid of axes
fig, [plot_a, plot_b] = plt.subplots(1, 2, figsize=(12, 4))

# Make the left axes current so plt.* calls (plt.xticks) act on it
plt.sca(plot_a)
# Plot the median (not the mean) steps per month
plot_a.plot(describe_steps.index, describe_steps['50%'], marker='o')
plot_a.set_title('Median Steps X Month')
plot_a.set_xlabel('Months')
plot_a.set_ylabel('Steps')
# One tick per month present in the data. The grouped index is used rather than the
# raw column, which repeats every month once per row. The return value is not
# assigned, so plot_a keeps pointing at the Axes.
plt.xticks(describe_steps.index)

# MEDIAN STEPS BY WORK_OR_WEEKEND
# Make the right axes current
plt.sca(plot_b)

# Summary statistics of Steps for workdays and weekends
describe_wow = fitbit.groupby('Work_or_Weekend')['Steps'].describe()[['mean', '50%']]

# Two categories are compared, so bars are clearer than a connecting line.
# Bars are placed at 0..n-1 and labelled with the group keys, which works
# whether the keys are numbers or strings.
positions = range(len(describe_wow))
plot_b.bar(positions, describe_wow['50%'])
plot_b.set_title('Median Steps X Workday/Weekend')
plot_b.set_xlabel('Workday/Weekend')
plot_b.set_ylabel('Steps')
plt.xticks(positions, describe_wow.index)

# Save the figure (both panels) to disk
fig.tight_layout()
fig.savefig('temp.png')

#### Write a loop to plot 3 scatter plots of the following features:

Minutes Lightly Active vs Steps
Minutes Very Active vs Steps
Minutes Sedentary vs Steps

In [None]:
# ALTERNATIVE 1 - all three activity features drawn on one shared set of axes
cols = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']

# Keep only the activity columns plus the Steps column they are compared against
fit_min = fitbit[cols + ['Steps']]

# Square 5x5 inch figure; the plt.plot calls below draw on its axes
plt.subplots(figsize=(5, 5))

# One marker-only series per feature (no connecting line), labelled for the legend
for col in cols:
    plt.plot(fit_min[col], fit_min['Steps'], marker='o', linestyle='None', label=f'Steps-{col}')
plt.legend()

In [None]:
# ALTERNATIVE 2 - one scatter subplot per activity feature
# Create a df with the columns we are interested in
activity_cols = ['Minutes Lightly Active', 'Minutes Very Active', 'Minutes Sedentary']
fit_min = fitbit[activity_cols + ['Steps']]

# Create a figure of a fixed size with one axes per feature; Steps is shared on the y-axis
fig, axes = plt.subplots(1, len(activity_cols), figsize=(15, 5), sharey=True)

# Iterate to draw each scatter plot on its own axes
for ax, col in zip(axes, activity_cols):
    ax.scatter(fit_min[col], fit_min['Steps'])
    ax.set_title(f'Steps vs {col}')
    ax.set_xlabel(col)
axes[0].set_ylabel('Steps')

fig.tight_layout()
plt.show()


# Challenge 3

#### Open the titanic file. Name your dataset titanic.

Download the dataset from [here](https://drive.google.com/file/d/1jkkOcWm9aEF8gb0r2SsarF5Qdoaaw1do/view?usp=sharing) and place it in the data folder

In [None]:
# Load the titanic CSV from the local data folder into a DataFrame
titanic = pd.read_csv(filepath_or_buffer='data/titanic.csv')

#### Explore the titanic dataset using Pandas dtypes.

In [None]:
# Show the dtype of every column in the titanic frame (displayed as the cell's output)
titanic.dtypes

#### What are your numerical variables? What are your categorical variables?
Hint: Use Pandas select_dtypes

In [None]:
# Numerical variables: keep only the columns whose dtype is numeric
titanic.select_dtypes(include=np.number)

In [None]:
# Categorical variables: everything that is NOT a numeric dtype
titanic.select_dtypes(exclude=[np.number])

#### Set the plot style to classic and the figure size to (12,6).
Hint: To set the style you can use matplotlib.pyplot functions or seaborn 

In [None]:
# Plot style change
plt.style.use('classic')

# Figure size change: set the default size used by every figure created afterwards.
# plt.subplots(figsize=...) would only create one empty figure, not change the default.
# This is set AFTER style.use() because the style sheet carries its own figure.figsize.
plt.rcParams['figure.figsize'] = (12, 6)

#### Use the right visualization to show the distribution of the column Age.

In [None]:
# Histogram of the Age column, drawn on the current axes
age_ax = plt.gca()
age_ax.hist(titanic['Age'])

# Title and axis labels
age_ax.set_title('Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')

# Render the figure
plt.show()

#### Use subplots and plot the distribution of the Age variable with bins equal to 10, 20 and 50.

In [None]:
# Create a figure of a fixed size with one axes per bin setting
fig, [plot_a, plot_b, plot_c] = plt.subplots(1, 3, figsize=(16, 4))

# Plot the Age histogram with a different number of bins on each axes.
# Each panel is titled with its bin count so the three can be told apart.
for axes, n_bins in zip((plot_a, plot_b, plot_c), (10, 20, 50)):
    axes.hist(titanic['Age'], bins=n_bins)
    axes.set_title(f'Age distribution ({n_bins} bins)')
    axes.set_xlabel('Age')
plot_a.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

#### How does the bin size affect your plot? Comment.

~~~~
Using a low number of bins (wider bins) reduces noise on the distribution estimation while using a 
high number of bins (narrower bins) gives greater precision to the distribution estimation (and more noise). 
~~~~

#### Use seaborn to show the distribution of the column Age.

In [None]:
# Seaborn is a Python library based on matplotlib

# Plot of the age histogram using seaborn.
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is its replacement.
sns.histplot(titanic['Age'], kde=True, stat='density')
plt.title('Age distribution')

# Show the plot
plt.show()


#### Use the right plot to visualize the column Gender. There are 2 ways of doing it. Do it both ways.
Hint: Use matplotlib and seaborn

In [None]:
# OPTION 1 - Matplotlib

# Number of passengers per Gender value (this cell is about Gender, not Age)
gender_counts = titanic['Gender'].value_counts()

# Bar plot of the gender
plt.subplots(figsize=(5, 5))
plt.bar(gender_counts.index.astype(str), gender_counts.values)

# Add labels to the plot and change xticks rotation
plt.xticks(rotation=90)
plt.title('Passengers by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')

# Show the plot
plt.show()

In [None]:
# OPTION 2 - Seaborn

# Seaborn Countplot of the Gender column (keyword arguments: newer seaborn
# versions no longer accept the variable positionally)
sns.countplot(x='Gender', data=titanic)
plt.title('Passengers by Gender')

# Show the plot
plt.show()


#### Use the right plot to visualize the column Pclass.

In [None]:
# Bar plot of Pclass
# Pclass is a categorical column, so count each class and draw one bar per class.
# A default 10-bin histogram would spread the classes over misaligned bins.
pclass_counts = titanic['Pclass'].value_counts().sort_index()

plt.subplots(figsize=(5, 5))
plt.bar(pclass_counts.index.astype(str), pclass_counts.values)

# Add labels to the plot and change xticks rotation
plt.xticks(rotation=90)
plt.title('Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')

# Show the plot
plt.show()

#### We would like to have in one single plot the summary statistics of the feature Age. What kind of plot would you use?

In [None]:
# Box plot

# Numeric summary of Age. It is printed because a bare expression in the
# middle of a cell is evaluated and then silently discarded.
print(titanic['Age'].describe())

# The same summary statistics as a single plot
sns.boxplot(y='Age', data=titanic)
plt.title('Age summary statistics')

# Show the plot
plt.show()

#### What does the last plot tell you about the feature Age? Comment.

~~~~
The statistics that you can get from the boxplot are the minimum, first quartile, median, 
third quartile, and maximum.

The red line shows us the median of Age. 
The blue box shows us the interquartile range (from Q1 to Q3).
It also shows us the outliers, which are out of the maximum and minimum range (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
~~~~

#### Now in addition to the summary statistics, we want to have in the SAME plot the distribution of Age. What kind of plot would you use?

In [None]:
# Set figure size
plt.subplots(figsize=(5, 5))

# Violin plot. The variable is passed as x= because newer seaborn versions bind the
# first positional argument to `data`, which clashes with data=titanic and raises TypeError.
sns.violinplot(x="Age", data=titanic)
plt.title('Age distribution and summary statistics')

# Show the plot
plt.show()


#### What additional information does the last plot provide about the feature Age? Comment.

~~~~
This plot is a combination of a boxplot and a density plot. The violin plot features a kernel density estimation 
of the underlying distribution of the data.

The black central part of the plot is the same as a boxplot and the white dot is the median.
The blue part is the distribution of the data.
~~~~

#### We suspect that there is a linear relationship between Fare and Age. Use the right plot to show the relationship between these 2 features. There are 2 ways, please do it both ways.
Hint: One of the ways involves using Seaborn.

In [None]:
# OPTION 1 - matplotlib scatter of Fare against Age, drawn on the current axes
scatter_ax = plt.gca()
scatter_ax.scatter(titanic['Age'], titanic['Fare'])
scatter_ax.set_title('Scatter plot Fare/Age')

# Axis labels
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Fare')

# Render the figure
plt.show()

In [None]:
# OPTION 2 - seaborn jointplot of Age (x) against Fare (y)
sns.jointplot(data=titanic, x="Age", y="Fare")

#### Using Seaborn plot the correlation matrix.

In [None]:
# Set figure size
plt.figure(figsize=(5, 5))

# Correlation matrix of the numeric columns only. Since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
corr_matrix = titanic.select_dtypes(include=np.number).corr()
sns.heatmap(corr_matrix)

# Add title
plt.title('Correlation matrix')

# Show plot
plt.show()


#### What are the most correlated features? Comment.

~~~~~
The most correlated features are Parch and SibSp.
~~~~~

#### Use the right plot to display the summary statistics of Age as a function of Pclass.

In [None]:
# One Age box per passenger class
sns.boxplot(data=titanic, x="Pclass", y="Age")

# Render the figure
plt.show()

#### Use seaborn to plot the distribution of Age based on Gender.
Hint: Use Facetgrid

In [None]:
# Create a grid with FacetGrid: one column of the grid per Gender value
g = sns.FacetGrid(titanic, col='Gender')

# Draw the Age distribution (histogram) on each facet
g.map(plt.hist, 'Age');

# Show the plot
plt.show()

In [None]:
# NOTE(review): `months` is not assigned in any cell visible in this notebook, so on a
# fresh kernel this line would raise NameError. Presumably leftover scratch; define it or remove the cell.
months