# Subplots

In [1]:
%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

In [2]:
# IPython help query: shows the docstring of plt.subplot
plt.subplot?

In [3]:
# 2x2 grid: a linear, a quadratic and a cubic series each get their own panel,
# and the fourth panel overlays all three.
plt.figure(figsize=(8, 8))

# panel 1: the integers 1..9
plt.subplot(2, 2, 1)
x = np.arange(1, 10, 1)
plt.plot(x, '-o', c='blue', label='blue')
plt.legend()

# panel 2: squares of 0..9
plt.subplot(2, 2, 2)
y = np.array([i**2 for i in range(10)])
plt.plot(y, '-o', c='orange')

# panel 3: cubes of 0..9
plt.subplot(2, 2, 3)
z = np.array([i**3 for i in range(10)])
plt.plot(z, '-o', c='red')

# panel 4: overlay of all three series; each line carries a label so that
# the legend lists every series rather than only the blue one
plt.subplot(2, 2, 4)
plt.plot(x, '-o', c='blue', label='blue')
plt.plot(y, '-o', c='orange', label='orange')
plt.plot(z, '-o', c='red', label='red')

plt.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x2715d807e10>

In [None]:
# New figure laid out as a 2x2 grid; three panels are drawn and they all use one y scale.
plt.figure()

# panel 1 owns the y axis that the other panels attach to
ax1 = plt.subplot(221)
plt.plot(x, '-o')

# panel 2, y axis tied to panel 1
ax2 = plt.subplot(222, sharey=ax1)
plt.plot(y, '-o')

# panel 3, y axis tied to panel 1
plt.subplot(223, sharey=ax1)
plt.plot(z, '-o')

In [None]:
plt.figure()
# subplot with 1 row, 2 columns, and current axis is 1st subplot axes
plt.subplot(1, 2, 1)

# the integers 1 through 8
linear_data = np.array([1,2,3,4,5,6,7,8])

# line with circle markers, drawn on the subplot selected above
plt.plot(linear_data, '-o')

In [None]:
# element-wise square of linear_data (quadratic growth, despite the variable name)
exponential_data = linear_data**2 

# subplot with 1 row, 2 columns, and current axis is 2nd subplot axes
plt.subplot(1, 2, 2)
plt.plot(exponential_data, '-o')

In [None]:
# select the 1st subplot of the 1x2 layout again and add the squared series to it,
# this time with x markers
plt.subplot(1, 2, 1)
plt.plot(exponential_data, '-x')

In [None]:
# Two panels side by side on a common y scale.
plt.figure()

ax1 = plt.subplot(121)
plt.plot(linear_data, '-o')

# tie the right panel's y axis to the left panel's so both use the same scale
ax2 = plt.subplot(122, sharey=ax1)
plt.plot(exponential_data, '-x')

In [None]:
plt.figure()
# plt.subplot(121) is the three-digit shorthand for plt.subplot(1, 2, 1);
# the comparison checks whether both calls hand back the same Axes object
plt.subplot(1,2,1) == plt.subplot(121)

In [None]:
# Two series of squares: ascending (0, 1, ..., 9) and descending (20, 18, ..., 2).
x = np.array([v * v for v in range(10)])
y = np.array([v * v for v in range(20, 1, -2)])

# 3x3 grid in which every panel shares both its x and its y axis
fig, ((ax1, ax2, ax3),
      (ax4, ax5, ax6),
      (ax7, ax8, ax9)) = plt.subplots(nrows=3, ncols=3, sharex=True, sharey=True)

# top-left panel: line with x markers
ax1.plot(x, '-x')
# middle-left panel: the '_' format draws horizontal-tick markers only
ax4.plot(y, '_')

In [None]:
# create a 3x3 grid of subplots
fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True)
# plot the linear_data on the 5th subplot axes 
ax5.plot(linear_data, '-')

In [None]:
# set inside tick labels to visible
for ax in plt.gcf().get_axes():
    for label in ax.get_xticklabels() + ax.get_yticklabels():
        label.set_visible(True)

In [None]:
# necessary on some systems to update the plot
plt.gcf().canvas.draw()

# Histograms

In [None]:
'''
A histogram is a bar chart which shows the frequency of a given phenomenon.

A great example are probability distributions.
For instance, in the first course in this specialization,
we touched on the difference between the random, uniform, normal, and
chi squared distributions.
Probability function can be visualized as a curve, where the y-axis holds
the probability a given value would occur, and the x-axis is the value itself.
This is called a probability density function.
The y-axis values are limited to between zero and one, where zero means there's no
chance of a given value occurring and one means that the value will always occur

The x-axis values are labeled in terms of the distribution function.
In the case of the normal distribution,
this is usually in terms of standard deviations. 

So a histogram is just a bar chart where the x-axis is a given observation and
the y-axis is the frequency with which that observation occurs.
So we should be able to plot a given probability distribution
by sampling from it. 
sampling means that we just pick a number out of
the distribution, like rolling a die or pulling a single card out of a deck.
As we do this over and over again,
we get a more accurate description of the distribution. 


'''

In [2]:
# create 2x2 grid of axis subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)
axs = [ax1,ax2,ax3,ax4]

# draw n = 10, 100, 1000, and 10000 samples from the normal distribution and plot corresponding histograms
for n in range(0,len(axs)):
    sample_size = 10**(n+1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size) # look in the next cell for normal function of numpy
    # loc = mean for the gausian distribution
    # scale = standard deviation of distribution
    # size = output shape 
    axs[n].hist(sample)
    axs[n].set_title(f'n={sample_size}')

<IPython.core.display.Javascript object>

In [None]:
'''
normal function of random just creates a list of numbers
based on the underlying normal distribution.


'''

In [None]:
# One full-figure panel: 10,000 draws from a normal with mean 0 and standard deviation 2.
plt.figure()
ax1 = plt.subplot(1, 1, 1)
sample = np.random.normal(0.0, 2.0, 10000)
ax1.hist(sample, bins=100)

# the raw draws, their mean and their maximum, one per line
print(sample, np.average(sample), np.max(sample), sep='\n')

In [None]:
# same four sample sizes as before, but every histogram uses 100 bins
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, sharex=True)
axs = [ax1, ax2, ax3, ax4]

for n in range(len(axs)):
    sample_size = 10 ** (n + 1)
    # positional arguments are (mean, standard deviation, number of draws)
    sample = np.random.normal(0.0, 1.0, sample_size)
    axs[n].hist(sample, bins=100)
    axs[n].set_title('n={}'.format(sample_size))

In [None]:
plt.figure()
# Y: 10,000 standard-normal draws; X: 10,000 uniform draws from [0, 1)
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
# small markers (s=1.5) keep the 10,000 points from merging into a blob
plt.scatter(X,Y,s=1.5)

In [None]:
# GridSpec lets a single subplot span several cells of the grid
import matplotlib.gridspec as gridspec

plt.figure()
gspec = gridspec.GridSpec(nrows=3, ncols=3)

# top strip: row 0, columns 1 onward
top_histogram = plt.subplot(gspec[0, 1:])
# left strip: rows 1 onward, column 0
side_histogram = plt.subplot(gspec[1:, 0])
# main panel: rows 1 onward, columns 1 onward
lower_right = plt.subplot(gspec[1:, 1:])

In [None]:
# fresh draws: Y from a standard normal, X uniform on [0, 1)
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
# scatter of the pairs in the large panel, histogram of each variable in the strips
lower_right.scatter(X, Y,s=1.5)
top_histogram.hist(X, bins=100)
# the return value is bound to `s` so the cell does not echo it
s = side_histogram.hist(Y, bins=100, orientation='horizontal')

In [None]:
# wipe all three panels of the gridspec figure
top_histogram.clear()
side_histogram.clear()
lower_right.clear()

In [None]:
# clear the two histogram strips and redraw them with density=True
top_histogram.clear()
top_histogram.hist(X, bins=100, density=True)
side_histogram.clear()
side_histogram.hist(Y, bins=100, orientation='horizontal', density=True)
# flip the side histogram's x axis
side_histogram.invert_xaxis()

In [None]:
# give the panels that sit above one another the same x range ...
for ax in (top_histogram, lower_right):
    ax.set_xlim(0, 1)
# ... and the panels that sit side by side the same y range
for ax in (side_histogram, lower_right):
    ax.set_ylim(-5, 5)

In [None]:
%%HTML
<!-- image is loaded from an external site, so it renders only with network access -->
<img src='http://educationxpress.mit.edu/sites/default/files/journal/WP1-Fig13.jpg' />

# Box and Whisker Plots

In [None]:
'''
A box plot is a method of showing
aggregate statistics of various samples in a concise manner.

The box plot simultaneously shows, for each sample, the median of each value,
the minimum and maximum of the samples, and the interquartile range
'''

In [None]:
import pandas as pd

# 10,000 draws each from a standard normal, a uniform on [0, 1) and a gamma with shape 2
normal_sample = np.random.normal(0.0, 1.0, 10000)
random_sample = np.random.random(10000)
gamma_sample = np.random.gamma(2, size=10000)

# one column per distribution
df = pd.DataFrame(dict(normal=normal_sample,
                       random=random_sample,
                       gamma=gamma_sample))
df.head()

In [None]:
# summary statistics for each column of df
df.describe()

In [None]:
plt.figure()
# Box plot of the normal sample with whiskers reaching the minimum and maximum.
# The percentile pair [0, 100] replaces the string 'range', which newer
# matplotlib releases no longer accept; both forms give the same whiskers.
# Assigning the result to `_` keeps the returned dict of artists out of the cell output.
_ = plt.boxplot(df['normal'], whis=[0, 100])

In [None]:
# clear the current figure
plt.clf()
# One box per column of df, whiskers reaching the minimum and maximum.
# whis=[0, 100] (percentiles) is used instead of 'range', which newer
# matplotlib releases no longer accept; both forms give the same whiskers.
_ = plt.boxplot([df['normal'], df['random'], df['gamma']], whis=[0, 100])

In [None]:
plt.figure()
# histogram of the gamma column in 100 bins; `_` swallows the return value
_ = plt.hist(df['gamma'], bins=100)

In [None]:
import mpl_toolkits.axes_grid1.inset_locator as mpl_il

plt.figure()
# Box plots of all three samples with whiskers at the minimum and maximum.
# whis=[0, 100] (percentiles) is used instead of 'range', which newer
# matplotlib releases no longer accept; both forms give the same whiskers.
plt.boxplot([df['normal'], df['random'], df['gamma']], whis=[0, 100])
# overlay a second, smaller axes on the box plot (loc=2 is the upper-left corner)
ax2 = mpl_il.inset_axes(plt.gca(), width='55%', height='40%', loc=2)
# the inset shows the histogram of the gamma sample
ax2.hist(df['gamma'], bins=100)
# add 10% padding on the x axis of the inset
ax2.margins(x=0.1)

In [None]:
# move the inset's y axis ticks to its right-hand side
ax2.yaxis.tick_right()

In [None]:
# if `whis` argument isn't passed, boxplot defaults to showing 1.5*interquartile (IQR) whiskers with outliers
plt.figure()
# one box per column of df; `_` swallows the returned dict of artists
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ] )

In [None]:
'''
This is one method of detecting outliers.
And the points which are plotted beyond the whiskers are called fliers

You can also plot the confidence interval in a couple of different ways on the data.
The most common is to add notches to the box plot representing the 95% confidence
interval of the data and there are lots of other ways to customize the box plot.
The box plot is one of the more common plots that you might use as
a data scientist. 
'''

# Heatmaps

In [None]:
plt.figure()

# Y: 10,000 standard-normal draws; X: 10,000 uniform draws from [0, 1)
Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
# two-dimensional histogram with 25 bins along each axis
_ = plt.hist2d(X, Y, bins=25)
# peek at the first five values of each variable
print(X[:5])
print(Y[:5])

In [None]:
plt.figure()
# same X and Y as the previous heatmap, now with 100 bins along each axis
_ = plt.hist2d(X, Y, bins=100)

In [None]:
plt.figure()
# Z: 10,000 draws from a gamma distribution with shape 15
Z = np.random.gamma(15, size=10000)
# heatmap of X against Z with 50 bins along each axis
_ = plt.hist2d(X,Z,bins = 50)

In [None]:
plt.figure()
# heatmap of two independent sets of 10,000 uniform draws, 100 bins along each axis
_ = plt.hist2d(np.random.random(size=10000),np.random.random(size=10000),bins=100)


In [None]:
# add a colorbar legend to the current figure
plt.colorbar()

# Animations

In [None]:
'''
Animation and interactivity heavily depend on support from this backend layer.
And using a backend like the image png1 doesn't provide this.
However, the nbAgg backend, which the `%matplotlib notebook` magic function selects, does provide for
some interactivity, so we can leverage that here. 

FuncAnimation it builds an animation by iteratively calling a function which you define.
Essentially, your function will either clear the axis object and
redraw the next frame, which you want users to see or
will return a list of objects which need to be redrawn. 
'''

In [None]:
import matplotlib.animation as animation

# number of samples to draw
n = 100
# n draws from the standard normal distribution
x = np.random.randn(n)
print(x)
# plot the raw draws in order, as a line
plt.figure()
plt.plot(x)

In [None]:
'''
create a function which will do the plotting.
We'll call this function update.
Now the matplotlib FuncAnimation object is going to call this every few milliseconds
and pass in the frame number we are on starting with frame zero.
So we can use this as the index into our array values, which we called x.

Now we also need to consider the bins.
Previously we just passed a single number in for the bins eg 10 or 100.
But we can also pass in the spacing in between bins.
Since we want all of our bins set and evenly spaced, because we're redrawing
the animation in each clock tick, we can use the NumPy arange function.
This will ensure that the bins don't change.
We use the balance of minus 4 to plus 4, in half-step increments. 
'''

In [None]:
# create the function that will do the plotting, where curr is the current frame
def update(curr):
    """Draw animation frame `curr`: a histogram of the first `curr` values of `x`.

    The frame number is passed in by FuncAnimation. Once it equals `n`, the
    timer of the animation bound to the global `a` is stopped. The frame is
    drawn on whichever axes pyplot currently treats as active.
    """
    if curr == n:
        # last frame reached: stop the animation stored in `a`
        a.event_source.stop()

    # wipe the active axes, then rebuild the frame from scratch
    plt.cla()
    axes = plt.gca()
    # bin edges are the same on every frame
    axes.hist(x[:curr], bins=np.arange(-4, 8, 0.25))
    axes.axis([-4, 4, 0, 30])
    axes.set_title('Sampling the Normal Distribution')
    axes.set_ylabel('Frequency')
    axes.set_xlabel('Value')
    axes.annotate(f'n = {curr}', [3, 27])

In [None]:
fig = plt.figure()
# call `update` repeatedly on this figure; no interval is passed, so the
# library default applies (interval=100 was tried and disabled)
a = animation.FuncAnimation(fig, update)

In [None]:
# Animate four distributions at once (normal, gamma, uniform, exponential), one per
# subplot. A single FuncAnimation redraws every panel on each tick, and each panel is
# addressed through its own Axes object, so the frames no longer all land on
# whichever subplot happens to be current.
fig, ((x1, x2), (x3, x4)) = plt.subplots(2, 2)

# (axes, title, samples) for each panel; `n` samples are drawn per distribution
panels = [
    (x1, 'Normal', np.random.normal(loc=0.0, scale=1.0, size=n)),
    (x2, 'Gamma', np.random.gamma(2, size=n)),
    (x3, 'Uniform', np.random.random(size=n)),
    (x4, 'Exponential', np.random.exponential(size=n)),
]
# fixed bin edges per panel so the bars do not shift between frames
panel_bins = [np.linspace(samples.min(), samples.max(), 21) for _, _, samples in panels]


def update_all(curr):
    """Redraw all four panels using the first `curr` samples of each distribution."""
    if curr >= n:
        # every sample has been shown: stop the timer driving this animation
        a.event_source.stop()
    for (ax, title, samples), bins in zip(panels, panel_bins):
        ax.cla()
        ax.hist(samples[:curr], bins=bins)
        ax.set_title(f'{title} (n = {curr})')


# keep a reference to the animation; without one it can be garbage-collected and stop
a = animation.FuncAnimation(fig, update_all, interval=100)

# Interactivity

In [None]:
'''
You can think of an event as a piece of data which is associated with
a function call.
And when the event happens, the software environment, in our case this is
Matplotlib's backend, will call the function with the relevant data


'''

In [6]:
plt.figure()
# ten uniform random values to plot
data = np.random.rand(10)
plt.plot(data)

def onclick(event):
    """Redraw `data` and show the click position in the title of the clicked figure.

    `event` is the matplotlib mouse event. `event.x` / `event.y` are the pixel
    coordinates; `event.xdata` / `event.ydata` are the data coordinates and are
    None when the click lands outside the axes.
    """
    # use the axes of the figure that received the click, not pyplot's "current"
    # axes, which may belong to a figure created by a later cell
    ax = event.canvas.figure.gca()
    ax.cla()
    ax.plot(data)
    ax.set_title('Event at pixels {},{} \nand data {},{}'.format(event.x, event.y, event.xdata, event.ydata))
    # ask the canvas to repaint so the new title shows up
    event.canvas.draw_idle()

# tell mpl_connect we want to pass a 'button_press_event' into onclick when the event is detected
plt.gcf().canvas.mpl_connect('button_press_event', onclick)

<IPython.core.display.Javascript object>

7

In [8]:
import pandas as pd
from random import shuffle

# ten country names, put into random order in place
origins = ['China', 'Brazil', 'India', 'USA', 'Canada', 'UK', 'Germany', 'Iraq', 'Chile', 'Mexico']
shuffle(origins)

# one row per country with a random height and weight in [0, 1)
df = pd.DataFrame(dict(height=np.random.rand(10),
                       weight=np.random.rand(10),
                       origin=origins))
df

Unnamed: 0,height,weight,origin
0,0.962992,0.930884,Germany
1,0.923035,0.408338,China
2,0.745577,0.308678,UK
3,0.693145,0.813935,Mexico
4,0.085494,0.248052,Canada
5,0.675958,0.637207,Brazil
6,0.892634,0.524718,Iraq
7,0.434827,0.241292,Chile
8,0.131722,0.931121,India
9,0.43862,0.15631,USA


In [9]:
plt.figure()
# picker=5 gives the points a pick tolerance of 5, so a click only has to land near a point
plt.scatter(df['height'], df['weight'], picker=5)
# label both axes of the scatter plot
plt.ylabel('Weight')
plt.xlabel('Height')

<IPython.core.display.Javascript object>

Text(0.5,0,'Height')

In [10]:
def onpick(event):
    """Show the origin of the picked scatter point in the title of its axes.

    `event.ind` holds the indices of the points within pick range; the first
    one is looked up by position in `df`.
    """
    origin = df['origin'].iloc[event.ind[0]]
    # title the axes that own the picked artist, not pyplot's "current" axes,
    # which may belong to a figure created by a later cell
    ax = event.artist.axes
    ax.set_title('Selected item came from {}'.format(origin))
    # ask the canvas to repaint so the new title shows up
    event.canvas.draw_idle()

# tell mpl_connect we want to pass a 'pick_event' into onpick when the event is detected
plt.gcf().canvas.mpl_connect('pick_event', onpick)

7