# Data Visualization in Python


<br>


<br>
<img src="figure/artist-demo.png" alt="Figure 1" style="width: 800px;"/>

<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 1, Visualization Demo </h4>

<br>

## 1. Principles of Information Visualization

### 1.1 Two authors

* Alberto Cairo
* Edward Tufte

### 1.2 Theories


<br>
<img src="figure/visualize_theory1.png" alt="Figure 2" style="width: 800px;"/>


<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 2, theory example 1</h4>

<br>

<br>
<img src="figure/visualize_theory2.png" alt="Figure 3" style="width: 800px;"/>

<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 3, theory example 2</h4>

<br>

## 2. Matplotlib

### 2.1 Matplotlib Architecture

<br>
<img src="figure/visualize_matplotlib.png" alt="Figure 4" style="width: 800px;"/>

<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 4, Matplotlib Architecture </h4>

<br>


<br>
<img src="figure/artists_tree.png" alt="Figure 5" style="width: 800px;"/>

<h4 style="text-align: center;" markdown="1">  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Figure 5, Artists Tree </h4>

<br>

## 3. Visualization Examples 

* Scatter plots
* Line Plots
* Bar Charts
* Subplots
* Histograms
* Box Plots
* Heatmaps
* Animation
* Interactivity 
* Plotting with Pandas 
* Seaborn


### 3.1 Scatter Plots

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5]
y = [6, 7, 8, 9, 10]

plt.figure()
# The first two points make up the red 'Tall students' series.
tall_x, tall_y = x[:2], y[:2]
plt.scatter(tall_x, tall_y, s=100, c='red', label='Tall students')
# The remaining three points make up the blue 'Short students' series.
short_x, short_y = x[2:], y[2:]
plt.scatter(short_x, short_y, s=100, c='blue', label='Short students')

In [None]:
# label the x axis of the current axes
plt.xlabel('The number of times the child kicked a ball')
# label the y axis of the current axes
plt.ylabel('The grade of the student')
# give the plot a title
plt.title('Relationship between ball kicking and grades')
# draw a legend; its entries come from the label= arguments passed when the series were plotted
plt.legend()

### 3.2 Line Plots

In [None]:
import numpy as np

# The integers 1..8 serve as the straight-line series.
linear_data = np.array(range(1, 9))
# NOTE: despite its name, this series is the square of linear_data
# (quadratic growth, not exponential).
exponential_data = linear_data * linear_data

plt.figure()
# Draw both series with one call; each uses a solid line with circle markers.
# No x values are passed, only the y data.
plt.plot(linear_data, '-o', exponential_data, '-o')



In [None]:
# add a third series to the current axes, drawn as a dashed ('--') red ('r') line
plt.plot([22,44,55], '--r')

In [None]:
# axis labels and title for the current axes
plt.xlabel('Some data')
plt.ylabel('Some other data')
plt.title('A title')
# the data series were plotted without label=, so the legend entries are passed explicitly here
plt.legend(['Baseline', 'Competition', 'Us'])

### 3.3 Bar Charts

In [None]:
plt.figure()
# one x position per element of linear_data: 0, 1, ..., len(linear_data) - 1
xvals = range(len(linear_data))
# draw linear_data as bars that are 0.3 x-units wide
plt.bar(xvals, linear_data, width = 0.3)

In [None]:
# Shift every x position right by one bar width (0.3) so that the second set
# of bars is drawn beside the first set rather than on top of it.
new_xvals = [item + 0.3 for item in xvals]

plt.bar(new_xvals, exponential_data, width=0.3, color='red')

In [None]:
from random import randint
# one random integer error value between 0 and 15 (inclusive) per data point
linear_err = [randint(0, 15) for _ in linear_data]

# Draw the linear_data bars again, this time with error bars whose sizes
# come from the random error values.
plt.bar(xvals, linear_data, yerr=linear_err, width=0.3)

### 3.4 Subplots

In [None]:
plt.figure()
# left cell of a layout with 1 row and 2 columns
ax1 = plt.subplot(121)
plt.plot(linear_data, '-o')
# right cell; sharey=ax1 makes it share the y axis of the first subplot
ax2 = plt.subplot(122, sharey=ax1)
plt.plot(exponential_data, '-x')

In [None]:
# build a 3x3 grid of subplots whose x and y axes are all shared
fig, grid = plt.subplots(nrows=3, ncols=3, sharex=True, sharey=True)
(ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9) = grid
# draw linear_data on the 5th (centre) subplot axes
ax5.plot(linear_data, '-')

### 3.5 Histograms

In [None]:
# 2x2 grid of subplot axes that share the x axis
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)
axs = [ax1, ax2, ax3, ax4]

# draw 10, 100, 1000 and 10000 samples from the normal distribution and
# show each sample as a histogram on its own axes
for n, ax in enumerate(axs):
    sample_size = 10 ** (n + 1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size)
    ax.hist(sample)
    ax.set_title('n={}'.format(sample_size))

In [None]:
# use gridspec to partition the figure into subplots
import matplotlib.gridspec as gridspec

plt.figure()
gspec = gridspec.GridSpec(nrows=3, ncols=3)

# The 3x3 grid is carved into three regions:
#   top row, columns 1-2     -> histogram of X
#   rows 1-2, first column   -> histogram of Y
#   rows 1-2, columns 1-2    -> scatter plot of X against Y
top_histogram = plt.subplot(gspec[0, 1:])
side_histogram = plt.subplot(gspec[1:, 0])
lower_right = plt.subplot(gspec[1:, 1:])

num_samples = 10000
Y = np.random.normal(loc=0.0, scale=1.0, size=num_samples)
X = np.random.random(size=num_samples)

lower_right.scatter(X, Y)
top_histogram.hist(X, bins=100)
# horizontal orientation so the bars run along the side of the scatter plot;
# the result is assigned to s
s = side_histogram.hist(Y, bins=100, orientation='horizontal')


### 3.6 Box Plots

In [None]:
import pandas as pd
# three samples of equal size, each drawn from a different distribution
sample_count = 10000
normal_sample = np.random.normal(loc=0.0, scale=1.0, size=sample_count)
random_sample = np.random.random(size=sample_count)
gamma_sample = np.random.gamma(2, size=sample_count)

# one DataFrame column per sample
samples_by_name = {
    'normal': normal_sample,
    'random': random_sample,
    'gamma': gamma_sample,
}
df = pd.DataFrame(samples_by_name)

# summary statistics for every column
df.describe()

In [None]:
plt.figure()
# Box plots for all three of df's columns. whis=(0, 100) puts the whiskers at
# the 0th and 100th percentiles, i.e. at the minimum and maximum of the data.
# (The older spelling whis='range' was deprecated in Matplotlib 3.2 and has
# since been removed.)
# The result is assigned to a variable to suppress the notebook's output.
_ = plt.boxplot([df['normal'], df['random'], df['gamma']], whis=(0, 100))

In [None]:
plt.figure()
# box plots for all three of df's columns, this time without a whis argument
columns = ['normal', 'random', 'gamma']
_ = plt.boxplot([df[name] for name in columns])

### 3.7 Heatmaps

In [None]:
plt.figure()


# one million points per sample
num_points=int(1e6)
Y = np.random.normal(loc=0.0, scale=1.0, size=num_points)
# NOTE: X is drawn here but is not used by the plot in this cell
X = np.random.random(size=num_points)
Z = np.random.normal(loc=0.0, scale=2.0, size=num_points)
# scatter plot of Z (x axis) against Y (y axis); assigned to _ to suppress the notebook output
_ = plt.scatter(Z, Y)
# use the same scale on both axes
plt.axis('equal')

In [None]:
plt.figure()
#plt.style.use('ggplot')
# colour the bins with the 'jet' colormap
my_cmap = plt.cm.jet
# 2-D histogram (heatmap) of the (Z, Y) points with bins=100; assigned to _ to suppress the notebook output
_ = plt.hist2d(Z, Y, bins=100,cmap = my_cmap)
# use the same scale on both axes
plt.axis('equal')
# fetch the current axes and call grid() on them
axes=plt.gca()
axes.grid()
# axes.set_xlim([-2,2])
# axes.set_ylim([-4,4])

In [None]:
# add a colorbar to the current figure (presumably for the 2-D histogram drawn in the previous cell)
plt.colorbar()

### 3.8 Animations

In [None]:
import matplotlib.animation as animation

# number of samples; update() also compares the frame number against n to decide when to stop
n = 100
# n random values; update() histograms the first `curr` of them on each frame
x = np.random.randn(n)

In [None]:
# plotting callback for the animation; curr is the current frame number
def update(curr):
    """Redraw the histogram for animation frame ``curr``.

    Histograms the first ``curr`` values of the notebook-level sample ``x``
    on the current axes. When ``curr`` equals the notebook-level ``n``, the
    event source of the notebook-level animation ``a`` is stopped.
    """
    # last frame reached: stop the animation (held in the variable `a`)
    if curr == n:
        a.event_source.stop()

    # clear the current axes, then histogram the samples seen so far
    plt.cla()
    plt.hist(x[:curr], bins=np.arange(-4, 4, 0.5))
    # fixed axis limits so the view does not change between frames
    plt.axis([-4, 4, 0, 30])

    axes = plt.gca()
    axes.set_title('Sampling the Normal Distribution')
    axes.set_xlabel('Value')
    axes.set_ylabel('Frequency')
    # show the running sample count at data coordinates (3, 27)
    plt.annotate('n = {}'.format(curr), [3, 27])

In [None]:
fig = plt.figure()
# animate fig by calling update repeatedly with interval=100; the animation is kept in `a`,
# which update() uses to stop the animation
a = animation.FuncAnimation(fig, update, interval=100)

### 3.9 Interactivity

In [None]:
from random import shuffle
origins = [
    'China', 'Brazil', 'India', 'USA', 'Canada',
    'UK', 'Germany', 'Iraq', 'Chile', 'Mexico',
]

# randomise the order of the origins in place
shuffle(origins)

# ten random heights and weights, paired row by row with the shuffled origins
heights = np.random.rand(10)
weights = np.random.rand(10)
df = pd.DataFrame({'height': heights, 'weight': weights, 'origin': origins})

In [None]:
plt.figure()
# picker=5 means the mouse doesn't have to click directly on a point, but can be up to 5 pixels away
plt.scatter(df['height'], df['weight'], picker=5)
# label both axes of the current axes
plt.gca().set_ylabel('Weight')
plt.gca().set_xlabel('Height')

In [None]:
def onpick(event):
    """Show the origin of the picked point in the title of the current axes."""
    # event.ind holds the indices of the picked points; use the first one
    picked_row = df.iloc[event.ind[0]]
    title = 'Selected item came from {}'.format(picked_row['origin'])
    plt.gca().set_title(title)


# register onpick as the handler called when a 'pick_event' is detected
# on the current figure's canvas
plt.gcf().canvas.mpl_connect('pick_event', onpick)

### 3.10 Plotting with Pandas 

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-<name>'
# and later removed the old names, so use whichever spelling this
# installation provides.
style_name = 'seaborn-colorblind'
if style_name not in plt.style.available:
    style_name = 'seaborn-v0_8-colorblind'
plt.style.use(style_name)

# load the iris data set from the working directory and preview the first rows
iris = pd.read_csv('iris.csv')
iris.head()

In [None]:
# Matrix of pairwise scatter plots of iris's columns.
# The plotting helpers live in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
# The trailing semicolon suppresses the notebook's output echo.
pd.plotting.scatter_matrix(iris);

In [None]:
plt.figure()
# Parallel-coordinates plot of iris, with the lines grouped by the 'Name' column.
# The plotting helpers live in pandas.plotting; the old pandas.tools.plotting
# module no longer exists in current pandas releases.
pd.plotting.parallel_coordinates(iris, 'Name');

### 3.11 Seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure()
# Strip plot of PetalLength for each Name.
# x and y are passed by keyword: seaborn 0.12 and later no longer accept them
# as positional arguments, while the keyword form works in every version.
sns.stripplot(x='Name', y='PetalLength', data=iris);

In [None]:
plt.figure(figsize=(8,6))
# Left subplot: swarm plot of PetalLength for each Name.
# x and y are passed by keyword: seaborn 0.12 and later no longer accept them
# as positional arguments, while the keyword form works in every version.
plt.subplot(121)
sns.swarmplot(x='Name', y='PetalLength', data=iris);
# Right subplot: violin plot of the same data.
plt.subplot(122)
sns.violinplot(x='Name', y='PetalLength', data=iris);