# Chapter 03 - Visualization

In [None]:
from collections import Counter

import matplotlib.pyplot as plt

## Simple (Line) Plots

In [None]:
# Nominal GDP figures (plotted in billions of dollars), one per decade
# from 1950 through 2010.
gdp = [300.2, 543.3, 1075.9, 2862.5, 5979.6, 10289.7, 14598.3]
years = range(1950, 2020, 10)

# Decades go on the x-axis and GDP on the y-axis; points are drawn as
# green circles joined by a solid line.
line_style = {'color': 'green', 'marker': 'o', 'linestyle': 'solid'}
plt.plot(years, gdp, **line_style)

# Describe the chart: y-axis label first, then the title.
plt.ylabel('Billions of $')
plt.title('Nominal GDP')

# Render the finished figure.
plt.show()

A simple line chart

## Bar Charts

In [None]:
# Movie titles and their Academy Award counts (parallel lists).
movies = ['Annie Hall', 'Ben-Hur', 'Casablanca', 'Gandhi', 'West Side Story']
num_oscars = [5, 11, 3, 8, 10]

# One bar per movie at x-coordinates 0, 1, 2, 3, 4; the bar heights come
# from num_oscars.
bar_positions = range(len(movies))
plt.bar(bar_positions, num_oscars)

# Replace the numeric x-ticks with the movie names, one per bar center.
plt.xticks(bar_positions, movies)
plt.ylabel('# of Academy Awards')
plt.title('My Favorite Movies')

plt.show()

A simple bar chart

In [None]:
### Histogram (a Variant of a Bar Chart)

A bar chart can be a good choice for plotting histograms of bucketed numeric values.

In [None]:
grades = [83, 95, 91, 87, 70, 0, 85, 82, 100, 67, 73, 77, 0]

# Count how many grades fall into each decile (0, 10, ..., 90). A grade of
# 100 is capped at 90 so that it is counted together with the 90's.
histogram = Counter(min(10 * (grade // 10), 90) for grade in grades)

# Every bar is 10 wide and centered 5 to the right of its decile's lower
# bound, so it covers exactly that decile.
bar_centers = [decile + 5 for decile in histogram]
bar_heights = list(histogram.values())
plt.bar(
    bar_centers,
    bar_heights,
    width=10,
    edgecolor=(0, 0, 0),  # black outline around each bar
)

# Show x from -5 to 105 and y from 0 to 5.
plt.axis([-5, 105, 0, 5])

# Tick marks on the x-axis at 0, 10, ..., 100.
plt.xticks(list(range(0, 101, 10)))

plt.title('Distribution of Exam 1 Grades')
plt.xlabel('Decile')
plt.ylabel('# of Students')

plt.show()

The third argument to `plt.bar` specifies the bar width. In our case,
we choose a width of 10, to fill the entire decile. We also shifted 
the bars to the right by 5, so that, for example, the "10" bar (which
corresponds to the decile 10-20) would have its center at 15 and hence
occupy the correct range. We added a black edge to each bar to make it
visually distinct.

The call to `plt.axis` indicates that we want the x-axis to range from 
-5 to 105 (just to leave a little space on the left and right), and 
that the y-axis should range from 0 to 5. And the call to `plt.xticks`
puts x-axis labels at 0, 10, 20, ..., 100.

### Misleading bar chart y-axis

Be judicious when using `plt.axis`. When creating bar charts it is 
considered especially bad form for your y-axis to **not** start at 0, 
since this is an easy way to mislead people.

In [None]:
# Two yearly counts to compare: 500 mentions in 2017 and 505 in 2018.
years = [2017, 2018]
mentions = [500, 505]

plt.bar(years, mentions, width=0.8)
plt.xticks(years)
plt.ylabel('# of times I heard someone say "data science"')
plt.title('Look at the "Huge" Increase!')

# Deliberately misleading: the y-axis is clipped to 499..506, so only the
# very top of each bar is visible.
misleading_limits = [2016.5, 2018.5, 499, 506]  # [x_min, x_max, y_min, y_max]
plt.axis(misleading_limits)

plt.show()

A chart with misleading y-axis 

Since the y-axis **does not** begin at zero (it starts at 499), the 2018
bar is drawn six times as tall as the 2017 bar, so the graph communicates
a **much** larger difference than the actual 1% increase.

### More appropriate bar chart y-axis

In [None]:
# Plot the `years` and `mentions` defined in the previous cell again, this
# time with a y-axis that starts at 0.
plt.bar(years, mentions, width=0.8)
plt.title('Not So Huge Anymore')
plt.ylabel('# of times I heard someone say "data science"')
plt.xticks(years)

# With the full bars visible, the two values look almost the same height.
honest_limits = [2016.5, 2018.5, 0, 550]  # [x_min, x_max, y_min, y_max]
plt.axis(honest_limits)

plt.show()

## Line Charts

Line charts are a good choice for showing _trends_.

In [None]:
# Illustrative series over nine complexity levels: variance doubles at each
# step (1, 2, 4, ..., 256), squared bias is the same sequence in reverse
# (256, ..., 2, 1), and total error is their element-wise sum.
variance = [2 ** n for n in range(9)]
bias_squared = variance[::-1]
total_error = [v + b for v, b in zip(variance, bias_squared)]

# x-coordinates are simply the positions 0..8 of the values above.
xs = list(range(len(variance)))

# Each call to `plt.plot` adds another series to the same chart.
plt.plot(xs, variance, 'g-', label='variance')  # green solid line
plt.plot(xs, bias_squared, 'r-.', label='bias^2')  # red dot-dashed line
plt.plot(xs, total_error, 'b:', label='total error')  # blue dotted line

# Because every series has a label, the legend comes "for free."
# 'upper center' is the named form of the numeric location code 9.
plt.legend(loc='upper center')
plt.xlabel('model complexity')
plt.xticks([])  # hide the x tick marks
plt.title('The Bias-Variance Tradeoff')
plt.show()

## Scatterplots

Scatterplots are the right choice for visualizing the relationship
between two paired sets of data.

In [None]:
# Paired observations: friends[i] and minutes[i] describe the same point,
# and labels[i] is the letter drawn next to it.
friends = [70, 65, 72, 63, 71, 64, 60, 64, 67]
minutes = [175, 170, 205, 120, 220, 130, 105, 145, 190]
labels = list('abcdefghi')  # 'a' through 'i'

plt.scatter(friends, minutes)

# Attach a letter to every point, nudged 5 points right and 5 points down
# so the text does not sit on top of the marker.
for point_label, x, y in zip(labels, friends, minutes):
    plt.annotate(
        point_label,
        xy=(x, y),
        xytext=(5, -5),
        textcoords='offset points',
    )

plt.xlabel('# of friends')
plt.ylabel('Daily minutes spent on site')
plt.title('Daily Minutes vs. Number of Friends')

plt.show()

### Misleading scatterplot

If you're scattering comparable variables, you might get a misleading
picture if you let matplotlib choose the scale.

In [None]:
# Paired grades from two tests (parallel lists): test 1 goes on the x-axis,
# test 2 on the y-axis.
test_1_grades = [99, 90, 85, 97, 80]
test_2_grades = [100, 85, 60, 90, 70]

# No axis limits are set here, so matplotlib picks a scale for each axis
# on its own.
plt.scatter(x=test_1_grades, y=test_2_grades)

plt.xlabel('test 1 grade')
plt.ylabel('test 2 grade')
plt.title('Axes Are NOT Comparable')

plt.show()

The axes have different scales: the x-axis runs from 80 to 97.5 (a span
of 17.5), but the y-axis runs from 60 to 100 (a span of 40).

A call to `plt.axis('equal')` more accurately shows that most of the 
variation in the data occurs on test 2.

In [None]:
# Paired grades from two tests (parallel lists): test 1 goes on the x-axis,
# test 2 on the y-axis.
test_1_grades = [99, 90, 85, 97, 80]
test_2_grades = [100, 85, 60, 90, 70]

plt.scatter(test_1_grades, test_2_grades)

# This chart uses equal axis scaling, so the title must say the axes ARE
# comparable (the original title was copied from the unscaled version).
plt.title('Axes Are Comparable')
# Use the same scale on both axes so distances along x and y can be
# compared directly.
plt.axis('equal')
plt.xlabel('test 1 grade')
plt.ylabel('test 2 grade')

plt.show()