## Lesson 05: Visualizations

In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to every figure drawn below.
plots.style.use('fivethirtyeight')

## Census Data

In [None]:
# Load the census estimates CSV (path is relative to the notebook's working
# directory) and display the resulting table.
full = Table.read_table('data/nc-est2019-agesex-res.csv')
full

**Question 1.** Create a new table named `partial` that only contains the columns `SEX`, `AGE`, `POPESTIMATE2010`, `POPESTIMATE2019`.

In [None]:
# Placeholder for the Question 1 answer: replace `...` with an expression that
# keeps only SEX, AGE, POPESTIMATE2010 and POPESTIMATE2019 from `full`.
partial = ...
partial

**Question 2.** Rename the columns `POPESTIMATE2010`, `POPESTIMATE2019` to `2010` and `2019` respectively. Save this to a new table named `simple`.

In [None]:
# Placeholder for the Question 2 answer: `simple` should be `partial` with the
# two POPESTIMATE columns relabeled '2010' and '2019'. Later cells read those
# exact labels, so every cell below fails until this is filled in.
simple = ...
simple

**Question 3.** Sort the `simple` table in descending order by `AGE`. What do you notice?

In [None]:
# Display the rows with the largest AGE values first.
simple.sort('AGE', descending=True)

In [None]:
# Keep only the rows whose AGE is below 999.
# NOTE(review): 999 looks like a sentinel for an all-ages total row rather
# than a real age - confirm against the census file layout.
no_999 = simple.where('AGE', are.below(999))
no_999

In [None]:
# Rows where SEX equals 0, with the now-constant SEX column removed.
# NOTE(review): SEX == 0 is presumably the both-sexes total - confirm
# against the census file layout.
everyone = (
    no_999
    .where('SEX', are.equal_to(0))
    .drop('SEX')
)
everyone

## Line Plots ##

In [None]:
# Line plot of the '2010' column against AGE.
everyone.plot('AGE', '2010')

The plot above should be labeled. Let's add a title.

In [None]:
# Same plot as before, now with a title. The trailing semicolon keeps the
# return value of plots.title from being echoed under the figure.
everyone.plot('AGE', '2010')
plots.title('US Population');

Let's plot the population at each age for both 2010 and 2019.

In [None]:
# Only the x-axis column is named, so every remaining column of `everyone`
# (here '2010' and '2019') is drawn as its own line.
everyone.plot('AGE')

## Males and Females in 2019

Let's compare the male and female counts at each age.

In [None]:
# Placeholders: each should become a table holding one sex's rows that still
# has the 'AGE' and '2019' columns, because the next cell reads both.
# NOTE(review): presumably built from `no_999` by filtering on SEX - confirm
# which SEX codes denote male and female in the census file.
males = ...
females = ...

In [None]:
# Assemble a three-column table, one column at a time: the ages, then the
# 2019 count for each sex.
# NOTE(review): the columns are paired purely by row position, so this assumes
# `males` and `females` list the same ages in the same order - confirm.
pop_2019 = (
    Table()
    .with_column('Age', males.column('AGE'))
    .with_column('Males', males.column('2019'))
    .with_column('Females', females.column('2019'))
)
pop_2019

In [None]:
# One line per remaining column ('Males' and 'Females') against Age.
pop_2019.plot('Age')

Let's calculate the percent female for each age.

In [None]:
# Placeholders: `total` is displayed in the next cell and `pct_female` is
# later passed to np.round, so both are expected to be numeric arrays.
# NOTE(review): presumably total = Males + Females and
# pct_female = 100 * Females / total (later plots use a 0-100 y-axis) - confirm.
total = ...
pct_female = ...
pct_female

In [None]:
# Display `total` for inspection.
total

We should round it to 3 decimal places so that it's easier to read.

In [None]:
# Round every percentage to three decimal places. Re-running this cell is
# harmless because rounding already-rounded values changes nothing.
pct_female = np.around(pct_female, decimals=3)
pct_female

In [None]:
# Now we can add female percent to our table.

In [None]:
# Placeholder: `pop_2019` should gain a column labeled 'Percent female'
# holding `pct_female`. The plots below look the column up by that exact label.
pop_2019 = ...
pop_2019

Let's plot the percent female by age.

In [None]:
# Percent female against Age, with the y-axis range left to matplotlib.
pop_2019.plot('Age', 'Percent female')

Look at the $y$-axis. The trend is not as dramatic as you might think.

In [None]:
# Same plot, but with the y-axis pinned to 0-100 so the curve is shown against
# the full percentage scale. The semicolon hides the value ylim returns.
pop_2019.plot('Age', 'Percent female')
plots.ylim(0, 100);

## Scatter Plots ##

In [None]:
# Load the actors CSV (path is relative to the notebook's working directory)
# and display it.
actors = Table.read_table('data/actors.csv')
actors

In [None]:
# Scatter plot: 'Number of Movies' on the x-axis, 'Total Gross' on the y-axis.
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Same x-axis, now against 'Average per Movie'.
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
# List the rows whose 'Average per Movie' exceeds 400, to identify the
# extreme point(s) in the scatter plot above.
actors.where('Average per Movie', are.above(400))

## Bar Charts

Highest grossing movies as of 2017.

In [None]:
# Load the top-movies CSV (path is relative to the notebook's working
# directory) and display it.
top_movies = Table.read_table('data/top_movies_2017.csv')
top_movies

In [None]:
# The integers 0 through 9: the row indices used in the next cell.
np.arange(10)

In [None]:
# Take the rows at indices 0 through 9 of `top_movies`.
# NOTE(review): these are the "top ten" only if the file is already ordered by
# the ranking of interest - confirm.
top10_adjusted = top_movies.take(np.arange(10))
top10_adjusted

Let's convert to millions of dollars for readability.

In [None]:
# Placeholders: `millions` should hold the dollar figures scaled to millions,
# and `top10_adjusted` should gain a column labeled 'Millions', which the
# charts below plot by that exact label.
# NOTE(review): which gross column is being scaled is not visible here - confirm.
millions = ...
top10_adjusted = ...
top10_adjusted

In [None]:
# Deliberate counter-example: a line plot of 'Millions' against 'Year'. The
# text that follows explains why this chart type is a poor fit.
top10_adjusted.plot('Year', 'Millions')

A line plot doesn't make sense here.

In [None]:
# Horizontal bar chart: one bar per Title, bar length given by 'Millions'.
# The semicolon hides the value plots.title returns.
top10_adjusted.barh('Title', 'Millions')
plots.title('Top Ten');

## Binning

In [None]:
# Age of each movie in years, measured from a fixed reference year of 2021,
# followed by the youngest and oldest ages as a quick range check.
ages = 2021 - top_movies.column('Year')
(ages.min(), ages.max())

Let's add the ages to our table.

In [None]:
# Attach the computed ages as an 'Age' column. `top_movies` is rebound to the
# widened table, so every later cell sees the extra column.
top_movies = top_movies.with_column('Age', ages)
top_movies

In [None]:
# Hand-picked, unequal-width bin edges for movie ages: narrow bins for recent
# movies, wide bins for old ones.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 105)

In [None]:
# Count how many movies fall between each pair of consecutive edges in
# `my_bins`. Later cells read the counts from the 'Age count' column.
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

In [None]:
# Total number of movies that landed in some bin.
np.sum(binned_data.column('Age count'))

In [None]:
# Equally spaced edges 0, 25, ..., 125 (the stop value 126 is exclusive).
np.arange(0, 126, 25)

In [None]:
# Bin the ages with the edges 0, 25, ..., 125.
top_movies.bin('Age', bins = np.arange(0, 126, 25))

In [None]:
# Same binning, but the edges now stop at 100. Compare the counts with the
# previous cell to see how the final edge is treated.
top_movies.bin('Age', bins = np.arange(0, 101, 25))

In [None]:
 np.arange(0, 101, 25)

In [None]:
# Rows whose Age is exactly 100, i.e. sitting right on the final bin edge
# used above.
top_movies.where('Age', 100)

## Histograms ##

In [None]:
# Re-display the custom bin edges before using them for a histogram.
my_bins

In [None]:
# Re-display the per-bin counts for comparison with the histogram below.
binned_data

Let's make our first histogram.

In [None]:
# Histogram of Age using the unequal-width edges in `my_bins`; 'Year' is
# passed as the unit of the Age values.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

Let's try equally spaced bins instead.

In [None]:
# Edges 0, 10, ..., 100 (the stop value 110 is exclusive) give ten
# equal-width bins.
top_movies.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year')

Let's try not specifying any bins.

In [None]:
# No `bins` argument, so the library chooses its default binning.
top_movies.hist('Age', unit = 'Year')

Add a column containing what percent of movies are in each bin.

In [None]:
# Percent of all movies that fall into each bin. The denominator is the
# table's actual row count rather than a hard-coded 200, so the percentages
# stay correct if the dataset is replaced or filtered.
binned_data = binned_data.with_column(
    'Percent', 100 * binned_data.column('Age count') / top_movies.num_rows)

binned_data

In [None]:
# Sanity check: the percentages add up to 100 when every movie landed in
# some bin.
np.sum(binned_data.column('Percent'))