In [None]:
# Star-import from the datascience package; names such as Table and `are`
# used below are not imported anywhere else, so presumably come from here.
from datascience import *
import numpy as np

# Draw figures inline in the notebook and apply the 'fivethirtyeight' style.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Load Census Data ##

In [None]:
# Load the census population estimates (by age and sex) from CSV and preview.
full = Table.read_table('../data/nc-est2014-agesex-res.csv')
full

In [None]:
# Narrow the table to sex, age, and the 2010 / 2014 population estimates.
partial = full.select(
    'SEX',
    'AGE',
    'POPESTIMATE2010',
    'POPESTIMATE2014',
)
partial

In [None]:
# Shorten the two population-estimate labels to just their year.
simple = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled('POPESTIMATE2014', '2014')
)
simple

In [None]:
# Order the rows by increasing AGE (ascending is the default direction).
simple.sort('AGE', descending=False)

In [None]:
# Sort by age in the opposite direction: largest AGE values first.
simple.sort('AGE', descending=True)

## Line Plots ##

In [None]:
# Remove the age totals: keep only rows with AGE below 999
# (999 appears to be the code used for the all-ages total rows).
no_999 = simple.where('AGE', are.below(999))

In [None]:
# SEX code 0 is the combined (male + female) figure per the original note;
# keep just those rows, then discard the now-constant SEX column.
everyone = no_999.where('SEX', are.equal_to(0)).drop('SEX')

In [None]:
# Display the combined-sex table (its SEX column has already been dropped).
everyone

In [None]:
# Line plot of the 2010 population estimate against age.
everyone.plot('AGE', '2010')

In [None]:
# Same plot as the previous cell, with a title added through matplotlib.
everyone.plot('AGE', '2010')
plots.title('US Population');  # add a plot label

In [None]:
# Age distribution for two different years: with only the x-axis column
# given, the remaining columns ('2010' and '2014') are each drawn as a line.
everyone.plot('AGE')

### Males and Females in 2014 ###

In [None]:
# Split the per-age rows by SEX code (1 for males, 2 for females) so the two
# groups can be compared age by age; SEX is dropped once it is constant.
males = no_999.where('SEX', are.equal_to(1)).drop('SEX')
females = no_999.where('SEX', are.equal_to(2)).drop('SEX')

In [None]:
# Build one table holding the 2014 male and female counts for each age.
# The three columns are paired purely by row position, so first confirm that
# the male and female tables list exactly the same ages in the same order;
# otherwise the counts would be silently misaligned.
assert np.array_equal(males.column('AGE'), females.column('AGE')), (
    'males and females must list the same ages in the same order'
)

pop_2014 = Table().with_columns(
    'Age', males.column('AGE'),
    'Males', males.column('2014'),
    'Females', females.column('2014')
)
pop_2014

In [None]:
# Plot the 2014 counts in pop_2014 against age (one line per non-Age column).
pop_2014.plot('Age')

In [None]:
# Share of each age group that is female, expressed as a percentage.
total = pop_2014.column('Females') + pop_2014.column('Males')
pct_female = (pop_2014.column('Females') / total) * 100
pct_female

In [None]:
# Trim to three decimal places so the printed values are easier to scan.
pct_female = np.round(pct_female, decimals=3)
pct_female

In [None]:
# Attach the percent-female values to the table under 'Percent female'.
pop_2014 = pop_2014.with_columns('Percent female', pct_female)
pop_2014

In [None]:
# Percent female as a function of age (y-axis limits left at their defaults).
pop_2014.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis of the previous plot! The trend is not as dramatic
# as it first appears, so redraw it with a fixed y-axis.
pop_2014.plot('Age', 'Percent female')
plots.ylim(0, 100);  # set the y-axis range to 0-100 (the full percent scale)

## Scatter Plots ##

In [None]:
# Load the actors table (actors and their highest grossing movies) and preview it.
actors = Table.read_table('../data/actors.csv')
actors

In [None]:
# Scatter plot of 'Number of Movies' against 'Total Gross', one point per actor row.
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Scatter plot of 'Number of Movies' against 'Average per Movie', one point per actor row.
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Show the rows whose 'Average per Movie' value is above 400
# (presumably the outlier(s) in the scatter plot above - TODO confirm the column's units).
actors.where('Average per Movie', are.above(400))

## Bar Charts ##

In [None]:
# Load the highest grossing movies as of 2017 and preview the table.
top_movies = Table.read_table('../data/top_movies_2017.csv')
top_movies

In [None]:
# Take the ten films with the largest adjusted gross. Sorting explicitly
# (rather than taking the first ten rows as-is) keeps the selection correct
# regardless of the row order in the CSV.
top10_adjusted = (
    top_movies
    .sort('Gross (Adjusted)', descending=True)
    .take(np.arange(10))
)
top10_adjusted

In [None]:
# Express the adjusted gross in millions of dollars (3 decimals) and store
# it in a 'Millions' column alongside the original columns.
millions = np.round(top10_adjusted.column('Gross (Adjusted)') / 10 ** 6, decimals=3)
top10_adjusted = top10_adjusted.with_columns('Millions', millions)
top10_adjusted

In [None]:
# A line plot doesn't make sense here: it joins the ten films by a line
# over 'Year' as if they formed one continuous series.
top10_adjusted.plot('Year', 'Millions')

In [None]:
# Horizontal bar chart instead: one bar per film title, sized by 'Millions'.
top10_adjusted.barh('Title', 'Millions')

## Histogram ##

In [None]:
# Reload the full movie list and add a 'Millions' column: the adjusted
# gross divided by one million, rounded to 3 decimals.
top_movies = Table.read_table('../data/top_movies_2017.csv')
millions = np.round(top_movies.column('Gross (Adjusted)') / 10 ** 6, decimals=3)
top_movies = top_movies.with_columns('Millions', millions)
top_movies

In [None]:
# What is the distribution of adjusted gross revenue (in millions)?
# Draw a histogram of the 'Millions' column.
top_movies.hist('Millions')

## Group (counts) ##

In [None]:
# Which ten studios produced the most of these top movies? Count the rows
# per studio, keep the ten largest counts, and draw a horizontal bar chart.
(
    top_movies
    .group('Studio')
    .sort('count', descending=True)
    .take(np.arange(10))
    .barh('Studio')
)

## Group (statistics) ##

In [None]:
# Mean of the 'Millions' column per studio, charted for the ten studios
# with the highest means.
top_movies_mils = top_movies.select('Studio', 'Millions')
(
    top_movies_mils
    .group('Studio', np.mean)
    .relabel(1, 'mean')
    .sort('mean', descending=True)
    .take(np.arange(10))
    .barh('Studio')
)