In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Review of plots: welcome survey ##

In [None]:
survey = Table.read_table('welcome_survey.csv')

In [None]:
survey

In [None]:
survey.hist('Extraversion')

In [None]:
survey.hist('Hours of Sleep')

In [None]:
min(survey.column('Hours of Sleep')), max(survey.column('Hours of Sleep'))

In [None]:
survey.hist('Hours of Sleep', bins=np.arange(4, 11, 1))

In [None]:
survey.bin('Hours of Sleep', bins=make_array(0,8,15))

In [None]:
# percent of people who report 8+ hours of sleep?
...

In [None]:
np.average(survey.column('Number of Texters'))

In [None]:
survey.hist('Number of Texters')

In [None]:
survey.hist('Number of Texters', bins=np.arange(0,42,1))

In [None]:
survey = survey.where('Number of Texters', are.below(41))

In [None]:
survey.hist('Number of Texters', bins=[0,2,4,6,10,15,20,30])

In [None]:
survey.barh('Handedness')

In [None]:
handedness = survey.group('Handedness')
handedness

In [None]:
# bar and barh expect a categorical column
# to plot against the corresponding numerical column(s)
handedness.barh('Handedness')

## Predictions with heights and heredity ##

In [None]:
galton = Table.read_table('galton.csv')

In [None]:
galton

In [None]:
galton.hist('midparentHeight')

In [None]:
galton.hist('childHeight')

In [None]:
galton.hist('midparentHeight', 'childHeight')

Can we tell from these histograms how many children's heights are more than 3 inches away from their midparent height?

## Functions and predictions

In [None]:
galton

In [None]:
galton.scatter('midparentHeight', 'childHeight')

In [None]:
galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
nearby = galton.where('midparentHeight', are.between(67.5, 68.5))
nearby.column('childHeight').mean()

In [None]:
# Scatter of child height against midparent height, with red guide lines
# marking the 67.5 to 68.5 midparent band and a gold dot at the band's
# average child height.
galton.scatter('midparentHeight', 'childHeight')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Compute the dot's height from the data instead of hard-coding 66.24
# (presumably a rounded copy of the previous cell's output), so the dot stays
# correct if the data or the band changes and the cell does not rely on
# leftover kernel state.
band_average = galton.where(
    'midparentHeight', are.between(67.5, 68.5)).column('childHeight').mean()
plots.scatter(68, band_average, color='gold', s=50);

In [None]:
def predictHeight(h):
    """Predict a child's height from the midparent height `h`.

    Keeps the rows of `galton` whose 'midparentHeight' falls in the window
    from h - 1/2 to h + 1/2 and returns the mean of their 'childHeight'
    values. (Per the datascience docs, `are.between` includes the lower
    bound and excludes the upper one.)
    """
    half_window = 1/2
    in_window = are.between(h - half_window, h + half_window)
    similar_families = galton.where('midparentHeight', in_window)
    child_heights = similar_families.column('childHeight')
    return child_heights.mean()

In [None]:
predictHeight(68)

In [None]:
predictHeight(70)

In [None]:
predictHeight(73)

In [None]:
predicted_heights = galton.apply(predictHeight, 'midparentHeight')

In [None]:
galton = galton.with_column('predictedHeight', predicted_heights)

In [None]:
galton.select('midparentHeight', 'childHeight', 'predictedHeight').scatter('midparentHeight')

In [None]:
def difference(x, y):
    """Return `x` minus `y`.

    Used with `Table.apply` in the next cell to compute prediction errors
    as actual value minus predicted value.
    """
    gap = x - y
    return gap

In [None]:
predict_err = galton.apply(difference, 'childHeight', 'predictedHeight')

In [None]:
galton = galton.with_column('errors',predict_err)

In [None]:
galton.hist('errors', group='sex')

**Based on the survey data, can we predict Extraversion, Number of Texters, or Hours of Sleep?**

In [None]:
survey

**Optional:** add some lines and dots to the graphs below to add information. For example, a scatter plot shows the spread of values but not the averages.

This code: `plots.plot([x1, x2], [y1, y2], color='red', lw=2)`
draws a red line from (x1,y1) to (x2,y2)

This code: `plots.scatter(x, y, color='gold', s=50)` draws a gold dot at (x, y).

`lw` and `s` describe the width of the line and the size of the dot respectively.

In [None]:
survey.scatter('Extraversion','Number of Texters')

In [None]:
survey.scatter('Hours of Sleep', 'Number of Texters')

In [None]:
def year_to_int(y):
    """Convert an undergraduate-year label into its integer rank.

    "First" -> 1, "Second" -> 2, "Third" -> 3, "Fourth or Higher" -> 4.
    Any other label raises KeyError.
    """
    # Labels listed in ascending order; a label's rank is its 1-based position.
    labels_in_order = ("First", "Second", "Third", "Fourth or Higher")
    rank_by_label = {
        label: rank for rank, label in enumerate(labels_in_order, start=1)
    }
    return rank_by_label[y]

In [None]:
survey = survey.with_column(
    "Year", survey.apply(year_to_int, "Undergrad Year"))

In [None]:
survey.scatter('Year', 'Hours of Sleep')

In [None]:
survey.scatter('Year', 'Number of Texters')

In [None]:
by_year = survey.group('Year', np.average)
by_year

In [None]:
by_year.plot("Year", "Number of Texters average")

In [None]:
by_year.plot("Year", "Hours of Sleep average")

In [None]:
by_extra = survey.group('Extraversion', np.average)
by_extra

In [None]:
by_extra.select(0,1).plot('Extraversion')

In [None]:
by_extra.select(0,2).plot('Extraversion')

**Question 1:** Write a function, based on `predictHeight`, to predict hours of sleep, extraversion, or number of texters based on one of the other variables.

In [None]:
# Edit this code to write a predict[SomethingElse] function
def predictHeight(h):
    """Exercise template: average 'childHeight' for `galton` rows near `h`.

    This repeats the earlier `predictHeight` definition as a starting point.
    To adapt it, change the table, the column being matched, the window
    width, and the column being averaged.
    """
    lower, upper = h - 1/2, h + 1/2
    close_rows = galton.where('midparentHeight', are.between(lower, upper))
    return close_rows.column('childHeight').mean()

**Question 2:** Repeat the steps for calculating the prediction errors and then drawing a histogram of the errors.