In [1]:
import os
import random

In [2]:
from IPython.display import Image

In [3]:
import pandas as pd
import numpy as np

In [4]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
from bokeh.layouts import gridplot

In [5]:
output_notebook()

In [6]:
import statsmodels.formula.api as smf
from scipy import stats

In [7]:
def resample(l):
    """Draw a bootstrap sample from ``l``.

    Picks ``len(l)`` items from ``l`` at random, with replacement, and
    returns them as a new list. An empty input yields an empty list.
    """
    return [random.choice(l) for _ in range(len(l))]

In [8]:
def repeat_resample(sample_a, sample_b, num_iter = 1000):
    """Bootstrap the difference in means between two samples.

    On each of ``num_iter`` iterations both samples are resampled with
    replacement (via ``resample``) and the difference
    ``mean(resampled a) - mean(resampled b)`` is recorded.

    Returns the list of recorded differences, one per iteration.
    """
    return [
        np.mean(resample(sample_a)) - np.mean(resample(sample_b))
        for _ in range(num_iter)
    ]

In [9]:
def make_bar(labels, nums, title = None, y_range = None, plot_width = 350, plot_height = 350):
    """Build a vertical bar chart.

    ``labels`` supplies the x position of each bar and ``nums`` the bar
    heights. ``title``, ``y_range``, ``plot_width`` and ``plot_height`` are
    forwarded to ``figure``. Vertical grid lines are turned off.

    Returns the figure; pass it to ``show`` to render.
    """
    figure_options = dict(
        title = title,
        plot_width = plot_width,
        plot_height = plot_height,
        y_range = y_range,
    )
    bar_fig = figure(**figure_options)
    bar_fig.vbar(x = labels, top = nums, width = 0.9)
    # No vertical grid lines behind the bars.
    bar_fig.xgrid.grid_line_color = None
    return bar_fig

In [10]:
def hist(l):
    """Build a density histogram of the values in ``l``.

    Bin densities and edges come from ``np.histogram(l, density=True)``;
    each bin is drawn as a semi-transparent rectangle from 0 up to its
    density.

    Returns the figure; pass it to ``show`` to render.
    """
    densities, bin_edges = np.histogram(l, density=True)
    hist_fig = figure()
    hist_fig.quad(
        top = densities,
        bottom = 0,
        left = bin_edges[:-1],
        right = bin_edges[1:],
        alpha = .4,
    )
    return hist_fig

# Overview

## Goals for this module
* understand linear regression
* understand SSR
* understand P value for linear regression

In [11]:
# Load data/death_rate.csv into the module-level frame that the plotting and
# resampling functions below read from.
DF = pd.read_csv(os.path.join('data', 'death_rate.csv'))
# Show the first rows as the cell output.
DF.head()

Unnamed: 0,annual_precipitation,jan_temp,jul_temp,percent_older_60,household_size,school_over_22,has_kitchen,pop_per_square_mile,per_nonwhite,per_office_workers,poor_families,pollution_hydorocarbons,pollution_nitrogen,pollution_sulfter_diox,relative_humidity,death_rate
0,36,27,71,8.1,3.34,11.4,81.5,3243,8.8,42.6,11.7,21,15,59,59,921.87
1,35,23,72,11.1,3.14,11.0,78.8,4281,3.6,50.7,14.4,8,10,39,57,997.875
2,44,29,74,10.4,3.21,9.8,81.6,4260,0.8,39.4,12.4,6,6,33,54,962.354
3,47,45,79,6.5,3.41,11.1,77.5,3125,27.1,50.2,20.6,18,8,24,56,982.291
4,43,35,77,7.6,3.44,9.6,84.6,6441,24.4,43.7,14.3,43,38,206,55,1071.289


In [12]:
def make_scatter_plot(df):
    """Build a scatter plot of death rate against education level.

    Parameters
    ----------
    df : mapping of column name to values (e.g. a pandas DataFrame)
        Must provide the 'death_rate' and 'school_over_22' columns.

    Returns
    -------
    The figure holding the scatter glyphs; pass it to ``show`` to render.
    """
    # Plot the frame that was passed in rather than the module-level DF, so
    # the function works for any frame with the two required columns.
    p = figure(title = 'Deaths vs Education',
               x_axis_label = 'school_over_22',
               y_axis_label = 'death_rate')
    p.circle(y = df['death_rate'], x = df['school_over_22'])
    return p

Let's plot the death rate vs the level of education

In [13]:
# Render the death-rate vs. education scatter plot for the loaded data.
show(make_scatter_plot(DF))

We want to include a line of best fit with linear regression, using OLS (ordinary least squares). Specifically, we want to fit a line that minimizes the sum of the squared distances (known as *residuals*) between the points and the fitted line.

![title](data/linear2.jpeg)

The fitted line is in red. Each point on the line is denoted as ŷ (y hat). The actual points are y. I have illustrated the concept for point 2. You find the distance between the line and the actual point and square it.

To determine the sum of squared residuals (SSR), we add up all the squared distances:

$SSR = (y_1 - \hat{y}_1)^2 + (y_2 - \hat{y}_2)^2 + (y_3 - \hat{y}_3)^2 + \dots$

or

$SSR = \sum_{i=1}^n (y_i - \hat{y}_i)^2$

For OLS, only one such line will solve the equation to minimize SSR. Generally, linear algebra can solve this equation quickly and efficiently. The result will be an equation in the form:

$y = b + mx$

where b is the intercept, and m is the slope of the line

Answer the following: Is the line better fitted if SSR is small, or if SSR is large? Why? (Do we want a lot of distance between the line and the points, or a little distance?)


In [22]:
def add_linear_fit():
    """Overlay the least-squares line on the death-rate/education scatter.

    Fits ``death_rate`` against ``school_over_22`` from the module-level
    ``DF`` with ``stats.linregress`` and draws the fitted value for every
    observed x on top of the scatter plot.

    Returns the figure; pass it to ``show`` to render.
    """
    scatter = make_scatter_plot(DF)
    death_rates = DF['death_rate'].tolist()
    schooling = DF['school_over_22'].tolist()
    fit = stats.linregress(schooling, death_rates)
    # Predicted death rate on the fitted line at each observed x.
    predicted = [value * fit.slope + fit.intercept for value in schooling]
    scatter.line(x = schooling, y = predicted)
    return scatter
# Render the scatter plot with the fitted regression line drawn on top.
show(add_linear_fit())


If we print the `intercept` and `slope` returned by `stats.linregress` (or call `res.summary()` on a fitted statsmodels OLS model), we can get the intercept and slope. For this equation, it is $y = 1352.9965 - 37.6037x$. That tells us education and the death rate are negatively correlated. The more education you have, the longer you will live.

## Getting a P Value

How good is our fit? Are we confident that if you get more education, you will live longer? In order to evaluate this question, we need:

1. a null hypothesis
2. A p value

For our null hypothesis, we can use a flat line that goes through the mean of our points (a line with slope 0). If there is no relationship between x and y, then the fitted line will be no different from this flat line: its slope will be 0.

In [30]:
def resample_slopes(x, y, num_iter = 1000):
    """Bootstrap the regression slope of ``y`` on ``x``.

    The (x, y) observations are kept paired. On each of ``num_iter``
    iterations the pairs are resampled with replacement (via ``resample``),
    a line is fitted with ``stats.linregress`` and its slope is recorded.

    Returns the list of slopes, one per iteration.
    """
    observations = list(zip(x, y))
    slopes = []
    for _ in range(num_iter):
        drawn = resample(observations)
        drawn_x = [pair[0] for pair in drawn]
        drawn_y = [pair[1] for pair in drawn]
        slopes.append(stats.linregress(drawn_x, drawn_y).slope)
    return slopes

def resample_linear():
    """Bootstrap the slope of death rate on education for the loaded data.

    Takes ``school_over_22`` as x and ``death_rate`` as y from the
    module-level ``DF`` and returns the slopes from ``resample_slopes``.
    """
    death_rates = DF['death_rate'].tolist()
    schooling = DF['school_over_22'].tolist()
    return resample_slopes(schooling, death_rates)
    
    
# Bootstrap slopes for death_rate vs school_over_22; the cells below count and
# summarise this list.
resampled_slopes = resample_linear() 

Just like with the students, we resampled the original sample to create a "population." We took samples from that population and re-ran our model. Our resampled slopes tell us if our original model really has a slope, or if the slope is just random. Our null hypothesis is that the slope is equal to 0. Let's test this out.

In [34]:
# Count how many bootstrap slopes fall on each side of zero.
negative_count = sum(1 for slope in resampled_slopes if slope < 0)
positive_count = sum(1 for slope in resampled_slopes if slope > 0)
print('num less than 0 is {n}'.format(n = negative_count))
print('num greater than 0 is {n}'.format(n = positive_count))

num less than 0 is 1000
num greater than 0 is 0


*All* the slopes are less than 0. None of them are greater than 0. If the data were just random, we would expect some of the slopes to be greater than 0, and some less than 0. Our null hypothesis is that the slope is equal to 0. None of our 1000 resampled slopes reached 0, so the estimated probability of seeing this result if the null hypothesis were true is 0. So our p value is less than .01. (p value < .01). We reject our null hypothesis.

In fact, we can predict the extent of the slope. 

In [35]:
# 95th percentile of the bootstrap slopes: 95% of them are at or below this value.
np.quantile(resampled_slopes, .95)

-20.785776859881253

The *quantile* method gets the value at the 95th percentile: 95% of the resampled slopes are less than approximately -20. So we can say with 95% confidence that the slope is below about -20. We can re-formulate our null hypothesis as "the slope is greater than -19." We can reject this null hypothesis, and say the slope is at most -19 (that is, at least 19 in magnitude). There is a strong relationship between education and living long. So keep studying!

In [55]:
def resample_linear_pollution_sulfter_diox(x):
    """Bootstrap-test whether column ``x`` has a non-zero slope against death rate.

    Despite its name, the function works for any column of the module-level
    ``DF``: ``x`` is the column name used as the predictor and
    ``death_rate`` is the response.

    Parameters
    ----------
    x : str
        Name of the ``DF`` column to use as the predictor.

    Returns
    -------
    tuple
        ``(True, q95)`` when at most 5% of the bootstrap slopes are positive
        or at most 5% are negative, where ``q95`` is the 95% quantile of the
        bootstrap slopes; ``(False, None)`` otherwise.
    """
    y_values = DF['death_rate'].tolist()
    x_values = DF[x].tolist()
    resampled_slopes = resample_slopes(x_values, y_values)

    # Divide by the number of slopes actually produced instead of a
    # hard-coded 1000, so the proportions stay correct if the iteration
    # count of resample_slopes ever changes.
    total = len(resampled_slopes)
    share_positive = sum(1 for slope in resampled_slopes if slope > 0) / total
    share_negative = sum(1 for slope in resampled_slopes if slope < 0) / total

    # Both tails use the same inclusive 5% cut-off (the original mixed
    # `<=` and `<`, treating the two directions differently at the boundary).
    if share_positive <= .05 or share_negative <= .05:
        # NOTE(review): for a positive relationship the 5% quantile would be
        # the bound nearest zero; the 95% quantile is kept for both
        # directions to preserve the existing return value.
        return True, np.quantile(resampled_slopes, .95)
    return False, None

# Example: test the household_size column against death_rate.
resample_linear_pollution_sulfter_diox('household_size')

(False, None)

The above function resamples the pairs of x, y values and gets all the slopes. If about 950 or more of the 1000 slopes fall on the same side of 0 (nearly all greater than 0, or nearly all less than 0), the relationship is significant (we reject the null hypothesis that the slope is equal to 0). If the relationship is significant, it returns the maximum significant value (the 95% quantile of the resampled slopes).

## Exercises
Test if the following are significant, and if so, get the maximum significance.
1. has_kitchen
2. pollution_sulfter_diox
3. poor_families
4. household_size
5. per_nonwhite
6. relative_humidity