In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

## Linear Regression

In [None]:
# fuel efficiency dataset: read the CSV and discard every row that has a
# missing value, in one chained expression
df_cars = pd.read_csv(
    'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'
).dropna()

# alternative loader, left disabled (needs seaborn):
#import seaborn as sns
#df_cars = sns.load_dataset('mpg')

In [None]:
# preview the first few rows to see which columns are available
df_cars.head()

Which variables might have a linear relationship? Which definitely do not?

In [None]:
# scatter plot

In [None]:
# try using seaborn's pairplot feature!

In [None]:
plt.scatter(x=df_cars['weight'], y=df_cars['horsepower'])
plt.xlabel('weight')
plt.ylabel('horsepower')

# pick out two random points
# Draw row *positions* in [0, len(df_cars)) and read them with .iloc.
# Label-based .loc lookups are unsafe here: dropna() may have left gaps in the
# index labels, so a randomly drawn label might not exist (KeyError).
# Redraw until the two weights differ; otherwise the line through the two
# points would be vertical (or undefined) and its slope could not be computed.
while True:
    idx1, idx2 = np.random.randint(low=0, high=len(df_cars), size=2)

    # use .item() to get a plain Python number rather than a NumPy scalar
    x1 = df_cars['weight'].iloc[idx1].item()
    y1 = df_cars['horsepower'].iloc[idx1].item()
    x2 = df_cars['weight'].iloc[idx2].item()
    y2 = df_cars['horsepower'].iloc[idx2].item()

    if x1 != x2:
        break


# plot a line through two points
plt.axline((x1, y1), (x2, y2), color='green')

plt.show()

How can we decide how "well" a line fits our data? How about a particular datapoint?

In [None]:
# another random point
# Draw a row *position* in [0, len(df_cars)) and read it with .iloc;
# label-based .loc lookups can raise KeyError because dropna() may have left
# gaps in the index labels.
idx3 = np.random.randint(low=0, high=len(df_cars))

# use .item() to get a plain Python number rather than a NumPy scalar
x3 = df_cars['weight'].iloc[idx3].item()
y3 = df_cars['horsepower'].iloc[idx3].item()

plt.scatter(x=x3, y=y3)

# same line as before, through (x1, y1) and (x2, y2)
plt.axline((x1, y1), (x2, y2), color='green')
plt.show()

Denote our line as:

$$
Y = \beta_0 + \beta_1 X
$$

For a particular $(x,y)$ we denote the prediction as

$$
\hat{y} = \beta_0 + \beta_1 x,
$$

and then the squared error between the true value and the prediction as

$$
(\hat{y} - y)^2 = ((\beta_0 + \beta_1 x) - y)^2.
$$

In [None]:
# slope of the green line, then its height at x3 (point-slope form)
m = (y2 - y1) / (x2 - x1)
y_pred = m * (x3 - x1) + y1

plt.scatter(x=x3, y=y3)
plt.axline((x1, y1), (x2, y2), color='green')

# red segment: vertical gap between the true value and the prediction
plt.plot([x3, x3], [y3, y_pred], color='red')

plt.show()

How would we get the error for the whole line?

The **R**esidual **S**um of **S**quares for a particular line with coefficients $\hat{\beta}_0$ and $\hat{\beta}_1$:
$$
RSS = \displaystyle \sum_{i=1}^N(\hat{y}_i-y_i)^2 = \sum_{i=1}^N((\hat{\beta}_0+\hat{\beta}_1 x_i)-y_i)^2
$$


In [None]:
# the error for this particular line

def y_pred_line(x, m, b):
    """Return the value at ``x`` of the line with slope ``m`` and intercept ``b``."""
    slope_term = m * x
    return slope_term + b
    
# slope and intercept of the line through (x1, y1) and (x2, y2)
m = (y2 - y1) / (x2 - x1)
b = y1 - m * x1

# zip pairs each car's weight with its horsepower, so we can walk over
# both columns at once and square each prediction error
sq_errors = [
    (y_pred_line(x, m, b) - y)**2
    for x, y in zip(df_cars['weight'], df_cars['horsepower'])
]

# residual sum of squares for this particular line
RSS = sum(sq_errors)
RSS

Goal: Find $\beta_0$ and $\beta_1$ such that the $RSS$ is as small as possible.

Let's bring in some back-up.

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

```statsmodels``` is a nice package for statistical modeling. We'll import its ```ols``` (Ordinary Least Squares) function.

The dependent variable (or $Y$) appears on the left. It has to be a column from your data frame, in this case ```horsepower```. The ```~``` tells us that we are writing $Y$ as a function of the variables that come next. Since we're doing simple ```ols``` with only one independent predictor variable (or $X$), the next variable ```weight``` completes the model specification. Finally, we point to the dataframe we're using.

In [None]:
# specify the model: horsepower (Y) as a linear function of weight (X)
model = ols(formula='horsepower ~ weight', data=df_cars)

In [None]:
# fit the model after creating it
# creating the model only specifies it; .fit() does the estimation
res = model.fit()
# last expression in the cell, so the summary is displayed as the cell output
res.summary()

What is all of this?!
- For now, pay attention to the coefficient estimates.
- How do we interpret them as a line? How about in non-statistical language?

In [None]:
# seaborn can also compute the OLS regression line and draw it over the scatter plot;
# the shaded band around the line is a 95% confidence interval
import seaborn as sns
sns.lmplot(data=df_cars, x='weight', y='horsepower')