# Applied Exercises

In [None]:
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures

%matplotlib notebook
warnings.simplefilter(action='ignore', category=FutureWarning)

### 8

In [None]:
# Read Auto.csv into a DataFrame and preview its first rows
# NOTE(review): some copies of Auto.csv encode missing horsepower as '?' -- confirm this
# copy is fully numeric, otherwise the regressions below will not fit.
auto = pd.read_csv(filepath_or_buffer='../../data/Auto.csv')
auto.head()

In [None]:
# Simple regression of mpg on horsepower (statsmodels is used because it can produce summaries)
y = auto['mpg']
X = auto['horsepower']
X_cons = sm.add_constant(X)  # design matrix with an intercept column added
results = sm.OLS(endog=y, exog=X_cons).fit()
results.summary()

In [None]:
# Prediction at horsepower = 98, together with confidence and prediction intervals.
# The single row is laid out as [constant, horsepower] to match X_cons.
X_pred = pd.DataFrame(np.array([[1, 98]]))

preds_info = results.get_prediction(X_pred)
preds = results.predict(X_pred)
preds_info.summary_frame(alpha=0.05)

In [None]:
# Plot mpg against horsepower with the least squares regression line.
# x and y are passed by keyword: seaborn deprecated positional data arguments in 0.11
# and rejects them from 0.12 onwards. ax=ax pins the plot to the axes created here.
fig, ax = plt.subplots(1, 1)
sns.regplot(x=X, y=y, ax=ax, scatter_kws={'color': 'lightblue'},
            line_kws={'color': 'firebrick'}, ci=None)
ax.set_title('MPG vs Horsepower')
ax.set_xlabel('Horsepower')
ax.set_ylabel('MPG')
ax.grid(True)

In [None]:
## Residual plot for the simple model: residuals against fitted values with a lowess smoother
resids = results.resid
yhat = results.fittedvalues

fig, ax = plt.subplots(1, 1)
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.regplot(x=yhat, y=resids, lowess=True, ax=ax,
            line_kws={'color': 'firebrick', 'lw': 1},
            scatter_kws={'color': 'lightblue', 'alpha': 0.5})
# Dashed zero line spanning the range of fitted values
ax.hlines(y=0, xmin=np.min(yhat), xmax=np.max(yhat), color='firebrick', linestyle='--', lw=3)
ax.set_title('Residual plot of model')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.grid(True)

### 9

In [None]:
# Scatter plot matrix over the columns of the Auto data
sns.pairplot(data=auto)

In [None]:
# Correlation matrix, computed after removing the 'name' column
auto_trim = auto.drop(columns='name')
auto_trim.corr()

In [None]:
# Multiple linear regression: mpg on every other column of auto_trim
y = auto_trim['mpg']
X = auto_trim.drop(columns='mpg')
X_cons = sm.add_constant(X)  # design matrix with an intercept column added
multiple_lm = sm.OLS(endog=y, exog=X_cons).fit()
multiple_lm.summary()

In [None]:
# Residual plot of the multiple regression fit: residuals against fitted values
resids = multiple_lm.resid
yhat = multiple_lm.fittedvalues

fig, ax = plt.subplots(1, 1)
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.regplot(x=yhat, y=resids, lowess=True, ax=ax,
            line_kws={'color': 'firebrick', 'lw': 1},
            scatter_kws={'color': 'lightblue', 'alpha': 0.5})
# Dashed zero line spanning the range of fitted values
ax.hlines(y=0, xmin=np.min(yhat), xmax=np.max(yhat), color='firebrick', linestyle='--', lw=3)
ax.set_title('Residual plot of model')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.grid(True)

In [None]:
# Influence (leverage) plot for the multiple regression fit
sm.graphics.influence_plot(results=multiple_lm)

In [None]:
# Model with interaction terms. In the formula, `cylinders*weight` stands for both
# main effects plus their product. `smf` comes from the notebook's import cell, so
# it is not re-imported here.
inter_model = smf.ols(formula='mpg ~ cylinders*weight', data=auto_trim).fit()
inter_model.summary()

In [None]:
# Trying different transformations - polynomial degree 2.
# PolynomialFeatures is imported in the notebook's import cell. With its defaults it
# returns a bias column, the original predictors, and all degree-2 terms.
poly = PolynomialFeatures(degree=2)
X_2 = poly.fit_transform(X)
# X_2 already contains a column of ones, so add_constant (which skips existing
# constants by default) returns it unchanged; the call is kept for symmetry with the
# other models in this notebook.
X2_cons = sm.add_constant(X_2)

sq_model = sm.OLS(y, X2_cons).fit()
sq_model.summary()

In [None]:
# Residual plot of the degree-2 polynomial fit: residuals against fitted values
resids = sq_model.resid
yhat = sq_model.fittedvalues

fig, ax = plt.subplots(1, 1)
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.regplot(x=yhat, y=resids, lowess=True, ax=ax,
            line_kws={'color': 'firebrick', 'lw': 1},
            scatter_kws={'color': 'lightblue', 'alpha': 0.5})
# Dashed zero line spanning the range of fitted values
ax.hlines(y=0, xmin=np.min(yhat), xmax=np.max(yhat), color='firebrick', linestyle='--', lw=3)
ax.set_title('Residual plot of model')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.grid(True)

In [None]:
# Natural log of every predictor column; X itself is left untouched
X_log = np.log(X)

In [None]:
# Regress y on the log-transformed predictors (with an intercept)
Xlog_cons = sm.add_constant(X_log)

log_model = sm.OLS(endog=y, exog=Xlog_cons).fit()
log_model.summary()

In [None]:
# Plot log model residuals against fitted values
resids = log_model.resid
yhat = log_model.fittedvalues

fig, ax = plt.subplots(1, 1)
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.regplot(x=yhat, y=resids, lowess=True, ax=ax,
            line_kws={'color': 'firebrick', 'lw': 1},
            scatter_kws={'color': 'lightblue', 'alpha': 0.5})
# Dashed zero line spanning the range of fitted values
ax.hlines(y=0, xmin=np.min(yhat), xmax=np.max(yhat), color='firebrick', linestyle='--', lw=3)
ax.set_title('Residual plot of model')
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
ax.grid(True)

### 10

In [None]:
# Read Carseats.csv and preview its first rows (plain string: the path has no placeholders)
cars = pd.read_csv('../../data/Carseats.csv')
cars.head()

In [None]:
# Fit multiple regression model of Sales on Price, Urban and US.
# drop_first=True keeps one indicator per categorical column; a full set of dummies
# alongside the intercept would be perfectly collinear. dtype=float keeps the design
# matrix numeric, since recent pandas versions otherwise emit boolean dummies.
X = pd.get_dummies(
    cars.loc[:, ['Price', 'Urban', 'US']],
    columns=['Urban', 'US'],
    drop_first=True,
    dtype=float,
)
y = cars['Sales']

X_cons = sm.add_constant(X)
multiple_lm = sm.OLS(y, X_cons).fit()
multiple_lm.summary()

In [None]:
# Same regression through the formula interface, which encodes Urban and US itself
multiple_lm = smf.ols('Sales ~ Price + Urban + US', data=cars).fit()
multiple_lm.summary()

In [None]:
# Smaller model: Sales on Price and US only
multiple_lm = smf.ols('Sales ~ Price + US', data=cars).fit()
multiple_lm.summary()

In [None]:
# Influence plot for the most recently fitted multiple_lm
sm.graphics.influence_plot(results=multiple_lm)

### 11

In [None]:
# Create dataset
# Seed NumPy's global generator: the draws below come from np.random, which the
# standard-library random.seed() does not affect.
np.random.seed(10)
x = np.random.normal(loc=0, scale=1, size=100)
y = 2*x + np.random.normal(loc=0, scale=1, size=100)

In [None]:
# Regress y onto x with no intercept (x is passed without a constant column)
model = sm.OLS(endog=y, exog=x).fit()
model.summary()

In [None]:
# Regress x onto y with no intercept (y is passed without a constant column)
model = sm.OLS(endog=x, exog=y).fit()
model.summary()

In [None]:
# Regress y onto x, this time with an intercept column added to x
x_cons = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x_cons).fit()
model.summary()

In [None]:
# Simple linear regression of x onto y with intercept
# (x is the response here; the constant column is added to y, the regressor)
y_cons = sm.add_constant(y)
model = sm.OLS(endog=x, exog=y_cons).fit()
model.summary()

### 13

In [None]:
# Create dataset and plot
# Seed NumPy's global generator: the draws below come from np.random, which the
# standard-library random.seed() does not affect.
np.random.seed(10)
x = np.random.normal(loc=0, scale=1, size=100)
eps = np.random.normal(loc=0, scale=0.5, size=100)
y = 0.5*(x) + eps - 1

plt.scatter(x, y)

In [None]:
# Fit least squares linear model
x_cons = sm.add_constant(x)
model = sm.OLS(y, x_cons).fit()
# Only the last expression of a cell is displayed, so the summary is printed
# explicitly; otherwise it would be computed and thrown away.
print(model.summary())
yhat = model.fittedvalues
# Preview the fitted values rather than dumping the whole array
yhat[:5]

In [None]:
# Plot model: data with seaborn's regression line, plus the statsmodels fit in magenta
fig, ax = plt.subplots(1, 1)
# Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
sns.regplot(x=x, y=y, ax=ax, scatter_kws={'color': 'lightblue'},
            line_kws={'color': 'firebrick'}, ci=None)
ax.plot(x, yhat, color='magenta')
ax.set_title('X vs Y')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.grid(True)

### 14

In [None]:
# Create dataset
# Seed NumPy's global generator: the draws below come from np.random, which the
# standard-library random.seed() does not affect.
np.random.seed(10)
x1 = np.random.uniform(size=100)
x2 = 0.5*x1 + (np.random.normal(loc=0, scale=1, size=100)/10)  # x2 is built from x1 plus noise
y = 2 + 2*x1 + 0.3*x2 + np.random.normal(loc=0, scale=1, size=100)
data = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
data.head()

In [None]:
# Correlation matrix of x1 and x2, followed by a scatter plot of the pair
print(np.corrcoef(x1, x2))
fig, ax = plt.subplots(1, 1)
ax.scatter(x1, x2)
ax.set_xlabel('X1')
ax.set_ylabel('X2')

In [None]:
# Regress y on both x1 and x2 (with an intercept)
y = data['y']
X = data.loc[:, ['x1', 'x2']]
X_cons = sm.add_constant(X)
model = sm.OLS(endog=y, exog=X_cons).fit()
model.summary()

In [None]:
# Regress y on x1 alone: reuse the design matrix with the x2 column removed
x1_cons = X_cons.drop(columns='x2')
model = sm.OLS(endog=y, exog=x1_cons).fit()
model.summary()

In [None]:
# Regress y on x2 alone: reuse the design matrix with the x1 column removed
x2_cons = X_cons.drop(columns='x1')
model = sm.OLS(endog=y, exog=x2_cons).fit()
model.summary()

### 15

In [None]:
# Read Boston.csv and preview its first rows (plain string: the path has no placeholders)
boston = pd.read_csv('../../data/Boston.csv')
boston.head()

In [None]:
# Response is the 'crim' column; every remaining column is a candidate predictor
X = boston.drop(columns='crim')
y = boston['crim']

In [None]:
# Create graphing function
def plot_reg(X, y, label, ylabel=None):
    """Scatter ``y`` against a single predictor with a least squares line overlaid.

    Parameters
    ----------
    X : array-like
        Values for the horizontal axis (passed to ``sns.regplot`` as ``x``).
    y : array-like
        Values for the vertical axis.
    label : str
        Name of the predictor; used in the title and as the x-axis label.
    ylabel : str, optional
        Name of the response; used in the title and as the y-axis label.
        Defaults to ``y.name`` when ``y`` has one, otherwise ``'y'``.

    Returns
    -------
    The figure created for this plot.
    """
    # Derive the response label from the data instead of hard-coding a name that
    # only matched a different dataset.
    if ylabel is None:
        ylabel = getattr(y, 'name', None) or 'y'

    fig, ax = plt.subplots(1, 1)
    # Keyword x/y: positional data arguments are rejected by seaborn >= 0.12.
    sns.regplot(x=X, y=y, ax=ax, scatter_kws={'color': 'lightblue'},
                line_kws={'color': 'firebrick'}, ci=None)
    ax.set_title(f'{ylabel} vs {label}')
    ax.set_xlabel(f'{label}')
    ax.set_ylabel(f'{ylabel}')
    ax.grid(True)

    return fig

In [None]:
# Loop through the columns: draw a scatter plot and fit a one-predictor linear model
# for each, keeping the predictor's coefficient in a dictionary keyed by column name
param_dict_single = dict()
for col in X.columns:
    subset = X.loc[:, col]
    plot_reg(subset, y, col)
    subset_cons = sm.add_constant(subset)
    model = sm.OLS(endog=y, exog=subset_cons).fit()
    param_dict_single[col] = model.params[col]
    print(model.summary())
    

In [None]:
# Fit multiple linear model, storing its coefficients in a dictionary
X_cons = sm.add_constant(X)
multiple_lm = sm.OLS(y, X_cons).fit()
# Only the last expression of a cell is displayed, so the summary is printed
# explicitly; otherwise it would be computed and thrown away.
print(multiple_lm.summary())

# One entry per predictor column of X; the intercept ('const') is left out
param_dict_multiple = {
    param: multiple_lm.params[param]
    for param in X.columns
    if param != 'const'
}

In [None]:
# One row per parameter: coefficient from the single regression next to the
# coefficient from the multiple regression, aligned on the parameter name (the index)
param_data_single = pd.DataFrame.from_dict(param_dict_single, orient='index', columns=['Single'])
param_data_multiple = pd.DataFrame.from_dict(param_dict_multiple, orient='index', columns=['Multiple'])
param_data = param_data_single.merge(param_data_multiple, left_index=True, right_index=True)
param_data.head()

In [None]:
# Plot single reg coefficients vs multiple reg coefficients
fig, ax = plt.subplots(1, 1)
ax.scatter(param_data['Single'], param_data['Multiple'])
ax.set_xlabel('Coefficient from Single Regression')
ax.set_ylabel('Coefficient from Multiple Regression')
ax.grid(True)

# Label each point with its parameter name. The rows are iterated directly rather
# than looked up with Series[i]: the index holds string labels, and integer lookups
# on such an index rely on a deprecated positional fallback.
for txt, single, multiple in param_data[['Single', 'Multiple']].itertuples(index=True, name=None):
    ax.annotate(txt, (single, multiple), size=7)

In [None]:
# For every predictor, regress y on the predictor, its square and its cube
for col in X.columns:
    subset = pd.DataFrame({
        col: X[col],
        col + '_2': X[col] ** 2,
        col + '_3': X[col] ** 3,
    })
    subset_cons = sm.add_constant(subset)
    model = sm.OLS(endog=y, exog=subset_cons).fit()
    print(model.summary())