---
# Linear Regression
---

**Content**

- Linear Regression with One Feature


- Linear Regression with Multiple Features


**Additional Material**:

- interactive linear regression exploration [here](https://observablehq.com/@yizhe-ang/interactive-visualization-of-linear-regression)


---
---

# Settings

In [None]:
# Imported here as well, because this cell is located above the "Imports"
# cell: when the notebook is run top to bottom, "plt" and "pd" would
# otherwise not be defined yet and raise a NameError.
import matplotlib.pyplot as plt
import pandas as pd

# switch between the dark and the default matplotlib style
dark_plot_theme = True

if dark_plot_theme:
    plt.style.use('dark_background')


# pandas display settings

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import sys

# utils
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score


In [None]:
# reload ./utils.py

import importlib
import utils
importlib.reload(utils)
from utils import get_dichotomous

# Load Feature Data

Assumption: the data contains no nulls and no outliers ($|z| > 3$).

In [None]:
# make sure the "data" directory exists and read the feature table from it

cwd = Path()
ipath = cwd.joinpath('data')
ipath.mkdir(exist_ok=True)

# the "id" column of the csv file becomes the index
ifile = ipath.joinpath('features.csv')
data = pd.read_csv(ifile, index_col=['id'])

data.head(3)

In [None]:
# TODO:
# Check whether "data" contains any null values.
# Use the pandas function "isnull" in combination with "any".

# TIP:
# data...

data.isnull().values.any() # REMOVE

---

# Linear Regression with One Feature


All our inputs need to be **numeric** for linear regression.

**Linear Regression Assumptions**

- Linearity: A linear correlation between the input and the target

- Normality: Normally distributed input variables.

- No Multicollinearity: Linear independence between variables.

- No Auto-Correlation: No correlation between the residuals (error terms) of different observations.

- Homoscedasticity: Constant variance of the residuals across all values of the input variables.

**Metric**

- RMSE $ = \sqrt{\frac{1}{n}\sum (y_i - \hat{y}_i)^{2}}$

- R2 $ = 1 - \frac{\sum (y_i - \hat{y}_i)^{2}}{\sum (y_i - \bar{y})^{2}}$

**Z Score**

- outlier if:  |z-score| $ = \left|\frac{x - \bar{x}}{\sigma}\right| > 3$

**Skew**

- skew $ = \frac{E[(x - \bar{x})^3]}{\sigma^3}$

In [None]:
# regression setup: "price" is predicted from "square_meter"

target = 'price'
features = ['square_meter']

# target column first, followed by the input features
variables = [target, *features]

In [None]:
# histograms of the target: price (left) and log(price + 1) (right)
# the skewness of each series is shown in the panel title

figsize = (12, 5)
fig, axs = plt.subplots(1, 2, figsize=figsize)

for ax, d in zip(axs, (data[target], np.log(data[target] + 1))):
    skew = d.skew()
    title = f'skewness: {skew:0.2f}'
    sns.histplot(d, bins=50, ax=ax).set(title=title)

In [None]:
# histograms of the first feature, skewness shown in the panel titles:
# all values | values kept by "mask" | log(x + 1) of the kept values

figsize = (12, 5)
fig, axs = plt.subplots(1, 3, figsize=figsize)

column = features[0]

# rows where the two "imp_" columns of the feature sum to less than 1
# (presumably: rows that were not imputed -- confirm against the data prep)
mask = data[f'imp_{column}'] + data[f'imp_z_{column}'] < 1

sqm_all = data[column]
sqm_kept = sqm_all[mask]

for ax, d in zip(axs, (sqm_all, sqm_kept, np.log(sqm_kept + 1))):
    skew = d.skew()
    title = f'skewness: {skew:0.2f}'
    sns.histplot(d, bins=50, ax=ax).set(title=title)

In [None]:
# restrict the data to target + features and split it into train and test

rdata = data[variables]

xtrain, xtest, ytrain, ytest = train_test_split(
    rdata.drop(columns=[target]),
    rdata[target],
    random_state=0,
)


In [None]:
# linear regression:
# instantiate the model, fit it on the train split, predict the test split

# TODO:
# Instantiate the default sklearn
# "LinearRegression" model

# TIP:
# lr = ...

lr = LinearRegression() # REMOVE

# TODO:
# Fit the created "LinearRegression" model
# to "xtrain" and "ytrain"

# TIP:
# lr...

lr.fit(xtrain, ytrain) # REMOVE

# TODO:
# Use the fitted model "lr"
# to make predictions
# based on "xtest"

# TIP:
# ypred = lr...

ypred = lr.predict(xtest) # REMOVE

In [None]:
# calculation of metrics (R-squared and RMSE) by hand

# TODO:
# Calculate "r2" explicitly
# Use "ytest" and "ypred"
# and apply "np.sum", "np.square" and "np.mean"

# TIP:
# r2 = 1 - ...

r2 = 1 - np.sum(np.square(ytest - ypred)) / np.sum(np.square(ytest - np.mean(ytest))) # REMOVE

# TODO:
# Calculate RMSE explicitly
# Use "ytest" and "ypred"
# and apply "np.sqrt", "np.mean" and "np.square"

# TIP:
# rmse = ...

rmse = np.sqrt(np.mean(np.square(ytest - ypred))) # REMOVE

print(f'R-squared: {r2:.2f}')
print(f'RMSE:      {rmse:.2f}')

In [None]:
# CHECK

# compare each metric, rounded to two decimals, with its reference value
for label, value, expected in (('r2', r2, 0.07), ('rmse', rmse, 47.74)):
    if np.round(value, 2) == expected:
        print('*** passed, well done!')
    else:
        print(f'*** {label} is not correct, try again')

In [None]:
# regression metrics

def rmse_score(*args, **kwargs):
    '''
    Root mean squared error with the call signature of
    sklearn's "mean_squared_error".

    All positional and keyword arguments are forwarded unchanged.

    The "squared=False" switch of "mean_squared_error" was deprecated in
    scikit-learn 1.4 and later removed, so the dedicated
    "root_mean_squared_error" function is preferred when it is available.
    The old call is only used as a fallback for older scikit-learn versions.
    '''
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        # scikit-learn < 1.4: the dedicated function does not exist yet
        return mean_squared_error(*args, **kwargs, squared=False)
    return root_mean_squared_error(*args, **kwargs)

# score name -> callable taking (y_true, y_pred)
rmetrics = {
    'r2': r2_score,
    'mse': mean_squared_error,
    'rmse': rmse_score,
}

In [None]:
def regression_wrapper(xtrain, xtest, ytrain, ytest, data, show=True):
    '''
    Fit a linear regression, predict the test inputs and
    optionally report the results.

    Parameters
    ----------
    xtrain, ytrain:
        inputs and target the "LinearRegression" model is fitted on
    xtest, ytest:
        inputs that are predicted and the true values
        the predictions are scored against
    data:
        data shown in the joint plot; it is accessed
        with the columns "square_meter" and "price"
    show:
        if True, print the model parameters and all scores registered
        in the module-level "rmetrics" dict and draw the joint plot
        with the predictions overlaid

    Returns
    -------
    The predictions of the fitted model for "xtest".
    '''

    # fit the linear regression model
    lr = LinearRegression()
    lr.fit(xtrain, ytrain)

    # model predictions
    ypred = lr.predict(xtest)

    # print and plot results
    if show:
        print('*** model parameters:')
        print('coeff.: ', ', '.join([f'{x:.3f}' for x in lr.coef_]))
        print(f'inter.: {lr.intercept_:.3f}')
        print()
        print('*** scores:')
        for k, v in rmetrics.items():
            score = v(ytest, ypred)
            print(f'{k:22} {score:.3f}')

        # scatter + marginal histograms of the data, predictions as a line
        plot = sns.jointplot(data=data, x='square_meter', y='price', marker='.', marginal_kws=dict(bins=25))
        plot.ax_joint.plot(xtest, ypred, '-', color='violet')
    return ypred

In [None]:
# fit on the train split, then print parameters and scores and draw the plot

ypred = regression_wrapper(xtrain, xtest, ytrain, ytest, data=rdata)

In [None]:
# drop imputations

# regex matching all column names that start with "imp"
# and end with "price" or "square_meter" (the entries of "variables")
pattern = '^imp.*({}|{})$'.format(*variables)


# TODO:
# Use the regex pattern in "pattern"
# to filter for the respective feature columns.
# "sum" the result along "axis=1"

# TIP:
# mask = data.filter(...).sum(...) < 1

mask = data.filter(regex=pattern, axis=1).sum(axis=1) < 1 # REMOVE


In [None]:
# TODO:
# Evaluate "rdata"
# at the calculated "mask"
# and drop all the imputations.
# Use "where" and "dropna"

# TIP:
# rdata = rdata.where(...)...

# "where" replaces the rows not selected by "mask" with NaN,
# "dropna" then removes them
rdata = rdata.where(mask).dropna() # REMOVE

In [None]:
# redo the train-test split on the current "rdata"
xtrain, xtest, ytrain, ytest = train_test_split(
    rdata.drop(columns=[target]),
    rdata[target],
    random_state=0,
)

# refit the model; parameters, scores and plot are shown by the wrapper
ypred = regression_wrapper(xtrain, xtest, ytrain, ytest, data=rdata)

In [None]:
# log transformation

# TODO:
# Log transform "rdata"
# Use "np.log" and
# add 1 first, so that zeros do not end up in the logarithm

# TIP:
# rdata = np.log(...)

rdata = np.log(rdata + 1) # REMOVE

In [None]:
# redo the train-test split on the current "rdata"
xtrain, xtest, ytrain, ytest = train_test_split(
    rdata.drop(columns=[target]),
    rdata[target],
    random_state=0,
)

# refit the model; parameters, scores and plot are shown by the wrapper
ypred = regression_wrapper(xtrain, xtest, ytrain, ytest, data=rdata)

In [None]:
# TODO:
# Revert the log transformation
# for "ytest" and "ypred"
# Don't forget the -1

# TIP:
# ytest_exp = ...
# ypred_exp = ...

ypred_exp = np.exp(ypred) - 1 # REMOVE
ytest_exp = np.exp(ytest) - 1 # REMOVE

# TODO:
# Calculate "r2" and "rmse"
# from "ytest_exp" and "ypred_exp"
# using the functions:
# "r2_score" and "rmse_score"

# TIP:
# r2 = r2_score(...)
# rmse = rmse_score(...)

r2 = r2_score(ytest_exp, ypred_exp) # REMOVE
rmse = rmse_score(ytest_exp, ypred_exp) # REMOVE

print(f'r2   = {r2:.2f}')
print(f'rmse = {rmse:.2f}')

# back-transformed data with the back-transformed predictions on top
fig, ax = plt.subplots();
sns.scatterplot(data=np.exp(rdata)-1, x='square_meter', y='price', ax=ax);
ax.scatter(np.exp(xtest)-1, np.exp(ypred)-1, color='violet');

In [None]:
# standardize the log data (subtract the mean, divide by the std)

# TODO:
# Standardize "rdata"
# use the pandas utilities
# for "mean" and "std"

# TIP:
# rdata = ...

rdata = (rdata - rdata.mean()) / rdata.std() # REMOVE

In [None]:
# redo the train-test split on the current "rdata"
xtrain, xtest, ytrain, ytest = train_test_split(
    rdata.drop(columns=[target]),
    rdata[target],
    random_state=0,
)

# refit the model; parameters, scores and plot are shown by the wrapper
ypred = regression_wrapper(xtrain, xtest, ytrain, ytest, data=rdata)

In [None]:
# residuals with:
# LOESS (locally estimated scatterplot smoothing)

# Work on an explicit copy: adding a column to the bare selection
# "rdata[variables]" can trigger pandas' SettingWithCopyWarning.
tmp = rdata[variables].copy()

# "ytest - ypred" carries the index of "ytest"; the assignment aligns on
# that index, so rows outside the test split get NaN residuals.
tmp['residuals'] = (ytest - ypred)

fig, ax = plt.subplots()

# TODO:
# Use the seaborn "residplot" function
# to plot the residuals stored in "tmp"
# include "lowess=True"

# TIP:
# sns.residplot(..., lowess=True, ax=ax)

sns.residplot(data=tmp, x='square_meter', y='residuals', lowess=True, line_kws=dict(color='red'), ax=ax); # REMOVE

ax.axis('equal');



# Linear Regression with Multiple Features
---

## Feature Selection

In [None]:
# correlation

# drop the columns reported by the "get_dichotomous" helper from utils.py
corr_data = data.drop(get_dichotomous(data), axis=1)

# TODO:
# Calculate the absolute values
# of the default correlation matrix.
# Use "np.abs" and the pandas function "corr"
# on "corr_data"

# TIP:
# cor = ...

cor = np.abs(corr_data.corr()) # REMOVE


In [None]:
# plot correlations

# left panel: absolute correlation
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
sns.heatmap(cor, annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1, ax=axs[0]);

# TODO:
# Plot the absolute correlation
# only where it is > 0.7
# use "sns.heatmap"
# but filter for values larger than 0.7
# using the pandas "where" function

# TIP:
# sns.heatmap(cor.where(...), annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1, ax=axs[1]);

# right panel: values that are not above 0.7 are replaced by 0
sns.heatmap(cor.where(cor>0.7, other=0), annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1, ax=axs[1]); # REMOVE

In [None]:
# TODO:
# Select the numeric features only
# use "select_dtypes" on "data"
# and include only "np.number" variables

# TIP:
# rdata = data...

rdata = data.select_dtypes(include=[np.number]) # REMOVE

In [None]:
# separate target and inputs, then split both into train and test

y = rdata[target]
x = rdata.drop(columns=target, errors='ignore')

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)

In [None]:
# forward feature selection
# one feature set is collected in "features" per allowed maximum "i"

print('*** selected features:')
max_features = 16
features = []
# NOTE(review): range(1, max_features) runs i = 1 .. max_features - 1,
# so at most 15 features are allowed here -- confirm that this is intended
for i in range(1, max_features):
    # TODO:
    # Instantiate "SelectFromModel" from sklearn
    # with a Lasso() instance as estimator
    # and allow for max_features of i.

    # TIP:
    # selector = SelectFromModel(...)

    selector = SelectFromModel(Lasso(), max_features=i) # REMOVE

    selector.fit(xtrain, ytrain)

    # Only keep the columns selected by the fitted selector
    mask = selector.get_support()
    cnames = xtrain.columns[mask]
    features.append(cnames)

    print(i, ', '.join(list(cnames)))

In [None]:
# linear regression convenience function

def linear_regression_wrapper(data):
    '''
    Fit a linear regression on "data" and report r2 and adjusted r2.

    The module-level "target" column of "data" is used as target, all
    other columns as inputs. The data is split with "train_test_split"
    (random_state=0), the model is fitted on the train part and scored
    on the test part.

    The number of inputs and the printed column names are taken from
    "data" itself instead of the notebook globals "feature" and
    "variables"; those hold stale values of the last loop iteration when
    the function is called outside of the feature-set loop.

    Parameters
    ----------
    data:
        DataFrame containing the "target" column and the input columns

    Returns
    -------
    (r2, r2_adj); "r2_adj" is NaN if the test split does not contain
    more samples than fitted parameters.
    '''
    x = data.drop(target, axis=1)
    y = data[target]

    xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)

    lr = LinearRegression()
    lr.fit(xtrain, ytrain)

    ypred = lr.predict(xtest)
    r2 = r2_score(ytest, ypred)

    # calculate r2 adjusted
    # n: number of test samples, k: number of inputs + 1 (intercept)
    n = np.shape(ytest)[0]
    k = x.shape[1] + 1
    if n > k:
        r2_adj = 1 - (1 - r2) * (n - 1) / (n - k)
    else:
        # not enough samples: the correction factor is undefined
        r2_adj = np.nan

    print(', '.join(map(str, data.columns)))
    print(f'r2={r2:.3f} r2_adj={r2_adj:.3f}')
    print()

    return r2, r2_adj

In [None]:
# linear regression for the feature sets
# the adjusted r2 of every run is collected in "r2s"

r2s = []
for feature in features:
    # NOTE: this rebinds the notebook-level name "variables"
    variables = list(feature) + [target]

    r2, r2_adj = linear_regression_wrapper(rdata[variables])
    r2s.append(r2_adj)



# TODO:
# Add the last run including all features.
# Apply the "linear_regression_wrapper"
# defined in the notebook to "rdata"

# TIP:
# r2, r2_adj = ...

r2, r2_adj = linear_regression_wrapper(rdata) # REMOVE

r2s.append(r2_adj)

In [None]:
# adjusted r2 of the runs collected in "r2s", in the order they were appended
fig, ax = plt.subplots(1)
ax.set_title('$r^2_{adj}$')
ax.plot(r2s);


---

In [None]:
# skew of every column before and after a log(x + 1) transformation,
# sorted by the ratio |log_skew / skew|

# numeric columns only, without the columns reported by "get_dichotomous"
numeric = data.select_dtypes(include=[np.number])
tmp = numeric.drop(get_dichotomous(numeric), axis=1)

skew = tmp.skew().to_frame('skew')
skew['log_skew'] = np.log(tmp + 1).skew()
skew['log_skew/skew'] = (skew['log_skew'] / skew['skew']).abs()

display(skew.sort_values('log_skew/skew'))

---
---
---