In [164]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np

In [165]:
# Toy regression data; the second feature is an exact multiple of the first,
# so the two feature columns are perfectly collinear.
x1 = np.array([1, 2, 3])
x2 = 2*x1

# Targets satisfy y = 2 + 2*x1 exactly.
y = np.array([4, 6, 8])

In [166]:
# Design matrix: an intercept column of ones followed by the two features.
all_ones = np.ones(len(x1))
X = np.column_stack((all_ones, x1, x2))

In [167]:
X.shape

(3, 3)

In [168]:
X

array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])

In [169]:
def solve_normal_equation(X, y):
    """Solve least squares with the normal equation.

    Computes theta = inv(X.T @ X) @ X.T @ y.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per sample (include a column of ones if an
        intercept is wanted).
    y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray or None
        The parameter vector, or ``None`` when ``np.linalg.inv`` raises
        ``LinAlgError`` for ``X.T @ X``; in that case a diagnostic showing
        the matrix is printed instead of raising.
    """
    gram = X.T @ X
    try:
        gram_inverse = np.linalg.inv(gram)
    except np.linalg.LinAlgError:
        # Report the matrix that could not be inverted and signal failure.
        print('The matrix is singular')
        print("X.T @ X = \n", gram)
        return None
    return gram_inverse @ X.T @ y
    
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?

In [170]:
solve_normal_equation(X, y)

The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]


In [171]:
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)

(2, 2)

In [134]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# Same collinear features as before, but without the explicit column of ones.
data = np.array([x1, x2]).T

lr.fit(data, y)
lr.coef_, lr.intercept_


# Assignment question: figure out why sklearn is able to solve this problem
# even though the normal-equation approach above reported a singular matrix.

(array([0.4, 0.8]), 2.0)

In [135]:
# Regularization
# Add a tiny value to the diagonal so the columns are no longer exactly
# linearly dependent.
# NOTE(review): this perturbs the design matrix X itself, which only works
# because X happens to be square (3x3) here; ridge-style regularization would
# instead add eps*I to X.T @ X -- confirm which one is intended.

eps = 1e-5  # size of the diagonal perturbation
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
X

array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])

In [136]:
np.linalg.matrix_rank(X)

3

In [137]:
solve_normal_equation(X, y)

array([2.00023248, 1.19987743, 0.40001887])

In [138]:
# Drop variables
# Keep only the intercept column and x1; x2 is an exact multiple of x1, so
# leaving it out removes the linear dependence between the columns.
X = np.array([all_ones, x1]).T
print(X)

[[1. 1.]
 [1. 2.]
 [1. 3.]]


In [139]:
solve_normal_equation(X, y)

array([2., 2.])

In [140]:
# Dummy variables

## dataset
# Synthetic pollution records: two numeric features, one categorical feature
# (wind direction) and an integer target. A seeded generator is used so that
# "Restart & Run All" reproduces the same table on every run.
SEED = 42
num_records = 12
rng = np.random.default_rng(SEED)

windspeed = rng.integers(0, 10, num_records)      # integers in [0, 10)
vehicles = rng.integers(100, 500, num_records)    # integers in [100, 500)
direction = rng.choice(['N', 'S', 'E', 'W'], num_records)
pollution = rng.integers(0, 100, num_records)     # integers in [0, 100)

# NOTE(review): later cells refer to all four direction labels by name, so
# check that the sampled data contains each of N, S, E and W.
df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df

Unnamed: 0,windspeed,vehicles,direction,pollution
0,5,418,W,29
1,6,427,E,17
2,1,464,N,60
3,6,139,E,43
4,0,257,S,55
5,9,172,E,49
6,5,306,S,77
7,7,373,N,6
8,9,438,W,28
9,0,125,S,85


In [141]:
def fit_data(df, X, y):
    """Fit a linear regression and describe it as an equation string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the features were taken from. Its column names are used as a
        fallback when ``X`` carries no column labels of its own.
    X : DataFrame or array-like
        Feature matrix passed to ``LinearRegression.fit``.
    y : array-like
        Target values.

    Returns
    -------
    str or None
        A string such as ``"y = 1.00 + 2.00*a + 3.00*b"``, or ``None`` if
        anything raised while fitting or formatting (the error is printed).
    """
    try:
        lr = LinearRegression()
        lr.fit(X, y)
        # Label each coefficient with the column it was actually fitted on.
        # Indexing df.columns by position mislabels the terms whenever X is
        # not exactly the leading columns of df.
        feature_names = getattr(X, "columns", df.columns)
        rep = f"y = {lr.intercept_:0.2f}"
        for i, coef in enumerate(lr.coef_):
            rep += f" + {coef:0.2f}*{feature_names[i]}"
        return rep
    except Exception as e:
        # Deliberately broad: the notebook uses this to display fitting
        # errors (e.g. non-numeric columns) instead of stopping execution.
        print(e)
        return None
        

In [142]:
fit_data(df, df[df.columns[:-1]], df['pollution'])

could not convert string to float: 'W'


In [143]:
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

In [144]:
enc = OrdinalEncoder()

In [145]:
# Swap the string labels in 'direction' for the encoder's ordinal codes on a
# copy, so the original frame keeps its string column.
direction_codes = enc.fit_transform(df[['direction']]).flatten()
df2 = df.copy()
df2['direction'] = direction_codes
df2

Unnamed: 0,windspeed,vehicles,direction,pollution
0,5,418,3.0,29
1,6,427,0.0,17
2,1,464,1.0,60
3,6,139,0.0,43
4,0,257,2.0,55
5,9,172,0.0,49
6,5,306,2.0,77
7,7,373,1.0,6
8,9,438,3.0,28
9,0,125,2.0,85


In [146]:
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])

'y = 87.59 + -4.25*windspeed + -0.10*vehicles + 3.64*direction'

In [147]:
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})

E    0
N    1
S    2
W    3
dtype: int64

In [148]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)

In [149]:
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [150]:
# Column labels for the one-hot matrix. Read them from the one-hot encoder
# that produced the matrix (its categories_ order is its output column order)
# instead of from the separately fitted ordinal encoder.
col_names_ohe = [f"Is it {x}?" for x in ohe.categories_[0]]

In [152]:
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df

Unnamed: 0,Is it E?,Is it N?,Is it S?,Is it W?
0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0
5,1.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0
7,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,1.0
9,0.0,0.0,1.0,0.0


In [153]:
# Confirm that "Is it W?" is fully determined by the other indicator columns:
# W = 1 - (N + S + E), so the difference below should be 0 for every row.
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]


0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64

In [154]:
# Numeric features side by side with all four one-hot direction columns.
numeric_features = df[['windspeed', 'vehicles']].values
X = np.column_stack((numeric_features, direction_ohe))

In [155]:
X

array([[  5., 418.,   0.,   0.,   0.,   1.],
       [  6., 427.,   1.,   0.,   0.,   0.],
       [  1., 464.,   0.,   1.,   0.,   0.],
       [  6., 139.,   1.,   0.,   0.,   0.],
       [  0., 257.,   0.,   0.,   1.,   0.],
       [  9., 172.,   1.,   0.,   0.,   0.],
       [  5., 306.,   0.,   0.,   1.,   0.],
       [  7., 373.,   0.,   1.,   0.,   0.],
       [  9., 438.,   0.,   0.,   0.,   1.],
       [  0., 125.,   0.,   0.,   1.,   0.],
       [  7., 310.,   0.,   1.,   0.,   0.],
       [  7., 332.,   0.,   0.,   1.,   0.]])

In [156]:
# Prepend an intercept column of ones to the feature matrix.
intercept_column = np.ones(X.shape[0])
X_aug = np.column_stack((intercept_column, X))

In [157]:
X_aug

array([[  1.,   5., 418.,   0.,   0.,   0.,   1.],
       [  1.,   6., 427.,   1.,   0.,   0.,   0.],
       [  1.,   1., 464.,   0.,   1.,   0.,   0.],
       [  1.,   6., 139.,   1.,   0.,   0.,   0.],
       [  1.,   0., 257.,   0.,   0.,   1.,   0.],
       [  1.,   9., 172.,   1.,   0.,   0.,   0.],
       [  1.,   5., 306.,   0.,   0.,   1.,   0.],
       [  1.,   7., 373.,   0.,   1.,   0.,   0.],
       [  1.,   9., 438.,   0.,   0.,   0.,   1.],
       [  1.,   0., 125.,   0.,   0.,   1.,   0.],
       [  1.,   7., 310.,   0.,   1.,   0.,   0.],
       [  1.,   7., 332.,   0.,   0.,   1.,   0.]])

In [158]:
X_aug.shape

(12, 7)

In [181]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (7, 7))

In [174]:
pd.DataFrame(X_aug.T @ X_aug)

Unnamed: 0,0,1,2,3,4,5,6
0,12.0,62.0,3761.0,3.0,3.0,4.0,2.0
1,62.0,432.0,20075.0,21.0,15.0,12.0,14.0
2,3761.0,20075.0,1333861.0,738.0,1147.0,1020.0,856.0
3,3.0,21.0,738.0,3.0,0.0,0.0,0.0
4,3.0,15.0,1147.0,0.0,3.0,0.0,0.0
5,4.0,12.0,1020.0,0.0,0.0,4.0,0.0
6,2.0,14.0,856.0,0.0,0.0,0.0,2.0


In [182]:
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])


array([[0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [185]:
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
# With drop='first' the encoder omits its first category, so the remaining
# output columns correspond to categories_[0][1:]. Take the labels from this
# encoder rather than from the separately fitted ordinal encoder.
col_names_ohe_n_1 = [f"Is it {x}?" for x in ohe.categories_[0][1:]]
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1

Unnamed: 0,Is it N?,Is it S?,Is it W?
0,0.0,0.0,1.0
1,0.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,1.0,0.0
5,0.0,0.0,0.0
6,0.0,1.0,0.0
7,1.0,0.0,0.0
8,0.0,0.0,1.0
9,0.0,1.0,0.0


In [186]:
# Rebuild the design matrix with only n-1 indicator columns, then prepend the
# intercept column of ones.
numeric_features = df[['windspeed', 'vehicles']].values
X = np.column_stack((numeric_features, df_ohe_n_1.values))
X_aug = np.column_stack((np.ones(X.shape[0]), X))

X_aug

array([[  1.,   5., 418.,   0.,   0.,   1.],
       [  1.,   6., 427.,   0.,   0.,   0.],
       [  1.,   1., 464.,   1.,   0.,   0.],
       [  1.,   6., 139.,   0.,   0.,   0.],
       [  1.,   0., 257.,   0.,   1.,   0.],
       [  1.,   9., 172.,   0.,   0.,   0.],
       [  1.,   5., 306.,   0.,   1.,   0.],
       [  1.,   7., 373.,   1.,   0.,   0.],
       [  1.,   9., 438.,   0.,   0.,   1.],
       [  1.,   0., 125.,   0.,   1.,   0.],
       [  1.,   7., 310.,   1.,   0.,   0.],
       [  1.,   7., 332.,   0.,   1.,   0.]])

In [187]:
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape

(6, 6, (6, 6))

In [211]:
# Interpreting dummy variables

## dataset
# A single categorical feature with two labels ('F', 'M') and a numeric target.

X = np.array(['F', 'F', 'F', 'M', 'M'])
y = np.array([5, 5.2, 5.4, 5.8, 6])

In [214]:
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)

array([[0],
       [0],
       [0],
       [1],
       [1]])

In [215]:
# LabelBinarizer coded 'F' as 0 and 'M' as 1 in the previous cell; flip the
# codes so that 'F' becomes 1 and 'M' becomes 0.
X_binary = 1 - l.fit_transform(X)

In [216]:
X_binary    

array([[1],
       [1],
       [1],
       [0],
       [0]])

In [217]:
lr = LinearRegression()
lr.fit(X_binary, y)

In [218]:
lr.coef_, lr.intercept_

(array([-0.7]), 5.8999999999999995)

In [219]:
# Mean target over the rows coded 0; compare with the fitted intercept.
is_coded_zero = (X_binary == 0).flatten()
y[is_coded_zero].mean()

5.9

In [220]:
# Mean target over the rows coded 1; compare with intercept + coefficient.
is_coded_one = (X_binary == 1).flatten()
y[is_coded_one].mean()

5.2