In [1]:
import numpy as np
import pandas as pd

# Relative path of the red-wine quality CSV that the notebook analyses.
PATH = "data/winequality-red.csv"

# Read the full file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=PATH)
df.head(n=5)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

y_label = "quality"

# Fixed seed: without it every run draws a different 80/20 split, so none of
# the numbers printed below could be reproduced.
RANDOM_STATE = 42

# Split features and target together instead of splitting the whole frame and
# then dropping the target in place. In-place drops on the frames returned by
# train_test_split are not idempotent (re-running the cell raises KeyError)
# and can trigger pandas' SettingWithCopy warnings.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[y_label]),
    df[y_label],
    train_size=0.8,
    random_state=RANDOM_STATE,
)

# Single predictor used for the simple regression.
# ("free sulfur dioxide" is another column that can be tried here.)
x_label = "alcohol"

# Selecting with a one-element list keeps the data two-dimensional, which is
# the (n_samples, 1) layout the estimator expects for a single feature.
x_train = X_train[[x_label]].to_numpy()
x_test = X_test[[x_label]].to_numpy()

reg = LinearRegression().fit(x_train, y_train)
predictions = reg.predict(x_test)

# scikit-learn's documented argument order is (y_true, y_pred).
mse = mean_squared_error(y_test, predictions)

print(f'MSE is {mse}')
print(f'beta_1 is {reg.coef_[0]}')
print(f'beta_0 is {reg.intercept_}')
# Note: this R2 is measured on the training split, whereas the MSE above is
# measured on the held-out test split.
print(f'R2 is {reg.score(x_train, y_train)}')


MSE is 0.4723502743930517
beta_1 is 0.34654656318820065
beta_0 is 2.0215994478490438
R2 is 0.21204771216486196


In [3]:
def printRegression(reg, x, y):
    """Print summary diagnostics for an already-fitted linear model.

    Parameters
    ----------
    reg : fitted estimator
        Must expose ``predict``, ``score``, ``coef_`` and ``intercept_``.
    x : array-like
        Feature values passed to ``reg.predict`` and ``reg.score``.
    y : array-like
        Observed target values that the predictions are compared against.

    Prints the mean squared error of ``reg.predict(x)`` against ``y``, the
    first coefficient, the intercept, and ``reg.score(x, y)``. Only
    ``coef_[0]`` is shown, so any further coefficients are not reported.
    """
    # scikit-learn documents the order as (y_true, y_pred); the value is the
    # same either way, but the documented order keeps the call unambiguous.
    print(f'mean square error is {mean_squared_error(y, reg.predict(x))}')
    print(f'beta_1 is {reg.coef_[0]}')
    print(f'beta_0 is {reg.intercept_}')
    print(f'R2 is {reg.score(x, y)}')


In [4]:
# Noise-free sanity check: every point lies exactly on y = 2x + 3, so the fit
# should recover slope 2 and intercept 3 with an R2 of 1.

x = np.arange(10)
y = 3 + 2 * x
x = x[:, np.newaxis]  # column vector: one feature, ten samples

reg = LinearRegression()
reg = reg.fit(x, y)
printRegression(reg, x, y)


mean square error is 1.3016204936146695e-29
beta_1 is 1.999999999999999
beta_0 is 3.0000000000000053
R2 is 1.0


In [5]:
# A synthetic example where the points lie on a noisy line:
# y = 2x + 3 + eps, with eps drawn from a normal distribution N(mu, sigma).
x = np.arange(0, 10, 1)
mu, sigma = 0.0, 1.0
# Draw the noise from a seeded generator so the printed diagnostics are the
# same on every Restart & Run All; the unseeded global RNG gave new values
# on each execution.
eps = np.random.default_rng(seed=0).normal(mu, sigma, len(x))
y = 2*x + 3 + eps
x = x.reshape(-1,1)
reg = LinearRegression().fit(x,y)
printRegression(reg, x, y)


mean square error is 1.8437026583224434
beta_1 is 2.0890175674806617
beta_0 is 2.799763772530291
R2 is 0.9512849262797923


In [6]:
# Import the project's own implementation under an alias. Importing it as
# plain `LinearRegression` would rebind the name that the earlier cells use
# for scikit-learn's estimator, so re-running those cells after this one
# would silently pick up the wrong class.
from linear_regression import LinearRegression as ScratchLinearRegression

def printReg(reg):
    """Print the fitted parameters and R2 of a from-scratch regression model.

    ``reg`` must expose the attributes ``beta0_hat`` and ``beta1_hat`` and a
    zero-argument ``r2()`` method. Each value is printed with the f-string
    ``=`` specifier, so the expression text appears next to its value.
    """
    print(f'{reg.beta0_hat=}')
    print(f'{reg.beta1_hat=}')
    print(f'{reg.r2()=}')

# Noise-free data on the line y = 20x + 300, fitted with each of the fitting
# methods the class offers so their estimates can be compared side by side.
x = np.arange(0,10,1)
y = 20*x + 300

# Default fit.
reg = ScratchLinearRegression()
reg.fit(x,y)
printReg(reg)

# Presumably gradient descent -- confirm against linear_regression.py.
reg_gd = ScratchLinearRegression()
reg_gd.gd_fit(x,y)
printReg(reg_gd)

# Presumably ridge-penalised fit -- confirm against linear_regression.py.
reg_ridge = ScratchLinearRegression()
reg_ridge.ridge_fit(x,y)
printReg(reg_ridge)

# Presumably lasso-penalised fit -- confirm against linear_regression.py.
reg_lasso = ScratchLinearRegression()
reg_lasso.lasso_fit(x,y)
printReg(reg_lasso)

# Fit the same class on the wine training data (single feature `x_label`) so
# the result can be compared with the scikit-learn fit on the same split.
my_reg = ScratchLinearRegression()
x = X_train[x_label].to_numpy()
y = y_train.to_numpy()
my_reg.fit(x,y)
printReg(my_reg)


reg.beta0_hat=300.00000000000017
reg.beta1_hat=19.999999999999996
reg.r2()=1.0
reg.beta0_hat=298.9952894234932
reg.beta1_hat=20.160225950423946
reg.r2()=0.9999114305750781
reg.beta0_hat=224.16497325386024
reg.beta1_hat=31.86236579693264
reg.r2()=0.4954230546064924
reg.beta0_hat=298.7053879684314
reg.beta1_hat=20.20248950039098
reg.r2()=0.9998529487401713
reg.beta0_hat=2.0215994478491393
reg.beta1_hat=0.34654656318818844
reg.r2()=0.21204771216486207
