In [1]:
# input: 1) X: the independent variables (data matrix), an (N x D)-dimensional matrix, as a numpy array
#        2) y: the dependent variable, an N-dimensional vector, as a numpy array
## output: 1) the regression coefficients, a (D+1)-dimensional vector, as a numpy array
## note: remember to either expect an initial column of 1's in the input X, or to append this within your code

In [82]:
# Exercise 2
import numpy as np
def multivarlinreg(X, y):
    """Fit an ordinary least-squares linear regression with an intercept.

    Parameters
    ----------
    X : numpy.ndarray
        Data matrix of independent variables WITHOUT a bias column; a
        leading column of ones is prepended here via ``add_one``.
    y : numpy.ndarray
        Dependent variable, one value per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Regression coefficients, intercept first. The result has the same
        number of dimensions as ``y`` (1-D ``y`` gives a 1-D vector, a
        column-vector ``y`` gives a column vector).
    """
    X = add_one(X)
    # Solve min ||X w - y||^2 directly rather than forming inv(X.T @ X):
    # the explicit inverse squares the condition number and breaks down
    # (LinAlgError or meaningless values) when features are collinear.
    # lstsq is SVD-based and returns the minimum-norm solution in that
    # case; for full-rank X both approaches agree up to rounding.
    w = np.linalg.lstsq(X, y, rcond=None)[0]
    return w
    
def add_one(X):
    """Prepend a column of ones (the intercept term) to a data matrix.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample. A 1-D input is treated as
        a single feature column.

    Returns
    -------
    numpy.ndarray
        2-D array whose first column is all ones, followed by the columns
        of ``X``.
    """
    X = np.asarray(X)
    # Only the row count is needed. Unpacking ``rows, cols = np.shape(X)``
    # would reject a 1-D feature vector, which np.c_ stacks as a column
    # without any trouble.
    n_rows = X.shape[0]
    return np.c_[np.ones(n_rows), X]

def print_names(weights, names):
    """Print every weight next to its label, one ``label:  value`` per line.

    ``names[i]`` labels ``weights[i]``; ``names`` must therefore be at
    least as long as ``weights``. A trailing blank block is printed after
    the last pair to separate consecutive listings.
    """
    for position, weight in enumerate(weights):
        label = names[position]
        print(f"{label}:  {weight}")
    print("\n")

In [83]:
# Training split: every column but the last is a feature, the last column
# is the target.
wine_train = np.loadtxt("redwine_training.txt")

# Labels for the fitted coefficients: the intercept first, then one label
# per feature column.
names = [
    "Intercept",
    "Fixed acidity",
    "Volatile acidity",
    "Citric acid",
    "Residual sugar",
    "Chlorides",
    "Free sulfur dioxide",
    "Total sulfur dioxide",
    "Density",
    "pH",
    "Sulfates",
    "Alcohol",
]

# Range slices (``:1``, ``-1:``) keep the single-column selections 2-D.
x_train, y_train = wine_train[:, :-1], wine_train[:, -1:]
x_train_col1 = x_train[:, :1]

# Model (a): regress on the first feature only.
weights_one = multivarlinreg(x_train_col1, y_train)
print("Weights for first feature:")
print_names(weights_one, names)
# Fixed acidity does not look very important: its coefficient is small.

# Model (b): regress on all features.
weights_full = multivarlinreg(x_train, y_train)
print("Weights for all features:")
print_names(weights_full, names)
# Density has by far the largest (negative) coefficient, chlorides is
# somewhat important, and the remaining coefficients are small.
# NOTE(review): these are raw regression coefficients, not correlations,
# and their magnitudes depend on each feature's scale -- standardise the
# features before ranking importance.

Weights for first feature:
Intercept:  [5.2057261]
Fixed acidity:  [0.05035934]


Weights for all features:
Intercept:  [51.65737174]
Fixed acidity:  [0.01958527]
Volatile acidity:  [-1.06193618]
Citric acid:  [0.02588963]
Residual sugar:  [0.05022816]
Chlorides:  [-2.75489463]
Free sulfur dioxide:  [0.00565346]
Total sulfur dioxide:  [-0.00380729]
Density:  [-47.20924227]
pH:  [-0.42663938]
Sulfates:  [0.85047813]
Alcohol:  [0.2378959]




In [102]:
# Exercise 3
def rmse(pred, true):
    """Root-mean-square error between predictions and ground truth.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    true : array-like
        Observed values, matching ``pred`` up to singleton dimensions
        (e.g. ``(N,)`` against ``(N, 1)``).

    Returns
    -------
    numpy.float64
        ``sqrt(sum((true - pred)**2) / n)`` where ``n`` is the length of
        the first axis of ``pred``.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes cannot be reconciled.
    """
    pred = np.asarray(pred, dtype=float)
    true = np.asarray(true, dtype=float)

    if pred.shape != true.shape:
        # Without this guard, (N,) - (N, 1) silently broadcasts to an
        # N x N matrix and the function returns a plausible-looking but
        # wrong number. Drop singleton axes and insist on an exact match.
        pred, true = np.squeeze(pred), np.squeeze(true)
        if pred.shape != true.shape:
            raise ValueError(
                f"pred and true have incompatible shapes {pred.shape} and {true.shape}"
            )

    # Squeezing (or scalar input) can leave 0-d arrays; give them a length.
    pred, true = np.atleast_1d(pred), np.atleast_1d(true)

    n = len(pred)
    if n == 0:
        raise ValueError("cannot compute the RMSE of empty input")

    residual = true - pred
    return np.sqrt(np.sum(residual ** 2) / n)

def test(weights, test_x, test_y):
    """Score fitted coefficients on a data set and return the RMSE.

    A column of ones is prepended to ``test_x`` (via ``add_one``) so that
    it lines up with ``weights``, whose first entry is the intercept; the
    resulting predictions are compared against ``test_y`` with ``rmse``.
    """
    design = add_one(test_x)
    return rmse(design @ weights, test_y)

# Test split, sliced the same way as the training data: features are all
# columns but the last, the target is the last column (kept 2-D).
wine_test = np.loadtxt("redwine_testing.txt")

x_test, y_test = wine_test[:, :-1], wine_test[:, -1:]
x_test_col1 = x_test[:, :1]

# Evaluate both models fitted in the previous exercise on the test split.
rmse_one = test(weights_one, x_test_col1, y_test)
rmse_full = test(weights_full, x_test, y_test)

print(f"RMSE for first feature: {rmse_one}")
print(f"RMSE for all features: {rmse_full}")
# Using all the features gives a lower test error than using only the
# first feature, which is consistent with the previous exercise.

RMSE for first feature: 0.786089275416223
RMSE for all features: 0.644717277302212
