In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned wine-quality table and discard the "Unnamed: 0" column
# (presumably an index that was written out with the CSV -- TODO confirm).
WineQualityDf = pd.read_csv("../data/WineQualityDf_cleaned.csv").drop(columns="Unnamed: 0")

# Target vector is the "quality" column; the feature matrix is every other column.
y = WineQualityDf["quality"].to_numpy()
X_matrix = WineQualityDf.drop(columns="quality").to_numpy()

# Separating data into training and testing datasets

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_matrix, y, test_size=0.2, random_state=62
)

# Prepend a column of ones to each design matrix so that the first fitted
# coefficient plays the role of the intercept.
ones_column_train = np.ones((X_train.shape[0], 1))
X_train = np.concatenate((ones_column_train, X_train), axis=1)

ones_column_test = np.ones((X_test.shape[0], 1))
X_test = np.concatenate((ones_column_test, X_test), axis=1)

In [4]:
# Sanity check: after the previous cell, the first column of the training
# design matrix should be the prepended column of ones.
X_train

array([[ 1.  , 11.1 ,  0.18, ...,  3.22,  0.64, 10.1 ],
       [ 1.  , 11.6 ,  0.41, ...,  3.13,  0.53, 10.  ],
       [ 1.  , 11.5 ,  0.45, ...,  3.26,  1.11, 11.  ],
       ...,
       [ 1.  ,  6.8 ,  0.48, ...,  3.34,  0.6 , 10.4 ],
       [ 1.  ,  8.1 ,  1.33, ...,  3.54,  0.48, 10.9 ],
       [ 1.  ,  7.7 ,  0.18, ...,  3.37,  0.78, 11.8 ]])

## Model training

In [7]:
# Ordinary least squares fit on the training design matrix (intercept column
# included). np.linalg.lstsq minimises ||X_train @ b - y_train|| directly,
# which gives the same solution as the normal equations
# b = (X^T X)^{-1} X^T y when X^T X is invertible, but avoids forming an
# explicit inverse: that is numerically fragile for ill-conditioned feature
# sets and raises LinAlgError when columns are exactly collinear.
coefficients, *_ = np.linalg.lstsq(X_train, y_train, rcond=None)
coefficients

array([ 6.74370143e+00,  1.31175071e-02, -1.15799614e+00, -2.07245472e-01,
       -8.19955630e-03, -1.17109425e+00,  3.90508567e-03, -3.20319979e-03,
       -2.16337341e+00, -5.48623572e-01,  8.20808533e-01,  2.96205162e-01])

In [13]:
def get_coeff(X, y, n):
    """Fit an OLS model on ``n`` different random 80/20 train/test splits.

    For each ``i`` in ``0 .. n-1`` the data is split with
    ``train_test_split(X, y, test_size=0.2, random_state=i)``, a column of
    ones is prepended to the training features (intercept term), and the
    least-squares coefficients are computed on the training part only.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per observation, WITHOUT an intercept
        column (it is added here).
    y : numpy.ndarray
        Target values, one per row of ``X``.
    n : int
        Number of random splits to fit; the split seeds are ``0 .. n-1``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, X.shape[1] + 1)``; row ``i`` holds the
        coefficients (intercept first) fitted on split ``i``. When ``n`` is
        0 an empty 1-D array is returned.
    """
    coeff_distributions = []
    for seed in range(n):
        # Only the training part is needed for fitting; the held-out part is
        # discarded instead of being augmented and then never used.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=seed)

        # Prepend the intercept column of ones.
        design = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

        # Least-squares solve instead of inverting X^T X explicitly: same
        # solution when X^T X is invertible, but numerically more stable and
        # still defined (minimum-norm solution) for collinear columns.
        coefficients, *_ = np.linalg.lstsq(design, y_train, rcond=None)
        coeff_distributions.append(coefficients)

    return np.array(coeff_distributions)


In [37]:
# Number of random train/test splits to fit.
N = 40_000

# get_coeff returns one row per split; transpose so that each row holds the
# values of a single coefficient across all splits.
coeff_distributions = np.transpose(get_coeff(X_matrix, y, N))

In [39]:
# Summarise every coefficient's distribution over the random splits.
# Each iteration's statistics are collected into a table; previously they
# were overwritten on every pass, so only the last coefficient's values
# survived the loop and nothing was displayed.
n_train = len(X_train)  # loop-invariant: size of the single split made earlier in the notebook
coeff_summary_rows = []
for coeff in coeff_distributions:
    coeff_value = np.mean(coeff)
    # NOTE(review): the spread across splits is divided by sqrt(len(X_train)).
    # Formula kept as-is; confirm this is the intended standard-error estimate
    # (the std across resampled fits is often used directly).
    coeff_ste = np.std(coeff) / n_train**0.5
    coeff_t = coeff_value / coeff_ste
    coeff_summary_rows.append(
        {"mean": coeff_value, "std_error": coeff_ste, "t_value": coeff_t}
    )

# One row per coefficient, in the same order as the rows of coeff_distributions.
coeff_summary = pd.DataFrame(coeff_summary_rows)
coeff_summary
