In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

## Prepare dataset

In [2]:
df = pd.read_csv('wine.csv', sep=';')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
1,7.8,0.88,0.00,2.6,0.098,25.0,67,0.99680,3.20,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54,0.99700,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60,0.99800,3.16,0.58,9.8,6
4,7.4,0.70,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1195,6.2,0.43,0.22,1.8,0.078,21.0,56,0.99633,3.52,0.60,9.5,6
1196,7.9,0.58,0.23,2.3,0.076,23.0,94,0.99686,3.21,0.58,9.5,6
1197,7.7,0.57,0.21,1.5,0.069,4.0,9,0.99458,3.16,0.54,9.8,6
1198,7.7,0.26,0.26,2.0,0.052,19.0,77,0.99510,3.15,0.79,10.9,6


## Create A and b

In [3]:
def make_A_b(deg, data, y):
    # data: dataframe, y: Series, deg: int
    data.reset_index(inplace=True, drop=True)
    y.reset_index(inplace=True, drop=True)
    A = pd.Series(np.ones(data.shape[0]))
    for i in range(data.shape[1]):
        for d in range(1, deg + 1):
            A = pd.concat((A, data.iloc[:, i]**d), axis=1)
    return A, y

In [4]:
def solve(A, b):
    return np.linalg.pinv(A) @ b

## Part 1 (all attributes)

**Solve**

In [5]:
x = df.iloc[:,:-1]
y = df.iloc[:, -1]
A, b = make_A_b(1,x, y)
v = solve(A, b)

**Result**

In [6]:
v

array([ 4.32363757e+01,  4.79658267e-02, -1.06797380e+00, -2.68453927e-01,
        3.50267451e-02, -1.59557504e+00,  3.47539059e-03, -3.79299466e-03,
       -3.98102920e+01, -2.40172280e-01,  7.74368364e-01,  2.69212248e-01])

**Norm**

In [7]:
np.linalg.norm(A @ v - b)

22.095010916094765

## Part 2 (1 attributes)

In [8]:
def cross_validation(k, df):
    best_one = None
    df = shuffle(df)
    df.reset_index(inplace=True, drop=True)
    pieces = int(df.shape[0]/k)
    for i in range(df.shape[1]-1):
        last = -pieces
        for _ in range(k):
            last += pieces
            x = df.drop(labels=range(last,last + pieces)).filter(items=[df.columns[i]], axis=1)
            y = pd.concat([df.iloc[0:last, -1],df.iloc[last + pieces:, -1]])
            A, b = make_A_b(1,x, y)
            v = solve(A, b)
            est_diff = np.linalg.norm(A @ v - b)
            if best_one is not None:
                if est_diff < best_one[0]:
                    best_one = est_diff, v, i
            else:
                best_one = est_diff, v, i
    return best_one[1], best_one[2]

**Solve**

In [9]:
k = 5
best = cross_validation(k, df)

**Result**

In [10]:
best[0]

array([1.77579888, 0.37458485])

**Choosen attribute**

In [11]:
df.columns[best[1]]

'alcohol'

**Norm**

In [12]:
A, b = make_A_b(1, df.filter(items=[df.columns[best[1]]]), df.iloc[:,-1])
np.linalg.norm(A @ best[0] - b)

24.184396815304787

## Part 3 (1 - 3 attributes)

In [13]:
def find_model(df, deg):
    best_one = None
    df = shuffle(df)
    df.reset_index(inplace=True, drop=True)
    for i in range(df.shape[1] - 3):
        for j in range(i + 1,df.shape[1] - 2):
            for k in range(j + 1, df.shape[1] - 1):
                x = pd.concat((df.iloc[:,i], df.iloc[:,j], df.iloc[:,k]),axis=1)
                y = df.iloc[:,-1]
                A, b = make_A_b(deg,x, y)
                v = solve(A, b)
                est_diff = np.linalg.norm(A @ v - b)
                if best_one is not None:
                    if est_diff < best_one[0]:
                        best_one = est_diff, v, (i, j, k)
                else:
                    best_one = est_diff, v, (i, j, k)
    return best_one

**Solve**

In [14]:
deg = 1
best = find_model(df, deg)

**Result**

In [15]:
best[1]

array([ 3.19033261e+00, -1.26488671e+00, -2.80309069e-03,  3.14204597e-01])

**Choosen attributes**

In [16]:
[df.columns[i] for i in best[2]]    

['volatile acidity', 'total sulfur dioxide', 'alcohol']

**Norm**

In [17]:
best[0]

22.640229234488324