In [1]:
from sklearn import neighbors
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import numpy as np
import pandas as pd

In [2]:
# UCI "Wine Quality" data set (white wines). The file is semicolon-delimited,
# so the separator has to be given explicitly.
filename = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
df = pd.read_csv(
    filename,
    sep=';',
)

In [3]:
# Preview the first five rows to check that the columns were parsed correctly.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
# Features: every column except the last one. Target: the last column
# (`quality` in the preview above).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# proportion of each quality score approximately equal in both partitions,
# and the fixed seed makes the split reproducible.
# NOTE(review): the df.head() preview shows two identical rows (index 3 and 4).
# If duplicates are common they can end up on both sides of the split, which
# would favour kNN -- consider checking df.duplicated().sum().
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

Todos os nossos resultados usando a métrica RMSE devem ser melhores (isto é, menores) do que o desvio padrão de y_train, que corresponde aproximadamente ao RMSE de um modelo que sempre prevê a média:

In [6]:
# Baseline: population standard deviation (ddof=0, NumPy's default) of the
# training targets.
np.std(y_train, ddof=0)

0.8860105442249219

## MinMax Scaler

In [7]:
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
X_train_minmax = scaler.transform(X_train)
X_test_minmax = scaler.transform(X_test)

## Standard Scaler

In [8]:
# Learn the per-feature mean/std from the training split only, then apply the
# same standardisation to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

## Cálculo dos resultados

In [15]:
ks = [3, 5, 7, 9, 11]
scalers = ['no scaler', 'minmax', 'std']

# Pre-scaled train/test feature matrices, keyed by the label that is stored in
# the 'scaler' column of the results.
features_by_scaler = {
    'minmax': (X_train_minmax, X_test_minmax),
    'std': (X_train_std, X_test_std),
}

# kNN weighting schemes to compare: (value passed to `weights`, label used in
# the model name). 'uniform' is KNeighborsRegressor's default weighting.
knn_weightings = [('uniform', '-'), ('distance', 'distancia')]


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and return its RMSE on the evaluation data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_fit, y_fit : features and targets used for fitting.
    X_eval, y_eval : features and targets used for scoring.

    Returns
    -------
    The root mean squared error of the predictions on ``X_eval``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    # Take the square root explicitly instead of passing `squared=False`:
    # that argument was deprecated in scikit-learn 1.4 and removed in 1.6.
    return np.sqrt(metrics.mean_squared_error(y_eval, y_pred))


results = []
# The loop variable is `scaler_name` (not `scaler`) so the fitted scaler object
# created in the scaling cells is not overwritten by a string.
for scaler_name in scalers:
    # Labels without a pre-scaled matrix fall back to the raw features.
    X_train_, X_test_ = features_by_scaler.get(scaler_name, (X_train, X_test))

    # Linear regression, one entry per scaler.
    rmse = _fit_and_score(
        linear_model.LinearRegression(), X_train_, y_train, X_test_, y_test
    )
    results.append({'model': 'LR', 'scaler': scaler_name, 'rmse': rmse})

    # kNN regressors: for each k, uniform weighting first, then
    # distance weighting (same order as the rows were appended before).
    for k in ks:
        for weights, weights_label in knn_weightings:
            model = neighbors.KNeighborsRegressor(
                n_neighbors=k, weights=weights, metric='euclidean'
            )
            rmse = _fit_and_score(model, X_train_, y_train, X_test_, y_test)
            results.append({
                'model': f'kNN (k={k}, peso={weights_label})',
                'scaler': scaler_name,
                'rmse': rmse,
            })


## Apresentação dos resultados do melhor para o pior

In [16]:
# One row per (model, scaler) combination; lower RMSE is better, so an
# ascending sort lists the best configurations first.
df_results = pd.DataFrame(data=results)
df_results.sort_values(by='rmse', ascending=True)

Unnamed: 0,model,scaler,rmse
32,"kNN (k=11, peso=distancia)",std,0.620329
30,"kNN (k=9, peso=distancia)",std,0.622775
28,"kNN (k=7, peso=distancia)",std,0.623511
26,"kNN (k=5, peso=distancia)",std,0.639191
21,"kNN (k=11, peso=distancia)",minmax,0.643013
19,"kNN (k=9, peso=distancia)",minmax,0.648218
17,"kNN (k=7, peso=distancia)",minmax,0.649837
15,"kNN (k=5, peso=distancia)",minmax,0.661669
24,"kNN (k=3, peso=distancia)",std,0.675146
13,"kNN (k=3, peso=distancia)",minmax,0.686098
