In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
# Suppress every warning for the rest of the session (this also hides any
# library warnings, e.g. solver convergence messages, if they are raised).
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator so that code drawing from it
# behaves the same way on each fresh Restart & Run All.
np.random.seed(42)

In [2]:
# White-wine quality data; the CSV is semicolon-delimited.
WINE_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_raw = pd.read_csv(WINE_URL, sep = ';')

# `quality` is the target; every remaining column stays in `df` as a feature.
y = wine_raw['quality']
df = wine_raw.drop(columns = 'quality')

In [3]:
# Preview the first five feature rows (the target column was already removed).
df.head(n = 5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [3]:
# Replace missing values in each numeric column with that column's mean.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split, so test rows contribute to the imputation values.
df = df.fillna(df.mean(numeric_only = True))

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# re-running this cell on its own reproduces the same split instead of
# depending on how much of the global NumPy RNG has already been consumed.
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 42)

In [4]:
# Baseline: logistic regression with default settings on the unscaled features.
lr = LogisticRegression().fit(train, y_train)
y_pred = lr.predict(test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score baseline:', baseline_accuracy)

Accuracy score baseline: 0.5142857142857142


In [5]:
def fit_predict(train, test, y_train, y_test, scaler, kernel = 'linear', C = 1.0, degree = 3):
    """Scale the features, fit an SVC on the training set and score it on the test set.

    The scaler is fitted on ``train`` only and then applied to ``test``, so the
    test rows do not influence the scaling parameters.

    Parameters
    ----------
    train, test
        Feature tables used for fitting and for evaluation respectively.
    y_train, y_test
        Labels matching ``train`` and ``test``.
    scaler
        Object providing ``fit_transform`` and ``transform``
        (e.g. ``StandardScaler()``).
    kernel
        Kernel name forwarded to ``SVC``.
    C
        Penalty parameter forwarded to ``SVC``.
    degree
        Polynomial degree forwarded to ``SVC``.

    Returns
    -------
    The test-set accuracy as computed by ``accuracy_score``. The value is also
    printed, so existing callers that ignore the return value see the same
    output as before.
    """
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    svc = SVC(kernel = kernel, degree = degree, C = C)
    svc.fit(train_scaled, y_train)
    y_pred = svc.predict(test_scaled)
    score = accuracy_score(y_test, y_pred)
    print(score)
    # Returned so callers can collect and compare scores instead of copying
    # numbers out of the printed output.
    return score

### Kernel tuning

In [6]:
# Compare four SVC kernels, leaving C and degree at fit_predict's defaults.
KERNELS = ('linear', 'poly', 'rbf', 'sigmoid')
for kernel in KERNELS:
    label = 'Accuracy score using {0} kernel:'.format(kernel)
    print(label, end = ' ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), kernel = kernel)

Accuracy score using linear kernel: 0.5091836734693878
Accuracy score using poly kernel: 0.5255102040816326
Accuracy score using rbf kernel: 0.5612244897959183
Accuracy score using sigmoid kernel: 0.40408163265306124


### Penalty tuning

In [7]:
# Sweep the penalty C over six log-spaced values from 2**-1 (0.5) to 2**3 (8.0)
# using the rbf kernel.
for penalty in np.logspace(-1, 3, base = 2, num = 6):
    print('Accuracy score using penalty = {0} with rbf kernel:'.format(penalty), end = ' ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 'rbf', penalty)

Accuracy score using penalty = 0.5 with rbf kernel: 0.5408163265306123
Accuracy score using penalty = 0.8705505632961241 with rbf kernel: 0.560204081632653
Accuracy score using penalty = 1.5157165665103982 with rbf kernel: 0.5581632653061225
Accuracy score using penalty = 2.6390158215457893 with rbf kernel: 0.5642857142857143
Accuracy score using penalty = 4.59479341998814 with rbf kernel: 0.5775510204081633
Accuracy score using penalty = 8.0 with rbf kernel: 0.5918367346938775


### Choosing degree for poly kernel

In [8]:
# Sweep the polynomial degree from 2 to 5 with the poly kernel at C = 1.5.
for degree in range(2, 6):
    label = 'Accuracy score using degree = {0} with poly kernel:'.format(degree)
    print(label, end = ' ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), kernel = 'poly', C = 1.5, degree = degree)

Accuracy score using degree = 2 with poly kernel: 0.48673469387755103
Accuracy score using degree = 3 with poly kernel: 0.5183673469387755
Accuracy score using degree = 4 with poly kernel: 0.5214285714285715
Accuracy score using degree = 5 with poly kernel: 0.5306122448979592


In [9]:
# Scores copied from the recorded outputs above: the logistic-regression
# baseline and the best SVC run (rbf kernel, penalty = 8.0).
original_score = 0.514285714286
best_score = 0.591836734694

# Relative change versus the baseline as a percentage, rounded to two
# decimals; the absolute value drops the sign so the gain reads as positive.
signed_change_pct = 100*(original_score - best_score)/original_score
improvement = np.abs(np.round(signed_change_pct, 2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 15.08 %
