In [1]:
import pandas as pd
import numpy as np
import joblib

from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_val_score


In [2]:
red_wine_data = pd.read_csv('winequality-red.csv', sep=';')
white_wine_data = pd.read_csv('winequality-white.csv', sep=';')

red_wine_data


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
# split the data into features using train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    red_wine_data.drop('quality', axis=1), 
    red_wine_data['quality'], 
    test_size=0.2, 
    random_state=42
)

X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
493,8.7,0.690,0.31,3.0,0.086,23.0,81.0,1.00020,3.48,0.74,11.6
354,6.1,0.210,0.40,1.4,0.066,40.5,165.0,0.99120,3.25,0.59,11.9
342,10.9,0.390,0.47,1.8,0.118,6.0,14.0,0.99820,3.30,0.75,9.8
834,8.8,0.685,0.26,1.6,0.088,16.0,23.0,0.99694,3.32,0.47,9.4
705,8.4,1.035,0.15,6.0,0.073,11.0,54.0,0.99900,3.37,0.49,9.9
...,...,...,...,...,...,...,...,...,...,...,...
1130,9.1,0.600,0.00,1.9,0.058,5.0,10.0,0.99770,3.18,0.63,10.4
1294,8.2,0.635,0.10,2.1,0.073,25.0,60.0,0.99638,3.29,0.75,10.9
860,7.2,0.620,0.06,2.7,0.077,15.0,85.0,0.99746,3.51,0.54,9.5
1459,7.9,0.200,0.35,1.7,0.054,7.0,15.0,0.99458,3.32,0.80,11.9


In [4]:
# Using the StandardScaler to standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)
# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the SVM model on the red wine dataset: {accuracy:.2f}")

Accuracy of the SVM model on the red wine dataset: 0.56


In [None]:
# DONT RUN THIS CODE, IT WILL TAKE A LONG TIME

# I want to use the grid search to find the best hyperparameters for the SVM model
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(
    SVC(), 
    param_grid, 
    cv=5, 
    scoring='accuracy', 
    verbose=3, 
    n_jobs=-1
)


# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters from grid search: {best_params}")
print(f"Best cross-validation score: {best_score:.2f}")

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best parameters from grid search: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best cross-validation score: 0.65


In [58]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)

# Calculate the accuracy of the best model
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Accuracy of the best SVM model on the test set: {best_accuracy:.2f}")


Accuracy of the best SVM model on the test set: 0.65


In [None]:
# DONT RUN THIS CODE, IT WILL TAKE A LONG TIME

# Now let's apply the same process to the white wine dataset
# Split the white wine data into features and target variable
X_white = white_wine_data.drop("quality", axis=1)
y_white = white_wine_data["quality"]

# Split the white wine data into training and test sets
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(
    X_white, y_white, test_size=0.2, random_state=42
)
# Standardize the features for the white wine dataset
scaler_white = StandardScaler()
X_train_white_scaled = scaler_white.fit_transform(X_train_white)
X_test_white_scaled = scaler_white.transform(X_test_white)

# Create and train the SVM model for the white wine dataset
svm_model_white = SVC(kernel='linear', random_state=42)
svm_model_white.fit(X_train_white_scaled, y_train_white)

# Make predictions on the test set for the white wine dataset
y_pred_white = svm_model_white.predict(X_test_white_scaled)

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 'scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

# Calculate the accuracy of the model for the white wine dataset
accuracy_white = accuracy_score(y_test_white, y_pred_white)
print(f"Accuracy of the SVM model on the white wine dataset: {accuracy_white:.2f}")

# Perform grid search for hyperparameter tuning on the white wine dataset
grid_search_white = GridSearchCV(
    SVC(), 
    param_grid, 
    cv=5, 
    scoring='accuracy', 
    verbose=3, 
    n_jobs=-1
)

# Fit the grid search to the training data for the white wine dataset
grid_search_white.fit(X_train_white_scaled, y_train_white)

# Get the best parameters and the best score for the white wine dataset
best_params_white = grid_search_white.best_params_
best_score_white = grid_search_white.best_score_

print(f"Best parameters from grid search for white wine: {best_params_white}")
print(f"Best cross-validation score for white wine: {best_score_white:.2f}")

# Evaluate the best model on the test set for the white wine dataset
best_model_white = grid_search_white.best_estimator_
y_pred_best_white = best_model_white.predict(X_test_white_scaled)

# Calculate the accuracy of the best model for the white wine dataset
best_accuracy_white = accuracy_score(y_test_white, y_pred_best_white)
print(f"Accuracy of the best SVM model on the white wine test set: {best_accuracy_white:.2f}")

In [None]:
# Now let's apply the same process to the white wine dataset
# Split the white wine data into features and target variable
X_white = white_wine_data.drop("quality", axis=1)
y_white = white_wine_data["quality"]

# Split the white wine data into training and test sets
X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(
    X_white, y_white, test_size=0.2, random_state=42
)

# Standardize the features for the white wine dataset
scaler_white = StandardScaler()
X_train_white_scaled = scaler_white.fit_transform(X_train_white)
X_test_white_scaled = scaler_white.transform(X_test_white)

param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

grid = list(ParameterGrid(param_grid))
results = []

# Barra di avanzamento
for params in tqdm(grid, desc="Grid search progress"):
    model = SVC(**params)
    scores = cross_val_score(model, X_train_white_scaled, y_train_white, cv=5)
    mean_score = np.mean(scores)
    results.append({'params': params, 'score': mean_score})

# Trova i migliori parametri
best_result = max(results, key=lambda x: x['score'])

print("Migliori parametri:", best_result['params'])
print(f"Miglior score medio: {best_result['score']:.2f}")

# Instanciate the best model with the best parameters
best_model_white = SVC(**best_result['params'])

# Fit the best model on the training data
best_model_white.fit(X_train_white_scaled, y_train_white)

# Make predictions on the test set for the white wine dataset
y_pred_best_white = best_model_white.predict(X_test_white_scaled)

# Calculate the accuracy of the best model for the white wine dataset
best_accuracy_white = accuracy_score(y_test_white, y_pred_best_white)
print(f"Accuracy of the best SVM model on the white wine test set: {best_accuracy_white:.2f}")

Grid search progress: 100%|██████████| 12/12 [01:12<00:00,  6.06s/it]


Migliori parametri: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Miglior score medio: 0.5783531420230928
Accuracy of the best SVM model on the white wine test set: 0.59
