# SVM Classifier

**Import Libraries**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

**Load the dataset**

In [4]:
# Read the white-wine quality table from the working directory
data = pd.read_csv(filepath_or_buffer="./winequality-white.csv")

**Overview**

In [5]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
# Count the rows per quality score to see how balanced the target is
data.value_counts(subset=['quality'])

quality
6          2198
5          1457
7           880
8           175
4           163
3            20
9             5
Name: count, dtype: int64

In [7]:
# Separate the features (X) and the target (y)
X = data.drop(columns='quality')
y = data['quality']

# Rank every feature against the target with a chi-squared test and keep the
# k highest-scoring ones.
# NOTE: chi2 expects non-negative inputs and its statistic grows with a
# feature's magnitude, so these raw-value scores favour large-valued columns.
k = 5  # number of features to keep; adjust for the dataset and problem
best_features = SelectKBest(score_func=chi2, k=k)
fit = best_features.fit(X, y)

# Tabulate the per-feature scores, highest first
feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
)
print(feature_scores)

# Keep only the top-k columns, ordered by descending score
selected_features = feature_scores.head(k)['Feature'].tolist()
X_selected = X[selected_features]


                 Feature        Score
6   total sulfur dioxide  3358.298433
5    free sulfur dioxide   947.712845
3         residual sugar   501.235224
10               alcohol   155.078234
1       volatile acidity    12.622636
0          fixed acidity     7.922042
4              chlorides     2.529005
2            citric acid     0.851118
9              sulphates     0.579206
8                     pH     0.428745
7                density     0.005065


In [8]:
# All columns are kept: a manual restriction to residual sugar, alcohol,
# density, chlorides and quality is left disabled.
# Re-check the class balance of the target before splitting.
data.value_counts(subset=['quality'])

quality
6          2198
5          1457
7           880
8           175
4           163
3            20
9             5
Name: count, dtype: int64

**Split Data**

In [9]:

# The target is the quality score; the features are every other column
y = data["quality"]
X = data.drop("quality", axis=1)

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [10]:
# Learn the standardization parameters from the training split only, then
# apply that same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Preview the first five rows of the scaled training features
pd.DataFrame(data=X_train).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-1.476971,-0.676949,-0.777631,-0.396463,-0.876675,0.090254,0.273654,-1.015152,0.727373,-1.572934,0.811436
1,-0.773832,0.225769,-0.199185,-0.945154,-0.50239,0.265528,-0.762947,-1.63357,0.465637,-0.870325,1.954627
2,-0.539453,-0.075137,-0.11655,-0.376867,9.088661,1.492444,0.839073,0.248282,-0.450439,-0.079891,-0.90335
3,-0.656643,-0.576647,0.048721,-1.023539,-0.455604,-0.201869,-0.998539,-1.477303,-0.843043,2.203587,0.893092
4,-1.82854,0.225769,-1.438712,-0.435656,-0.783104,0.090254,-0.904302,-1.181394,0.727373,-1.133804,1.056405


**Train And Evaluate Model**

In [12]:


# Fit an RBF-kernel support vector classifier on the training features.
# random_state only takes effect when probability=True; it is kept so results
# stay reproducible if probability estimates are enabled later.
model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

# Predict quality labels for the held-out test set
y_pred = model.predict(X_test)

# Overall accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# The quality classes are heavily imbalanced, so also report per-class
# precision/recall/F1. zero_division=0 scores classes that are never
# predicted as 0 instead of emitting an undefined-metric warning.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

Accuracy: 0.5663265306122449
