In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Location of the wine-quality dataset, relative to the notebook's working directory.
file_path = 'DATA/wines-quality.csv'

# Read the CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the column index first, then let the notebook render the first five rows.
print(df.columns)
df.head()

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'color'],
      dtype='object')


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [11]:
### 2 - Create the binary target 'good_quality': 1 when quality >= 6, otherwise 0.
# This assignment is recomputed from 'quality' each time, so repeating it is harmless.
df['good_quality'] = (df['quality'] >= 6).astype(int)

# One-hot encode the categorical 'color' column. drop_first=True drops the first
# category level, so a single indicator column remains for a two-valued column.
#
# get_dummies removes 'color' from the frame and the result is stored back into
# `df`. Without the guard, running this cell a second time in the same kernel
# raises KeyError because 'color' no longer exists; with it, the cell can be
# re-run safely and leaves the already-encoded frame untouched.
if 'color' in df.columns:
    df = pd.get_dummies(df, columns=['color'], drop_first=True)
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,good_quality,color_white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,False
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,False
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,False
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,False
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,False


In [12]:
# Separate the target (y) from the explanatory variables (X).
# 'quality' is removed from X together with 'good_quality', since the target
# is derived directly from it.
y = df['good_quality']
X = df.drop(columns=['quality', 'good_quality'])

In [13]:
# Split into training and test sets: 80% training, 20% test.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Standardize the features (noted as important for K-NN and logistic regression).
# The scaler is fitted on the training split only; the same fitted scaler is then
# applied to both splits so the test data never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
################# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Fit the logistic regression model on the standardized training features,
# with max_iter raised to 500 (the original note says it was increased).
logreg_model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)

# Predict labels for the standardized test features.
y_pred_logreg = logreg_model.predict(X_test_scaled)

# Evaluation: print accuracy, confusion matrix and classification report, in
# that order, each computed from the true and predicted test labels.
for heading, metric in (
    ("Logistic Regression Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.7261538461538461
Confusion Matrix:
 [[265 186]
 [170 679]]
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.59      0.60       451
           1       0.78      0.80      0.79       849

    accuracy                           0.73      1300
   macro avg       0.70      0.69      0.70      1300
weighted avg       0.72      0.73      0.72      1300



In [16]:
################# K-Nearest Neighbors (K-NN)

from sklearn.neighbors import KNeighborsClassifier

# Build the K-NN classifier with k = 5 neighbours and fit it on the
# standardized training features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict labels for the standardized test features.
y_pred_knn = knn_model.predict(X_test_scaled)

# Evaluation: print accuracy, confusion matrix and classification report, in
# that order, each computed from the true and predicted test labels.
for heading, metric in (
    ("K-NN Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_knn))

K-NN Accuracy: 0.7469230769230769
Confusion Matrix:
 [[279 172]
 [157 692]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.62      0.63       451
           1       0.80      0.82      0.81       849

    accuracy                           0.75      1300
   macro avg       0.72      0.72      0.72      1300
weighted avg       0.75      0.75      0.75      1300



In [17]:
################# Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Build the decision tree with a fixed random_state and fit it on the
# unscaled training features (the original note: no scaling needed for a
# decision tree).
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the unscaled test features, matching how the tree was fitted.
y_pred_tree = tree_model.predict(X_test)

# Evaluation: print accuracy, confusion matrix and classification report, in
# that order, each computed from the true and predicted test labels.
for heading, metric in (
    ("Decision Tree Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred_tree))

Decision Tree Accuracy: 0.7507692307692307
Confusion Matrix:
 [[305 146]
 [178 671]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.68      0.65       451
           1       0.82      0.79      0.81       849

    accuracy                           0.75      1300
   macro avg       0.73      0.73      0.73      1300
weighted avg       0.76      0.75      0.75      1300



In [18]:
# Side-by-side comparison of the confusion matrices of the three models.
# Each entry pairs the printed header with that model's test-set predictions.
confusion_sections = (
    ("\nLogistic Regression - Confusion Matrix:\n", y_pred_logreg),
    ("\nK-NN - Confusion Matrix:\n", y_pred_knn),
    ("\nDecision Tree - Confusion Matrix:\n", y_pred_tree),
)
for section_header, predicted_labels in confusion_sections:
    print(section_header, confusion_matrix(y_test, predicted_labels))


Logistic Regression - Confusion Matrix:
 [[265 186]
 [170 679]]

K-NN - Confusion Matrix:
 [[279 172]
 [157 692]]

Decision Tree - Confusion Matrix:
 [[305 146]
 [178 671]]


In [19]:
# Side-by-side comparison of the classification reports (precision, recall,
# f1-score) of the three models.
# Each entry pairs the printed header with that model's test-set predictions.
report_sections = (
    ("\nLogistic Regression - Classification Report:\n", y_pred_logreg),
    ("\nK-NN - Classification Report:\n", y_pred_knn),
    ("\nDecision Tree - Classification Report:\n", y_pred_tree),
)
for section_header, predicted_labels in report_sections:
    print(section_header, classification_report(y_test, predicted_labels))


Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.59      0.60       451
           1       0.78      0.80      0.79       849

    accuracy                           0.73      1300
   macro avg       0.70      0.69      0.70      1300
weighted avg       0.72      0.73      0.72      1300


K-NN - Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.62      0.63       451
           1       0.80      0.82      0.81       849

    accuracy                           0.75      1300
   macro avg       0.72      0.72      0.72      1300
weighted avg       0.75      0.75      0.75      1300


Decision Tree - Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.68      0.65       451
           1       0.82      0.79      0.81       849

    accuracy                           0.75      1300
   macro