In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Seed NumPy's global random generator once, through a named constant,
# so that NumPy-driven randomness is repeatable between runs.
SEED = 42
np.random.seed(SEED)

Observing Data

In [None]:
file_path = 'winequality-red-4.csv'

# Step 1: Read the header line on its own so the column names can be
# cleaned by hand before pandas parses the remaining rows.
with open(file_path, 'r', encoding='utf-8') as f:
    header_line = f.readline().strip()

print("Original Header Line:")
print(header_line)

# Step 2: Split the header by semicolon
raw_columns = header_line.split(';')

# Step 3: Clean each column name: collapse doubled quotes, then drop the
# quotes that wrap the name.
clean_columns = [col.replace('""', '"').strip('"') for col in raw_columns]

print("\nCleaned Column Names:")
print(clean_columns)

# Step 4: Load the data using the cleaned column names.
# The header row is skipped (skiprows=1) because the names are supplied
# explicitly. Malformed rows are still dropped, but on_bad_lines='warn'
# reports each one instead of discarding it silently, so any data loss is
# visible in the cell output.
data = pd.read_csv(
    file_path,
    sep=';',
    header=None,
    names=clean_columns,
    skiprows=1,
    engine='python',
    on_bad_lines='warn'
)

In [None]:
# Initial Data Inspection
print("\nData Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() emits a stray "None".
data.info()
print("\nChecking for missing values:")
print(data.isnull().sum())


In [None]:
# Preview the first rows of the loaded frame (rendered as the cell output).
data.head()


In [None]:
# Summary statistics for the columns of `data` (rendered as the cell output).
data.describe()


In [None]:
# Show the column labels of `data` (rendered as the cell output).
data.columns

In [None]:
# Strip any double quotes still wrapping the column labels. The bare
# `data.columns` expression that used to follow was removed: an expression
# in the middle of a cell is never displayed, so it had no effect.
data.columns = [col.strip('"') for col in data.columns]

# Check if 'quality' column exists
if 'quality' in data.columns:
    print("\n'quality' column is present.")
else:
    print("\n'quality' column is missing. Available columns:")
    print(data.columns)



Exploratory Data Analysis

In [None]:
# Count plot of how many rows carry each value of the 'quality' column.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='quality', palette='viridis', ax=ax)
ax.set_title('Distribution of Wine Quality Scores')
ax.set_xlabel('Quality Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# One histogram per column of `data`; DataFrame.hist builds its own figure,
# which is then fetched as the current figure to add the title.
hist_options = dict(bins=20, figsize=(15, 12), color='skyblue', edgecolor='black')
data.hist(**hist_options)
hist_fig = plt.gcf()
hist_fig.suptitle('Histograms of Wine Features', fontsize=16)
hist_fig.tight_layout()
plt.show()

In [None]:
# Scatter-matrix of every column pair, coloured by 'quality', with KDE
# curves on the diagonal. The title sits slightly above the grid (y=1.02).
pair_grid = sns.pairplot(data, diag_kind='kde', hue='quality', palette='viridis')
plt.gcf().suptitle('Pairwise Relationships Between Features', y=1.02)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
fig, ax = plt.subplots(figsize=(12, 10))
correlation = data.corr()
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Wine Features')
plt.show()

In [None]:
# One boxplot per column of `data`, all drawn on a single shared axis.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=data, palette='viridis', ax=ax)
ax.set_title('Boxplots of Wine Features')
# Tilt the column names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


Feature Engineering

Convert `quality` to a binary label that distinguishes high-quality wine from the rest: a quality of 7 or above is good (1), while a quality below 7 is average or bad (0).

In [None]:
# Binary target: 1 when quality is 7 or higher, otherwise 0.
data['quality_label'] = data['quality'].ge(7).astype(int)

# Count plot of the two label values to check how they are distributed.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=data, x='quality_label', palette='viridis', ax=ax)
ax.set_title('Distribution of Binary Quality Labels')
ax.set_xlabel('Quality Label (0 = Not Good, 1 = Good)')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Target is the binary label; features are every column except the raw
# score and the label derived from it.
y = data['quality_label']
X = data.drop(columns=['quality', 'quality_label'])

# Standardise the features and keep the original column names by wrapping
# the resulting array back into a DataFrame.
# Note: the scaler is fit on every row of X (nothing has been split yet).
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

print("\nFirst 5 Rows of Scaled Features:")
print(X_scaled.head())

In [None]:
# Tabulate how many rows fall in each class of y.
class_counts = y.value_counts()
print("\nClass Distribution:")
print(class_counts)

# The same distribution as a count plot.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, palette='viridis', ax=ax)
ax.set_title('Class Distribution of Quality Labels')
ax.set_xlabel('Quality Label (0 = Not Good, 1 = Good)')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Split the data into training and testing sets.
# The raw (unscaled) features are split first and the scaler is then fit on
# the training rows only; fitting it on all rows before splitting lets the
# test rows influence the scaling statistics (data leakage).
# Stratify to maintain class distribution.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# A dedicated scaler is used so the split can be re-run without disturbing
# any other scaler object already defined in the notebook.
split_scaler = StandardScaler().fit(X_train_raw)

# Keep DataFrames (column names and row index preserved) so the feature
# matrices stay aligned with y_train / y_test.
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

print("\nTraining and Testing Set Sizes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


In [None]:
# Build the three classifiers first, each with a fixed random_state.
# --- Support Vector Machine (SVM) ---
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)

# --- Artificial Neural Network (ANN) ---
mlp_model = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42,
)

# --- Random Forest ---
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)

# Then fit them on the training split, in the same order as above.
svm_model.fit(X_train, y_train)
mlp_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)


In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Calls ``model.predict(X_test)`` and compares the predictions with
    ``y_test``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
        and F1 are computed with ``zero_division=0``.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f1 = (
        scorer(y_test, predictions, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

# Score each fitted model on the test split.
svm_metrics = evaluate_model(svm_model, X_test, y_test)
mlp_metrics = evaluate_model(mlp_model, X_test, y_test)
rf_metrics = evaluate_model(rf_model, X_test, y_test)

# Assemble one row per model; the metric columns follow the order of the
# tuple returned by evaluate_model.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
labelled_metrics = [
    ('SVM', svm_metrics),
    ('ANN', mlp_metrics),
    ('Random Forest', rf_metrics),
]
comparison_df = pd.DataFrame(
    [
        {'Model': label, **dict(zip(metric_columns, scores))}
        for label, scores in labelled_metrics
    ],
    columns=['Model'] + metric_columns,
)

print("\nModel Performance Comparison:")
print(comparison_df)

In [None]:
# Visualize the performance comparison as grouped bars: one group per model,
# one bar per metric. Drawing each metric as its own bar series on the same
# x positions stacks the bars on top of each other, so only the tallest one
# per model can be read; grouping keeps every metric visible.
fig, ax = plt.subplots(figsize=(10, 6))
comparison_df.set_index('Model').plot(kind='bar', ax=ax, colormap='viridis', rot=0)
ax.set_title('Model Performance Comparison')
ax.set_ylabel('Score')
ax.set_xlabel('Model')
ax.set_ylim(0, 1)
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()


In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(model, X_test, y_test, model_name):
    """Show the confusion matrix of ``model`` on the given test data.

    Predictions come from ``model.predict(X_test)``; the two classes are
    labelled 'Not Good' and 'Good', and ``model_name`` appears in the title.
    """
    predictions = model.predict(X_test)
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, predictions),
        display_labels=['Not Good', 'Good'],
    )
    matrix_display.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()

# Confusion matrices for SVM, ANN and Random Forest, in that order.
for fitted_model, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    plot_confusion_matrix(fitted_model, X_test, y_test, display_name)

In [None]:
from sklearn.metrics import roc_curve, auc

# Function to plot ROC curve
def plot_roc_curve(model, X_test, y_test, model_name):
    """Plot the ROC curve of ``model`` on the test data, with its AUC.

    Scores are the second column of ``predict_proba`` when the model exposes
    that method; otherwise the output of ``decision_function`` is used.
    """
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([-0.01, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic - {model_name}')
    ax.legend(loc="lower right")
    plt.show()

# ROC curves for SVM, ANN and Random Forest, in that order.
for fitted_model, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    plot_roc_curve(fitted_model, X_test, y_test, display_name)


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Function to plot Precision-Recall curve
def plot_precision_recall(model, X_test, y_test, model_name):
    """Plot the precision-recall curve of ``model`` with its average precision.

    Scores are the second column of ``predict_proba`` when the model exposes
    that method; otherwise the output of ``decision_function`` is used.
    """
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    precision, recall, _ = precision_recall_curve(y_test, scores)
    avg_precision = average_precision_score(y_test, scores)

    fig, ax = plt.subplots()
    ax.plot(recall, precision, color='blue', lw=2, label=f'AP = {avg_precision:.2f}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve - {model_name}')
    ax.legend(loc="upper right")
    plt.show()

# Precision-recall curves for SVM, ANN and Random Forest, in that order.
for fitted_model, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    plot_precision_recall(fitted_model, X_test, y_test, display_name)


In [None]:
# Pull the per-feature importances out of the fitted Random Forest and
# order them from most to least important (ascending argsort, reversed).
feature_names = X.columns
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Horizontal bar chart, most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices], palette='viridis', ax=ax)
ax.set_title('Feature Importances from Random Forest')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:
from sklearn.model_selection import learning_curve

# Function to plot learning curves
def plot_learning_curve(model, X, y, model_name):
    """Plot mean training and cross-validation accuracy against training size.

    Uses ``learning_curve`` with 5-fold cross-validation, accuracy scoring,
    all available cores (``n_jobs=-1``) and five training-set fractions
    evenly spaced between 0.1 and 1.0. Scores are averaged over the folds.
    """
    sizes, fold_train_scores, fold_cv_scores = learning_curve(
        model,
        X,
        y,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        train_sizes=np.linspace(.1, 1.0, 5),
    )

    # Average across folds (axis 1) to get one value per training size.
    mean_train = np.mean(fold_train_scores, axis=1)
    mean_cv = np.mean(fold_cv_scores, axis=1)

    fig, ax = plt.subplots()
    ax.plot(sizes, mean_train, 'o-', color='blue', label='Training score')
    ax.plot(sizes, mean_cv, 'o-', color='green', label='Cross-validation score')
    ax.set_title(f'Learning Curve - {model_name}')
    ax.set_xlabel('Training Size')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='best')
    ax.grid()
    plt.show()

# Learning curves for SVM, ANN and Random Forest, in that order.
for estimator, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    plot_learning_curve(estimator, X_scaled, y, display_name)


In [None]:
# Reshape to long form (one row per model/metric pair) so the metrics can
# be drawn as grouped bars.
comparison_melted = comparison_df.melt(
    id_vars='Model', var_name='Metric', value_name='Score'
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=comparison_melted, x='Model', y='Score', hue='Metric',
            palette='viridis', ax=ax)
ax.set_title('Model Performance Metrics Comparison')
ax.set_ylim(0, 1)
ax.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score

# Function to perform cross-validation and print scores
def cross_validate_model(model, X, y, model_name):
    """Run 5-fold cross-validation on accuracy and print the results.

    Prints the per-fold scores, then their mean and standard deviation
    rounded to two decimals. Nothing is returned.
    """
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    spread = scores.std()
    print(f"\nCross-Validation Accuracy Scores for {model_name}: {scores}")
    print(f"Mean Accuracy: {mean_accuracy:.2f} (+/- {spread:.2f})")

# Cross-validate SVM, ANN and Random Forest, in that order.
for estimator, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    cross_validate_model(estimator, X_scaled, y, display_name)


In [None]:

# Function to print classification report
def print_classification_report(model, X_test, y_test, model_name):
    """Print the classification report of ``model`` on the test data.

    Predictions come from ``model.predict(X_test)``; the two classes are
    named 'Not Good' and 'Good' in the report. Nothing is returned.
    """
    predictions = model.predict(X_test)
    class_names = ['Not Good', 'Good']
    report = classification_report(y_test, predictions, target_names=class_names)
    print(f"\nClassification Report for {model_name}:\n{report}")

# Classification reports for SVM, ANN and Random Forest, in that order.
for fitted_model, display_name in (
    (svm_model, 'Support Vector Machine'),
    (mlp_model, 'Artificial Neural Network'),
    (rf_model, 'Random Forest'),
):
    print_classification_report(fitted_model, X_test, y_test, display_name)
