In [2]:
!pip install ucimlrepo
!pip install xgboost



In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo
from xgboost import XGBClassifier

In [4]:
# Download dataset id=186 from the UCI ML repository
wine_quality = fetch_ucirepo(id=186)

# Unpack the feature and target tables (pandas DataFrames)
X, y = wine_quality.data.features, wine_quality.data.targets

# Report the dimensions of both tables, one per line
print(X.shape, y.shape, sep="\n")

(6497, 11)
(6497, 1)


In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, so nothing about
# the test rows leaks into the preprocessing step
scaler = StandardScaler().fit(X_train)

# Apply that single fitted transformation to both partitions
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: row counts must line up between features and targets
print(f"Shape of X_train_scaled: {X_train_scaled.shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train_scaled: (5197, 11)
Shape of X_test_scaled: (1300, 11)
Shape of y_train: (5197, 1)
Shape of y_test: (1300, 1)


In [6]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Trains a model and evaluates it using Accuracy, AUC, Precision, Recall, F1-Score, and MCC.

    The model is fitted in place on (X_train, y_train) and then scored on
    (X_test, y_test). Precision, Recall and F1-Score are support-weighted
    averages over the classes. AUC is the plain binary ROC AUC when the model
    outputs two probability columns, and one-vs-rest with weighted averaging
    otherwise.

    Parameters
    ----------
    model : classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, X_test : feature matrices passed straight to the model.
    y_train, y_test : targets; both are flattened with ``np.ravel`` so a
        single-column frame and a 1-D array are handled the same way.

    Returns
    -------
    pd.Series
        One value per metric, indexed by 'Accuracy', 'AUC Score', 'Precision',
        'Recall', 'F1-Score' and 'MCC'.

    Raises
    ------
    AttributeError
        If the model has no ``predict_proba`` method.
    """
    # Flatten both target sets up front so they are treated consistently
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    # 1. Fit the model
    model.fit(X_train, y_train)

    # 2. Generate predictions and class probabilities
    y_pred = model.predict(X_test)
    y_proba = np.asarray(model.predict_proba(X_test))

    # 3. Compute the AUC
    if y_proba.ndim == 2 and y_proba.shape[1] == 2:
        # Binary problem: roc_auc_score expects the 1-D score of the class
        # with the greater label, i.e. the second probability column.
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        # Multiclass problem: pin the column -> label mapping to the classes
        # the model was trained on. Without `labels`, scikit-learn infers the
        # label set from y_test and rejects the call when a rare class is
        # missing from the test split.
        # NOTE(review): how a class with no test samples is then scored
        # depends on the installed scikit-learn version -- confirm.
        auc = roc_auc_score(
            y_test,
            y_proba,
            multi_class='ovr',
            average='weighted',
            labels=getattr(model, 'classes_', None),
        )

    # 4. Collect all metrics
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'AUC Score': auc,
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

    return pd.Series(metrics)

In [7]:
# Encode the target labels; the encoder is fitted on the training labels only
# and then reused unchanged for the test labels
le = LabelEncoder().fit(np.ravel(y_train))
y_train_encoded = le.transform(np.ravel(y_train))
y_test_encoded = le.transform(np.ravel(y_test))

# Candidate classifiers, keyed by the display name used in the results table
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='mlogloss')
}

# Fit and score each candidate on the scaled features and encoded targets,
# collecting one Series of metrics per model
results = {}
for name, model in models.items():
    print(f"Processing {name}...")
    results[name] = evaluate_model(
        model,
        X_train_scaled,
        y_train_encoded,
        X_test_scaled,
        y_test_encoded,
    )

# One row per model, one column per metric; show the best MCC first
model_comparison = pd.DataFrame(results).transpose()
print("\nUpdated Model Comparison Results:")
display(model_comparison.sort_values('MCC', ascending=False))

Processing Logistic Regression...
Processing Decision Tree...
Processing K-Nearest Neighbors...
Processing Naive Bayes...
Processing Random Forest...
Processing XGBoost...

Updated Model Comparison Results:


Unnamed: 0,Accuracy,AUC Score,Precision,Recall,F1-Score,MCC
Random Forest,0.669231,0.850631,0.673772,0.669231,0.65738,0.48248
XGBoost,0.655385,0.82592,0.646191,0.655385,0.646375,0.466446
Decision Tree,0.594615,0.699867,0.598428,0.594615,0.594627,0.399938
K-Nearest Neighbors,0.545385,0.733073,0.532721,0.545385,0.534968,0.299032
Logistic Regression,0.536154,0.711083,0.500927,0.536154,0.50193,0.250956
Naive Bayes,0.465385,0.670613,0.488536,0.465385,0.460981,0.237242


In [8]:
# Persist every model under a file name derived from its display name:
# lower-cased, with spaces replaced by underscores (hyphens are kept as-is)
for name, model in models.items():
    filename = name.lower().replace(" ", "_") + ".joblib"
    joblib.dump(model, filename)
    print(f"Model saved: {filename}")

# Persist the fitted preprocessing objects alongside the models so the same
# scaling and label encoding can be re-applied when the models are reloaded
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(le, 'label_encoder.joblib')

# Confirmation message; the model count is taken from the dictionary so the
# message stays correct if models are added or removed
print(f"\nAll {len(models)} models and 2 preprocessing objects have been successfully exported to the current directory.")

Model saved: logistic_regression.joblib
Model saved: decision_tree.joblib
Model saved: k-nearest_neighbors.joblib
Model saved: naive_bayes.joblib
Model saved: random_forest.joblib
Model saved: xgboost.joblib

All 6 models and 2 preprocessing objects have been successfully exported to the current directory.


In [9]:
# Single source of truth for the output file, so the write, the read-back
# check and the messages below can never refer to different files
test_data_path = 'test_data.csv'

# 1. Put the unscaled test features and their targets side by side
test_df = pd.concat([X_test, y_test], axis=1)

# 2. Export to CSV without the index
test_df.to_csv(test_data_path, index=False)

# 3. Verification: re-read the file from disk and display its first rows
print(f"Sample of {test_data_path}:")
display(pd.read_csv(test_data_path).head())
print(f"\nFile '{test_data_path}' created successfully with shape: {test_df.shape}")

Sample of test_data.csv:


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.17,0.74,12.8,0.045,24.0,126.0,0.9942,3.26,0.38,12.2,8
1,7.7,0.64,0.21,2.2,0.077,32.0,133.0,0.9956,3.27,0.45,9.9,5
2,6.8,0.39,0.34,7.4,0.02,38.0,133.0,0.99212,3.18,0.44,12.0,7
3,6.3,0.28,0.47,11.2,0.04,61.0,183.0,0.99592,3.12,0.51,9.5,6
4,7.4,0.35,0.2,13.9,0.054,63.0,229.0,0.99888,3.11,0.5,8.9,6



File 'test_data.csv' created successfully with shape: (1300, 12)
