In [1]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
df = pd.read_csv("wine.csv")

In [11]:
df

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [13]:
df.shape

(178, 14)

In [15]:
df.columns

Index(['Alcohol', 'Malic_Acid', 'Ash', 'Ash_Alcanity', 'Magnesium',
       'Total_Phenols', 'Flavanoids', 'Nonflavanoid_Phenols',
       'Proanthocyanins', 'Color_Intensity', 'Hue', 'OD280', 'Proline',
       'Customer_Segment'],
      dtype='object')

In [17]:
X = df.drop("Customer_Segment", axis=1)
y = df["Customer_Segment"]

In [19]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)

In [23]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [25]:
results = []

In [27]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1
    })

In [51]:
df_results = pd.DataFrame(results)
print("\n Base Model Performance:")
df_results


 Base Model Performance:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,1.0,1.0,1.0,1.0
1,Random Forest,1.0,1.0,1.0,1.0
2,SVM,1.0,1.0,1.0,1.0
3,Gradient Boosting,0.944444,0.946296,0.944444,0.943997


In [31]:
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5]
}

In [33]:
grid_rf = GridSearchCV(
    RandomForestClassifier(),
    param_grid_rf,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)

In [53]:
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
print("\n Best Random Forest Params:", grid_rf.best_params_)


 Best Random Forest Params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 50}


In [37]:
param_dist_svm = {
    "C": np.logspace(-2, 2, 5),
    "gamma": ['scale', 0.01, 0.1, 1],
    "kernel": ['linear', 'rbf']
}

In [39]:
random_svm = RandomizedSearchCV(
    SVC(),
    param_dist_svm,
    n_iter=10,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=42
)

In [55]:
random_svm.fit(X_train, y_train)
best_svm = random_svm.best_estimator_
print("\n Best SVM Params:", random_svm.best_params_)


 Best SVM Params: {'kernel': 'linear', 'gamma': 0.1, 'C': 0.01}


In [43]:
tuned_results = []

In [45]:
y_pred_rf = best_rf.predict(X_test)
tuned_results.append({
    "Model": "Random Forest (Tuned)",
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf, average='weighted'),
    "Recall": recall_score(y_test, y_pred_rf, average='weighted'),
    "F1-Score": f1_score(y_test, y_pred_rf, average='weighted')
})

In [47]:
y_pred_svm = best_svm.predict(X_test)
tuned_results.append({
    "Model": "SVM (Tuned)",
    "Accuracy": accuracy_score(y_test, y_pred_svm),
    "Precision": precision_score(y_test, y_pred_svm, average='weighted'),
    "Recall": recall_score(y_test, y_pred_svm, average='weighted'),
    "F1-Score": f1_score(y_test, y_pred_svm, average='weighted')
})

In [57]:
final_results = pd.concat([df_results, pd.DataFrame(tuned_results)], ignore_index=True)
print("\n Final Model Comparison:")
print(final_results.sort_values(by="F1-Score", ascending=False))


 Final Model Comparison:
                   Model  Accuracy  Precision    Recall  F1-Score
0    Logistic Regression  1.000000   1.000000  1.000000  1.000000
1          Random Forest  1.000000   1.000000  1.000000  1.000000
2                    SVM  1.000000   1.000000  1.000000  1.000000
4  Random Forest (Tuned)  1.000000   1.000000  1.000000  1.000000
5            SVM (Tuned)  1.000000   1.000000  1.000000  1.000000
3      Gradient Boosting  0.944444   0.946296  0.944444  0.943997
