### Import necessary libraries

This code sets up a machine learning workflow for multi-label classification by importing tools for data preprocessing, feature transformation, model building (Logistic Regression, Random Forest with One-vs-Rest), evaluation, and label encoding.

In [1]:
import os
from collections import Counter

import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

### Define data path

This line defines the file path to the dataset (Dataset.csv) stored on the E: drive for use in the ML project.

In [2]:
# Location of the dataset CSV. Set the DATA_PATH environment variable to point
# the notebook at a copy stored elsewhere; when it is unset, the original
# local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get("DATA_PATH", r"E:\Backup\Mahima\ML Internship 2025\Dataset.csv")

### Load and preprocess data

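This code loads the dataset into a pandas DataFrame and standardizes the column names: each name is stripped of leading and trailing whitespace, converted to lowercase, and has its remaining spaces replaced with underscores (for example, a column named ` Some Name ` becomes `some_name`).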

In [3]:
# Read the raw CSV and normalise every header in one chain: trim surrounding
# whitespace, lowercase, then turn the remaining spaces into underscores.
df = pd.read_csv(DATA_PATH).rename(
    columns=lambda name: name.strip().lower().replace(" ", "_")
)

### Prepare target variable and features

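This code splits each entry of the `cuisines` column into a list of cuisine names, uses `Counter` to find the 10 most frequent cuisines, and keeps only those labels for each row. `MultiLabelBinarizer` then converts the filtered lists into a multi-label binary matrix `Y` (rows that contain none of the top 10 cuisines get an all-zero label vector). Finally, `X` is built by dropping `cuisines`, and the remaining feature columns are separated into categorical and numerical lists.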

In [4]:
# Parse the comma-separated "cuisines" string of every row into a list of
# trimmed, non-empty cuisine names; missing values become an empty list.
y_list = df["cuisines"].fillna("").apply(
    lambda s: [t.strip() for t in str(s).split(",") if t.strip()]
)

# Count every cuisine mention across all rows and keep the 10 most common
# cuisines as the label set (order = descending frequency).
ct = Counter(cuisine for cuisines in y_list for cuisine in cuisines)
top = [cuisine for cuisine, _count in ct.most_common(10)]

# Drop labels outside the top 10. A set gives constant-time membership checks.
# Rows left with no label are kept and end up with an all-zero target row.
top_set = set(top)
y_list_top = y_list.apply(lambda items: [i for i in items if i in top_set])

# Binarise the label lists; passing `classes` fixes the column order of Y to `top`.
mlb = MultiLabelBinarizer(classes=top)
Y = mlb.fit_transform(y_list_top)
X = df.drop(columns=["cuisines"])

# Split feature columns by dtype. select_dtypes is used instead of comparing
# against `object` so that text columns stored with pandas' dedicated string
# dtype are still treated as categorical rather than sent to the numeric
# (median-impute + scale) branch. Booleans stay on the numeric side, as before.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()

### Create preprocessing pipelines for numerical and categorical features

This code defines a ColumnTransformer that preprocesses numerical features by imputing missing values with the median and scaling them, and categorical features by imputing with the most frequent value and applying one-hot encoding.

In [5]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list to its branch.
preprocess = ColumnTransformer([
    ("num", numeric_steps, num_cols),
    ("cat", categorical_steps, cat_cols),
])

### Split data into training and testing sets

This code splits the dataset into training and testing sets, using 80% for training and 20% for testing with a fixed random seed for reproducibility.

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Train and evaluate Logistic Regression model

This code builds a pipeline that preprocesses the data and trains a One-vs-Rest Logistic Regression model, then prints a classification report comparing predictions against the test set.

In [10]:
# One-vs-Rest logistic regression: one binary classifier per cuisine label.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting any other pipeline built from `preprocess` cannot re-fit the
# transformer used here (Pipeline does not copy its steps on fit).
ovr_lr = Pipeline([
    ("prep", clone(preprocess)),
    ("clf", OneVsRestClassifier(LogisticRegression(max_iter=300))),
])
ovr_lr.fit(X_train, Y_train)

# zero_division=0 scores undefined precision/recall/F1 as 0 -- the same value
# the default reports -- without emitting UndefinedMetricWarning.
report_lr = classification_report(
    Y_test,
    ovr_lr.predict(X_test),
    target_names=mlb.classes_,
    zero_division=0,
)
# sep="" stops print from inserting a space in front of the report's header row.
print("Logistic Regression:\n", report_lr, sep="")

Logistic Regression:
               precision    recall  f1-score   support

North Indian       0.68      0.59      0.63       805
     Chinese       0.63      0.28      0.39       555
   Fast Food       0.87      0.25      0.39       401
     Mughlai       1.00      0.09      0.17       185
     Italian       0.71      0.16      0.26       159
      Bakery       1.00      0.03      0.06       138
 Continental       0.57      0.28      0.38       137
        Cafe       0.88      0.27      0.41       141
    Desserts       1.00      0.09      0.16       114
South Indian       1.00      0.08      0.15       149

   micro avg       0.70      0.31      0.43      2784
   macro avg       0.84      0.21      0.30      2784
weighted avg       0.77      0.31      0.40      2784
 samples avg       0.33      0.26      0.28      2784



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Train and evaluate RandomForest model

This code builds a pipeline that preprocesses the data and trains a One-vs-Rest Random Forest classifier with 200 trees, then prints the classification report on the test set.

In [8]:
# One-vs-Rest random forest: one 200-tree forest per cuisine label, seeded for
# repeatable results. clone() gives this pipeline its own unfitted copy of the
# preprocessor, so fitting it does not re-fit the `preprocess` instance that
# other pipelines hold (Pipeline does not copy its steps on fit).
ovr_rf = Pipeline([
    ("prep", clone(preprocess)),
    ("clf", OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))),
])
ovr_rf.fit(X_train, Y_train)

# zero_division=0 scores undefined precision/recall/F1 as 0 -- the same value
# the default reports -- without emitting UndefinedMetricWarning.
report_rf = classification_report(
    Y_test,
    ovr_rf.predict(X_test),
    target_names=mlb.classes_,
    zero_division=0,
)
# sep="" stops print from inserting a space in front of the report's header row.
print("RandomForest:\n", report_rf, sep="")

RandomForest:
               precision    recall  f1-score   support

North Indian       0.70      0.55      0.62       805
     Chinese       0.66      0.15      0.24       555
   Fast Food       0.92      0.18      0.30       401
     Mughlai       1.00      0.03      0.05       185
     Italian       0.75      0.04      0.07       159
      Bakery       1.00      0.01      0.01       138
 Continental       0.67      0.04      0.08       137
        Cafe       0.95      0.26      0.40       141
    Desserts       1.00      0.09      0.16       114
South Indian       0.88      0.10      0.18       149

   micro avg       0.73      0.24      0.36      2784
   macro avg       0.85      0.14      0.21      2784
weighted avg       0.80      0.24      0.32      2784
 samples avg       0.29      0.20      0.23      2784



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
