In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import random
# Load dataset
df = pd.read_csv("Dataset .csv", encoding='latin1')
# Step 1: Preprocessing
df['Cuisines'] = df['Cuisines'].fillna(df['Cuisines'].mode()[0])
# Simplify cuisine names (better coverage & normalize)
def simplify_cuisine(cuisine):
    cuisine = cuisine.lower()
    if 'indian' in cuisine:
        return 'Indian'
    elif 'chinese' in cuisine:
        return 'Chinese'
    elif 'italian' in cuisine:
        return 'Italian'
    elif 'american' in cuisine:
        return 'American'
    elif 'continental' in cuisine:
        return 'Continental'
    elif 'mexican' in cuisine:
        return 'Mexican'
    elif 'japanese' in cuisine:
        return 'Japanese'
    elif 'thai' in cuisine:
        return 'Thai'
    elif 'fast food' in cuisine or 'burger' in cuisine or 'pizza' in cuisine:
        return 'Fast Food'
    elif 'south indian' in cuisine:
        return 'South Indian'
    elif 'north indian' in cuisine:
        return 'North Indian'
    else:
        return 'Other'
df['Cuisine_Simplified'] = df['Cuisines'].apply(simplify_cuisine)
# Filter out cuisines with fewer than 50 samples
value_counts = df['Cuisine_Simplified'].value_counts()
df = df[df['Cuisine_Simplified'].isin(value_counts[value_counts > 50].index)]
# Encode labels
le_cuisine = LabelEncoder()
df['Cuisine_Label'] = le_cuisine.fit_transform(df['Cuisine_Simplified'])
# Encode categorical features safely with fillna before encoding
for col in ['City', 'Locality', 'Currency', 'Rating text']:
    df[col] = df[col].fillna('Unknown')

le_city = LabelEncoder()
df['City_Label'] = le_city.fit_transform(df['City'])
le_locality = LabelEncoder()
df['Locality_Label'] = le_locality.fit_transform(df['Locality'])
le_currency = LabelEncoder()
df['Currency_Label'] = le_currency.fit_transform(df['Currency'])
le_rating_text = LabelEncoder()
df['Rating_Text_Label'] = le_rating_text.fit_transform(df['Rating text'])
# Step 2: Feature Selection
features = ['Price range', 'Aggregate rating', 'Votes',
            'City_Label', 'Locality_Label', 'Currency_Label', 'Rating_Text_Label']
X = df[features]
y = df['Cuisine_Label']
# Handle missing numerical data (if any)
X = X.fillna(X.median())
# Step 3: Train/Test Split with stratify to maintain class balance in splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Step 4: Train Random Forest with balanced class weights
model = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)
# Step 5: Predict & Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy: {accuracy:.4f}")
# Pick 5 random cuisine labels from the test set for detailed report
labels_in_test = sorted(np.unique(y_test))
sample_labels = random.sample(list(labels_in_test), min(5, len(labels_in_test)))
sample_target_names = le_cuisine.inverse_transform(sample_labels)
print("\n✅ Classification Report (for 5 randomly selected cuisines):")
print(classification_report(y_test, y_pred, labels=sample_labels,target_names=sample_target_names, zero_division=0))
# Confusion matrix (full matrix for all test classes)
cm = confusion_matrix(y_test, y_pred, labels=labels_in_test)
print("\nConfusion Matrix:\n", cm)


✅ Accuracy: 0.4035

✅ Classification Report (for 5 randomly selected cuisines):
              precision    recall  f1-score   support

       Other       0.33      0.31      0.32       406
        Thai       0.06      0.10      0.08        51
    American       0.24      0.13      0.17        63
      Indian       0.55      0.69      0.61       854
    Japanese       0.17      0.07      0.10        14

   micro avg       0.47      0.52      0.49      1388
   macro avg       0.27      0.26      0.25      1388
weighted avg       0.45      0.52      0.48      1388


Confusion Matrix:
 [[  8   2   0   5  26   2   0   3  16   1]
 [  2  15   0  18 103   2   0   0  30   6]
 [  0   0   0   2   8   1   0   0   7   1]
 [  6  27   0  27 101   0   2   2  53  17]
 [  3  44   1  75 589  13   1   0 100  28]
 [  2   3   0   5  54   2   0   0  16   0]
 [  1   0   0   1   6   0   1   1   4   0]
 [  4   1   0   0   1   0   0   0   5   0]
 [  6  28   0  55 160   9   2   0 124  22]
 [  1   3   0   5  21  