In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Load the feature-engineered dataset and normalise the column names
# (trim whitespace, lower-case, spaces -> underscores).
df = pd.read_csv("../data/feature_engineered_data.csv")
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=['aggregate_rating'])

features = [
    'price_range',
    'votes',
    'restaurant_name_length',
    'address_length',
    'cuisine_count',
    'has_table_booking',
    'has_online_delivery',
]

# Force every feature to a numeric dtype; values that cannot be parsed
# become NaN, and infinities are treated as missing as well.
X = df[features].apply(pd.to_numeric, errors='coerce')
X = X.replace([np.inf, -np.inf], np.nan)
y = df['aggregate_rating']

# Split BEFORE imputing so that the fill values are learned from the
# training rows only; computing the medians on the full dataset would leak
# test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep the full feature frame NaN-free as well (using the same train-only
# medians) in case it is used after this point.
X = X.fillna(train_medians)
# Candidate regressors, keyed by the display name used in the results table.
# Tree depth and forest size are capped; the forest uses a reduced number of
# estimators and all available cores (n_jobs=-1).
tree_params = {"max_depth": 8, "random_state": 42}
forest_params = {
    "n_estimators": 30,  # reduced
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(**tree_params),
    "Random Forest": RandomForestRegressor(**forest_params),
}

# Fit every candidate model on the training split and score it on the
# held-out test split, collecting one row of metrics per model.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    results.append({
        "Model": name,
        "MAE": mean_absolute_error(y_test, preds),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "R2 Score": r2_score(y_test, preds),
    })

results_df = pd.DataFrame(
    results,
    columns=["Model", "MAE", "RMSE", "R2 Score"]
)

# Create the output directory if needed so that the export does not fail
# with an OSError after all models have already been trained.
output_path = Path("../results/model_comparison.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)

# Last expression of the cell: displays the comparison table in a notebook.
results_df



Unnamed: 0,Model,MAE,RMSE,R2 Score
0,Linear Regression,1.078238,1.291835,0.266802
1,Decision Tree,0.229419,0.350447,0.946042
2,Random Forest,0.220302,0.337217,0.95004
