# In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR

warnings.filterwarnings('ignore')

# Load the dataset
# NOTE(review): the file name is spelled 'ZomataoData.csv'; confirm it matches
# the file on disk before renaming anything.
df = pd.read_csv('ZomataoData.csv')

# Data Exploration: size, a sample of rows, dtypes/null counts, summary stats.
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nDescriptive statistics:")
print(df.describe())

# Data Cleaning
# Handle missing values. The filled column is assigned back rather than using
# fillna(inplace=True) on a df[...] selection: that form is chained assignment,
# which pandas warns about and which does not modify df under Copy-on-Write.
df['Cuisines'] = df['Cuisines'].fillna('Unknown')
df['Rating'] = df['Rating'].fillna(df['Rating'].median())

# Feature Engineering
# Number of comma-separated entries in the Cuisines string.
df['Num_Cuisines'] = df['Cuisines'].astype(str).str.count(',') + 1

# Convert Yes/No columns to 1/0. Any value other than 'Yes' or 'No' becomes NaN.
bool_cols = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
yes_no_to_int = {'Yes': 1, 'No': 0}
for col in bool_cols:
    df[col] = df[col].map(yes_no_to_int)

# Extract city tier (assuming cities with more restaurants are bigger).
# Tiers are computed per city, on the rank of each city's restaurant count,
# and then mapped back onto the rows. Cutting the row-level counts directly
# makes pd.qcut raise "Bin edges must be unique" whenever a single city holds
# more than a third of the rows, because several quantile edges then coincide.
# rank(method='first') breaks ties so the three bin edges are always distinct.
city_counts = df['City'].value_counts()
city_tiers = pd.qcut(city_counts.rank(method='first'), 3,
                     labels=['Small', 'Medium', 'Large'])
df['City_Tier'] = df['City'].map(city_tiers)

# Binary currency flag: 1 when the currency name contains 'Pula', 'Real' or
# 'Dollar' (case-sensitive), otherwise 0.
# NOTE(review): described upstream as "local vs foreign"; confirm that this
# substring list is the intended definition.
df['Local_Currency'] = (
    df['Currency'].astype(str).str.contains('Pula|Real|Dollar').astype(int)
)

# Data Visualization
# One 2x2 figure: the rating histogram in the first cell, followed by box
# plots of Rating grouped by three restaurant attributes.
plt.figure(figsize=(15, 10))

# Rating distribution
plt.subplot(2, 2, 1)
sns.histplot(df['Rating'], bins=20, kde=True)
plt.title('Rating Distribution')

# (subplot position, grouping column, panel title) for each box plot
box_panels = [
    (2, 'Price range', 'Price Range vs Rating'),
    (3, 'Has Online delivery', 'Online Delivery vs Rating'),
    (4, 'Has Table booking', 'Table Booking vs Rating'),
]
for panel_position, group_column, panel_title in box_panels:
    plt.subplot(2, 2, panel_position)
    sns.boxplot(x=group_column, y='Rating', data=df)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Correlation analysis
# Pairwise correlations between the numeric columns and the target, drawn as
# an annotated heatmap with the colour scale centred on zero.
numeric_cols = ['Average Cost for two', 'Price range', 'Votes', 'Num_Cuisines', 'Rating']
corr_matrix = df.loc[:, numeric_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Feature Selection and Preprocessing
# Select features for modeling
features = ['City', 'Cuisines', 'Average Cost for two', 'Price range', 'Votes',
            'Has Table booking', 'Has Online delivery', 'Num_Cuisines', 'City_Tier',
            'Local_Currency']
target = 'Rating'

X = df[features]
y = df[target]

# Split data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline
# Every column in `features` must be routed to a transformer: ColumnTransformer
# defaults to remainder='drop', so unlisted columns are silently discarded.
# The 0/1 indicator columns ('Has Table booking', 'Has Online delivery',
# 'Local_Currency') are therefore handled with the numeric columns; median
# imputation also covers any NaN left in them by earlier cleaning.
numeric_features = ['Average Cost for two', 'Price range', 'Votes', 'Num_Cuisines',
                    'Has Table booking', 'Has Online delivery', 'Local_Currency']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with a 'missing' category, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
categorical_features = ['City', 'Cuisines', 'City_Tier']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature selection and modeling pipeline:
# preprocess -> keep the 20 best columns by univariate F-test -> random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', RandomForestRegressor(random_state=42))])

# Model Training
pipeline.fit(X_train, y_train)

# Model Evaluation on the held-out test split
y_pred = pipeline.predict(X_test)

print("\nModel Performance:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

# Feature Importance
# Rebuild the column names produced by the preprocessor. The pipeline's own
# preprocessor step was fitted by pipeline.fit() above, so it is read directly
# rather than fitted a second time. Names follow the order in which the
# transformers are declared: numeric columns first, then the one-hot columns.
fitted_preprocessor = pipeline.named_steps['preprocessor']
onehot_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
feature_names = numeric_features + list(
    onehot_encoder.get_feature_names_out(categorical_features))

# Boolean mask of the columns kept by SelectKBest, applied to the names
selected_features = pipeline.named_steps['feature_selection'].get_support()
selected_feature_names = [f for f, s in zip(feature_names, selected_features) if s]

# Importances from the fitted regressor; one value per selected column
importances = pipeline.named_steps['regressor'].feature_importances_

# Create a DataFrame for visualization, most important feature first
feature_importance_df = pd.DataFrame({
    'Feature': selected_feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
plt.title('Top 20 Important Features')
plt.tight_layout()
plt.show()

# Compare different models
# Each candidate regressor is evaluated with the same preprocessing and
# feature-selection steps and scored on the held-out test split.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    'SVR': SVR()
}

results = {}
for name, model in models.items():
    # Fit a clone so the shared `pipeline` is not mutated. Swapping the
    # regressor in place would leave the last model (SVR) installed, and
    # later code that tunes RandomForest parameters on `pipeline` would fail.
    candidate = clone(pipeline).set_params(regressor=model)
    candidate.fit(X_train, y_train)
    y_pred = candidate.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results[name] = {'RMSE': rmse, 'R2': r2}

# One row per model, columns RMSE and R2
results_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(results_df)

# Hyperparameter Tuning for the best model (Random Forest)
# Pin the regressor explicitly. The grid below contains only
# RandomForestRegressor parameters, and GridSearchCV raises on parameter names
# the estimator does not have, so the search must not depend on whichever
# regressor was last installed on `pipeline` via set_params.
pipeline.set_params(regressor=RandomForestRegressor(random_state=42))

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

# 5-fold CV over the full grid on the training split only; the score is
# negated MSE because scikit-learn maximises scores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Final Evaluation of the tuned pipeline on the held-out test split
y_pred_final = best_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_final))
final_r2 = r2_score(y_test, y_pred_final)

print("\nFinal Model Performance:")
print("RMSE:", final_rmse)
print("R2 Score:", final_r2)

# Example prediction
# A single hand-written restaurant record, using the same column names as the
# training features, is wrapped in a one-row frame and scored by the tuned model.
example_record = {
    'City': 'Makati City',
    'Cuisines': 'Japanese, Sushi',
    'Average Cost for two': 1500,
    'Price range': 4,
    'Votes': 365,
    'Has Table booking': 0,
    'Has Online delivery': 0,
    'Num_Cuisines': 2,
    'City_Tier': 'Large',
    'Local_Currency': 1,
}
example_data = pd.DataFrame([example_record])

predicted_rating = best_model.predict(example_data)
print("\nExample Prediction:")
print(f"Predicted Rating: {predicted_rating[0]:.2f}")
