# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the datasets
url_country_code = "https://github.com/dsrscientist/dataset4/raw/main/Country-Code.xlsx"
url_zomato = "https://github.com/dsrscientist/dataset4/raw/main/zomato.csv"

country_code_data = pd.read_excel(url_country_code)
zomato_data = pd.read_csv(url_zomato)

# Merge the datasets using Country Code.
# The join is an inner join (pandas' default, spelled out here): rows whose
# 'Country Code' has no match in the lookup table are dropped.
data = pd.merge(zomato_data, country_code_data, on='Country Code', how='inner')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.head())

# Preprocessing
# Drop irrelevant columns first so that no work is spent on columns that are
# discarded anyway.
data = data.drop(
    ['Restaurant ID', 'Restaurant Name', 'Country Code', 'Address',
     'Locality Verbose', 'Rating color', 'Rating text'],
    axis=1,
)

# Fill missing numeric values with the column mean.
# numeric_only=True is required: on pandas >= 2.0, DataFrame.mean() raises
# TypeError when the frame still contains text columns.
data = data.fillna(data.mean(numeric_only=True))

# Convert categorical variables to numerical using Label Encoding.
# Every remaining non-numeric column is encoded rather than a hard-coded
# subset: a single leftover text column (for example a Yes/No flag or the
# country name brought in by the merge, if present) would make the
# regressors fitted later in this script fail with
# "could not convert string to float".
# NOTE: the same encoder object is re-fitted for each column, so after the
# loop it only retains the mapping of the last column processed.
label_encoder = LabelEncoder()
categorical_cols = data.select_dtypes(exclude=['number', 'bool']).columns.tolist()
for col in categorical_cols:
    # Missing text values become an explicit 'Unknown' category so the encoder
    # always receives uniformly-typed string input.
    data[col] = label_encoder.fit_transform(
        data[col].fillna('Unknown').astype(str)
    )

# Build the feature matrix (X) and target vector (y) for each of the two
# prediction tasks. Both target columns are removed from the features in
# either case.
X_avg_cost = data.drop(columns=['Average Cost for two', 'Price range'])
y_avg_cost = data['Average Cost for two']
X_price_range = data.drop(columns=['Average Cost for two', 'Price range'])
y_price_range = data['Price range']

# Hold out 20% of the rows as a test set for each task, using a fixed seed.
(
    X_train_avg_cost,
    X_test_avg_cost,
    y_train_avg_cost,
    y_test_avg_cost,
) = train_test_split(
    X_avg_cost,
    y_avg_cost,
    test_size=0.2,
    random_state=42,
)
(
    X_train_price_range,
    X_test_price_range,
    y_train_price_range,
    y_test_price_range,
) = train_test_split(
    X_price_range,
    y_price_range,
    test_size=0.2,
    random_state=42,
)

# Fit one RandomForestRegressor per target and report the mean squared error
# measured on the corresponding held-out test set.

# Task 1: 'Average Cost for two'
reg_avg_cost_model = RandomForestRegressor(random_state=42).fit(
    X_train_avg_cost, y_train_avg_cost
)
y_pred_avg_cost = reg_avg_cost_model.predict(X_test_avg_cost)
mse_avg_cost = mean_squared_error(
    y_true=y_test_avg_cost, y_pred=y_pred_avg_cost
)
print("Average Cost for two Mean Squared Error:", mse_avg_cost)

# Task 2: 'Price range'
reg_price_range_model = RandomForestRegressor(random_state=42).fit(
    X_train_price_range, y_train_price_range
)
y_pred_price_range = reg_price_range_model.predict(X_test_price_range)
mse_price_range = mean_squared_error(
    y_true=y_test_price_range, y_pred=y_pred_price_range
)
print("Price Range Mean Squared Error:", mse_price_range)
