In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skopt import BayesSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('rental prices.csv')  # Replace with your actual dataset path

# Drop unnecessary columns
data = data.drop(columns=['Point of Contact', 'Posted On', 'Tenant Preferred'])

print(data.head())


# Extract floor information
def extract_floor_info(floor_info):
    try:
        parts = floor_info.split(' out of ')
        if len(parts) != 2:
            raise ValueError("Unexpected format")
        if parts[0].lower() == 'ground':
            floor_number = 0
        elif parts[0].lower() == 'upper basement':
            floor_number = -1
        elif parts[0].lower() == 'lower basement':
            floor_number = -2
        else:
            floor_number = int(parts[0])
        total_floors = int(parts[1])
        return floor_number, total_floors
    except Exception as e:
        print(f"Error processing floor info '{floor_info}': {e}")
        return None, None

if 'Floor' in data.columns:
    data['Floor Number'], data['Total Floors'] = zip(*data['Floor'].apply(extract_floor_info))
    data = data.drop(columns=['Floor'])
    data = data.dropna(subset=['Floor Number'])

# Encode categorical features
label_encoder = LabelEncoder()

if 'Area Type' in data.columns:
    data['Area Type Encoded'] = label_encoder.fit_transform(data['Area Type'])
    data = data.drop(columns=['Area Type'])

if 'Area Locality' in data.columns:
    data['Area Locality Encoded'] = label_encoder.fit_transform(data['Area Locality'])
    data = data.drop(columns=['Area Locality'])

if 'Furnishing Status' in data.columns:
    data['Furnishing Status Encoded'] = label_encoder.fit_transform(data['Furnishing Status'])
    data = data.drop(columns=['Furnishing Status'])

if 'City' in data.columns:
    data['City Encoded'] = label_encoder.fit_transform(data['City'])
    data = data.drop(columns=['City'])

# Normalize/scale numerical features
scaler = StandardScaler()
numerical_features = ['Size', 'Bathroom', 'Floor Number', 'Total Floors']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Define features (X) and target (y)
X = data.drop(columns=['Rent'])
y = data['Rent']

# Define cross-validation method
kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scorer = make_scorer(mean_squared_error)

# Function to evaluate model with cross-validation
def evaluate_model(model):
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring=mse_scorer)
    return np.mean(mse_scores)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR(),
    'XGBoost Regressor': XGBRegressor(random_state=42),
    'Neural Network': MLPRegressor(random_state=42, max_iter=3000, hidden_layer_sizes=(150, 100), learning_rate_init=0.01),
    'Gaussian Process Regressor': GaussianProcessRegressor()
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f'{model_name} Mean Squared Error: {mse}')


   BHK   Rent  Size            Floor    Area Type             Area Locality  \
0    2  10000  1100  Ground out of 2   Super Area                    Bandel   
1    2  20000   800       1 out of 3   Super Area  Phool Bagan, Kankurgachi   
2    2  17000  1000       1 out of 3   Super Area   Salt Lake City Sector 2   
3    2  10000   800       1 out of 2   Super Area               Dumdum Park   
4    2   7500   850       1 out of 2  Carpet Area             South Dum Dum   

      City Furnishing Status  Bathroom  
0  Kolkata       Unfurnished         2  
1  Kolkata    Semi-Furnished         1  
2  Kolkata    Semi-Furnished         1  
3  Kolkata       Unfurnished         1  
4  Kolkata       Unfurnished         1  
Error processing floor info '3': Unexpected format
Error processing floor info 'Ground': Unexpected format
Error processing floor info '1': Unexpected format
Error processing floor info '1': Unexpected format
Linear Regression Mean Squared Error: 2195232784.961626
Decision Tree 

Trying an ensemble approach

In [11]:
from sklearn.ensemble import StackingRegressor

# Define base models
base_models = [
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('svr', SVR()),
    ('xgb', XGBRegressor(random_state=42)),
    ('nn', MLPRegressor(random_state=42, max_iter=3000, hidden_layer_sizes=(150, 100), learning_rate_init=0.01))
]

# Define the meta-model
meta_model = LinearRegression()

# Create the stacking regressor
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions and evaluate the model
stacking_predictions = stacking_regressor.predict(X_test)
stacking_mse = mean_squared_error(y_test, stacking_predictions)
print(f'Stacking Regressor Mean Squared Error: {stacking_mse}')

Stacking Regressor Mean Squared Error: 1591184695.1646278


In [12]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Trying a different meta-model, e.g., Ridge Regression
meta_model = Ridge(alpha=1.0)

# Create the stacking regressor with the new meta-model
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions and evaluate the model
stacking_predictions = stacking_regressor.predict(X_test)
stacking_mse = mean_squared_error(y_test, stacking_predictions)
print(f'Stacking Regressor Mean Squared Error with Ridge: {stacking_mse}')

Stacking Regressor Mean Squared Error with Ridge: 1591184695.1622167
