In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import joblib

# Load the dataset
filename = 'C:\\Users\\welun\\ITD 105\\Regression\\House_Rent_Dataset.csv'
dataframe = pd.read_csv(filename)

# Dropping less relevant columns
dataframe.drop('Posted On', axis=1, inplace=True)

# Selecting only the required columns
selected_columns = ['BHK', 'Size', 'Floor', 'Area Type', 'Area Locality' , 'City' ,'Furnishing Status', 'Rent', 'Bathroom'] #'Tenant Preferred'
dataframe = dataframe[selected_columns]

# Handling the 'Floor' feature more effectively
# Extracting numerical floor info and handling ground floor
dataframe['Floor'] = dataframe['Floor'].str.extract('(\d+)').fillna(0).astype(int)

# Specifying categorical and numerical columns
categorical_cols = ['Furnishing Status', 'Area Type', 'Area Locality' , 'City' , ] #'Tenant Preferred'
numerical_cols = ['BHK', 'Size', 'Floor', 'Bathroom']

# Creating a Column Transformer for OneHotEncoding categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Splitting the dataset
X = dataframe.drop('Rent', axis=1)
y = dataframe['Rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a pipeline with preprocessing and the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Hyperparameter tuning setup
param_grid = {
    'regressor__n_estimators': [100, 200],  # Example: Trying two different values
    # Add other parameters here if needed
}

# Grid Search with Cross-Validation
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Save the trained model
model_filename = 'C:\\Users\\welun\\ITD 105\\Regression\\rent_prediction_model.pkl'
joblib.dump(best_model, model_filename)

# Output model performance metrics and the path to the saved model
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)
print("Model saved as:", model_filename)


Mean Squared Error: 2199935423.351155
Root Mean Squared Error: 46903.46920379296
R-squared: 0.44799876140545625
Mean Absolute Error: 17025.16758712303
Model saved as: C:\Users\welun\ITD 105\Regression\rent_prediction_model.pkl
