In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint

# Column groups used consistently for training, evaluation and inference.
CATEGORICAL_COLS = ['type', 'state']
NUMERIC_COLS = ['baths', 'beds', 'sqfeet']


def prepare_features(frame):
    """Return the model-ready feature matrix for ``frame``.

    Applies the already-fitted module-level ``ohe`` and ``scaler`` to the
    categorical and numeric columns of ``frame`` and joins the results
    column-wise. The index of ``frame`` is preserved so the rows stay aligned
    with their target values.

    Args:
        frame: DataFrame containing ``CATEGORICAL_COLS`` and ``NUMERIC_COLS``.

    Returns:
        DataFrame with the one-hot columns followed by the scaled numeric
        columns.
    """
    encoded = pd.DataFrame(
        ohe.transform(frame[CATEGORICAL_COLS]).toarray(),
        columns=ohe.get_feature_names_out(CATEGORICAL_COLS),
        index=frame.index,
    )
    scaled = pd.DataFrame(
        scaler.transform(frame[NUMERIC_COLS]),
        columns=NUMERIC_COLS,
        index=frame.index,
    )
    return pd.concat([encoded, scaled], axis=1)


# Load the data
df = pd.read_csv('houseRent.csv')

# Split the data into features (X) and target variable (y)
X = df[CATEGORICAL_COLS + NUMERIC_COLS]
y = df['price']

# Split BEFORE fitting any preprocessing so that no statistics from the
# held-out rows leak into the encoder or the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only.
# handle_unknown='ignore' encodes categories that were not seen during fitting
# as all zeros instead of raising when transforming test or new data.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_raw[CATEGORICAL_COLS])

scaler = StandardScaler()
scaler.fit(X_train_raw[NUMERIC_COLS])

X_train = prepare_features(X_train_raw)
X_test = prepare_features(X_test_raw)

# Fine-tune the model using Randomized Search
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False],
}

# random_state on the search makes the sampled candidates reproducible.
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best parameters and best score from the random search.
# No `scoring` is passed, so the score comes from the estimator's own
# `score` method.
print('Best Parameters:', random_search.best_params_)
print('Best Score:', random_search.best_score_)

# The search already refits the best configuration on the full training set
# (refit=True is the default); reuse that model instead of training a second,
# unseeded forest that would differ from the one that was selected.
rf = random_search.best_estimator_

# Make predictions on the test set
rf_pred = rf.predict(X_test)

rf_mse = mean_squared_error(y_test, rf_pred)
print('Random Forest MSE:', rf_mse)

# Predict the rent for a single new listing using the same preprocessing.
new_data = pd.DataFrame({
    'type': ['house'],
    'state': ['ca'],
    'baths': [1],
    'beds': [2],
    'sqfeet': [1000],
})
prediction = rf.predict(prepare_features(new_data))
print('Predicted rent:', prediction)
