In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Load the data
df = pd.read_csv('houseRent.csv')

# Split the data into features (X) and target variable (y)
X = df[['type', 'state', 'baths', 'beds', 'sqfeet']]
y = df['price']

# One-hot encode categorical features
ohe = OneHotEncoder()
X_cat = ohe.fit_transform(X[['type', 'state']])
X_cat = pd.DataFrame(X_cat.toarray(), columns=ohe.get_feature_names_out(['type', 'state']))

# Scale numerical features
scaler = StandardScaler()
X_num = scaler.fit_transform(X[['baths', 'beds', 'sqfeet']])
X_num = pd.DataFrame(X_num, columns=['baths', 'beds', 'sqfeet'])

# Combine the categorical and numerical features
X = pd.concat([X_cat, X_num], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fine-tune the model using Grid Search
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsRegressor()

grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score from the grid search
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

# Fit the model with the best hyperparameters
knn = KNeighborsRegressor(**grid_search.best_params_)
knn.fit(X_train, y_train)

# Make predictions on the test set
knn_pred = knn.predict(X_test)

# Evaluate the model using Mean Squared Error
knn_mse = mean_squared_error(y_test, knn_pred)
print('K-Nearest Neighbors MSE:', knn_mse)

# Make predictions on new data
new_data = pd.DataFrame({
    'type': ['house'],
    'state': ['ca'],
    'baths': [1],
    'beds': [2],
    'sqfeet': [1000]
})
new_data_cat = ohe.transform(new_data[['type', 'state']])
new_data_cat = pd.DataFrame(new_data_cat.toarray(), columns=ohe.get_feature_names_out(['type', 'state']))
new_data_num = new_data[['baths', 'beds', 'sqfeet']]
new_data_num = scaler.transform(new_data_num)
new_data_num = pd.DataFrame(new_data_num, columns=['baths', 'beds', 'sqfeet'])
new_data = pd.concat([new_data_cat, new_data_num], axis=1)
prediction = knn.predict(new_data)
print('Predicted rent:', prediction)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
Best Score: -90.56684129639771
K-Nearest Neighbors MSE: 2432550470.9831862
Predicted rent: [2556.42857143]
