<a href="https://www.kaggle.com/code/kelvinobiri/manhattan-house-rent-prediction?scriptVersionId=247505637" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Import Modules

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import KFold , cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


## Read Data

In [None]:
# Source CSV for the Manhattan listings, fetched over HTTP on every run.
DATA_URL = "https://raw.githubusercontent.com/sonnynomnom/Codecademy-Machine-Learning-Fundamentals/master/StreetEasy/manhattan.csv"
df = pd.read_csv(DATA_URL)

# Show the first rows as the cell output.
df.head()

## Getting Independent Features And Target

In [None]:

# Candidate predictors (independent features) and the regression target.
feature_cols = [
    'bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor',
    'building_age_yrs', 'no_fee', 'has_roofdeck', 'has_washer_dryer',
    'has_doorman', 'has_elevator', 'has_dishwasher', 'has_patio', 'has_gym',
]
x = df[feature_cols]
y = df['rent']

## Getting correlation between independent features and Target
# corrwith yields one Pearson coefficient per feature and keeps the feature
# name next to each value; a per-feature np.corrcoef call would instead print
# an unlabelled 2x2 matrix for every column.
feature_target_corr = x.corrwith(y).sort_values(ascending=False)
print(feature_target_corr)


## Correlation Matrix

In [None]:
# Pairwise correlations among the numeric feature columns.
corr_matrix = x.select_dtypes(include='number').corr()

# Figure size (in inches) follows the matrix dimensions; every cell is annotated
# with its coefficient.
fig, ax = plt.subplots(figsize=corr_matrix.shape)
sns.heatmap(corr_matrix, annot=True, ax=ax)

## Removing Highly correlated independent features 
Based on the heatmap, **bedrooms** and **size_sqft** are highly correlated. To avoid **multicollinearity**, I will remove **bedrooms** and keep **size_sqft**.

In [None]:
# Feature matrix rebuilt from df without 'bedrooms'; every other predictor is kept.
selected_features = [
    'bathrooms', 'size_sqft', 'min_to_subway', 'floor', 'building_age_yrs',
    'no_fee', 'has_roofdeck', 'has_washer_dryer', 'has_doorman',
    'has_elevator', 'has_dishwasher', 'has_patio', 'has_gym',
]
x = df[selected_features]

## Plotting Independent Features Against the Target Feature

In [None]:
# One scatter plot per selected feature, each against rent.
for feature in x.columns:
    fig, ax = plt.subplots()
    ax.scatter(df[feature], df['rent'])
    ax.set_xlabel(feature)
    ax.set_ylabel('rent')
    plt.show()

## Detecting Outliers

In [None]:
# One box plot per selected feature, used to eyeball outliers.
for feature in x.columns:
    fig, ax = plt.subplots()
    # Pass the column as `x=` explicitly. The meaning of the first positional
    # argument of sns.boxplot differs between seaborn releases (older: `x`,
    # newer: `data`), so the keyword keeps the values on the labelled x-axis
    # regardless of the installed version.
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_xlabel(feature)
    plt.show()


## Detecting Null Values

In [None]:
# Heatmap of the boolean missing-value mask: True marks a null cell in df.
sns.heatmap(df.isna())

In [None]:
# Number of missing values in each column of df.
df.isna().sum()

## Cross-Validation: Evaluating Candidates to Pick the Best Model

In [None]:
# Candidate regressors, compared by their mean cross-validated R^2.
models = {
    # Disabled candidates; uncomment to include them in the comparison.
    # "Ridge": Ridge(),
    # "Lasso": Lasso(),
    "Linear Regression": LinearRegression(),
    # "Random Forest": RandomForestRegressor(),
    # SVR is not scale invariant, so the features are standardised inside a
    # pipeline; cross_val_score then re-fits the scaler on the training part
    # of every fold, keeping the held-out fold out of the scaling statistics.
    "SVR": make_pipeline(StandardScaler(), SVR()),
}

# Shuffled 6-fold split with a fixed seed so the scores are reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

best_score = -np.inf
best_model_name = None

for name, candidate in models.items():
    # R^2 is the default score for regressors; it is spelled out here so the
    # printed numbers are unambiguous.
    scores = cross_val_score(candidate, x, y, cv=kf, scoring="r2")
    mean_score = np.mean(scores)
    print(f"{name}: {mean_score:.4f} (±{np.std(scores):.4f})")
    if mean_score > best_score:
        best_score = mean_score
        best_model_name = name

# A NaN mean never compares greater than -inf, so no winner is recorded when
# every candidate fails to score; fail with a clear message in that case.
if best_model_name is None:
    raise ValueError("No model produced a valid cross-validation score.")

# Unfitted winning estimator; it is fitted on the full data in the next step.
model = models[best_model_name]


print(f"Best model: {model}") 

## Training the Model and Predicting

In [None]:
# Fit the selected estimator on the full data set. fit() returns the estimator
# itself, so final_model and model refer to the same fitted object.
final_model = model.fit(x, y)

# Listing to price, keyed by column name so that no value can silently land
# in the wrong feature position.
new_listing = {
    'bathrooms': 1,
    'size_sqft': 750,
    'min_to_subway': 5,
    'floor': 2,
    'building_age_yrs': 10,
    'no_fee': 1,
    'has_roofdeck': 0,
    'has_washer_dryer': 1,
    'has_doorman': 0,
    'has_elevator': 1,
    'has_dishwasher': 1,
    'has_patio': 0,
    'has_gym': 1,
}

# One-row frame in the training column order. Selecting with x.columns raises
# KeyError if a feature is missing, and giving predict() a DataFrame avoids
# scikit-learn's "X does not have valid feature names" warning that a bare
# nested list triggers on a model fitted with named columns.
new_data = pd.DataFrame([new_listing])[x.columns]

predicted_rent = final_model.predict(new_data)
print(f"Predicted rent: ${predicted_rent[0]:.2f}")

In [None]:
# For every feature: scatter the actual rents, then overlay the model's
# prediction as that feature varies while all other features are held at
# their column means.
feature_means = x.mean()  # loop-invariant, so computed once

for i in x.columns:
    fig, ax = plt.subplots()
    ax.scatter(df[i], y, label='Actual')

    # Sort the rows by the plotted feature so the prediction curve is drawn
    # left to right; in row order the line would double back on itself.
    X_line = x.sort_values(i)
    for col in x.columns.drop(i):
        X_line[col] = feature_means[col]
    y_pred_line = final_model.predict(X_line)

    ax.plot(X_line[i], y_pred_line, color='red', label='Regression line')
    ax.set_xlabel(i)
    ax.set_ylabel('rent')
    ax.set_title(f'{i} vs Rent')
    ax.legend()
    plt.show()