In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor  
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import pandas as pd
import numpy as np
import sklearn.metrics as metrics

In [2]:
# reading our dataframe

dataset = pd.read_csv('data.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
dataset = pd.get_dummies(dataset, columns=['ocean_proximity'])

In [4]:
# Check for missing values in the DataFrame
print(dataset.isnull().sum())

# Fill missing values with median value of each column
dataset.fillna(dataset.median(), inplace=True)
dataset.info()

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
median_house_value              0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20640 non-null  float64
 1   latitude                    20640 non-null  float64
 2   housing_median_age          20640 non-null  float64
 3   total_rooms                 20640 non-null  float64
 4   total_bedrooms              20640 non-null  float64
 5 

In [5]:
# splitting data in training and test data

X, y = dataset.drop(columns=['median_house_value']), dataset['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(f'Size of train data: {len(X_train)}')
print(f'Size of test data: {len(X_test)}')

Size of train data: 16512
Size of test data: 4128


In [6]:
# Regression tree model fitting

rt_model = DecisionTreeRegressor()
rt_model.fit(X_train, y_train)
    
rt_model_pred = rt_model.predict(X_test)
mse=metrics.mean_squared_error(y_test, rt_model_pred) 
    
print(f'Regression tree model MSE: {round(mse,4)}')

Regression tree model MSE: 4893504484.4922


In [7]:
rf = RandomForestRegressor(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
mse=metrics.mean_squared_error(y_test, rf_pred) 
    
print(f'Random forest model MSE: {round(mse,4)}')

Random forest model MSE: 2376308913.6321


In [8]:
rf = RandomForestRegressor(n_estimators=30, random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
mse=metrics.mean_squared_error(y_test, rf_pred) 
    
print(f'Random forest model MSE: {round(mse,4)}')

Random forest model MSE: 2483654205.1762


In [9]:
gbm = GradientBoostingRegressor(n_estimators=5000, max_depth=4, random_state=42)
gbm.fit(X_train, y_train)

gbm_pred = gbm.predict(X_test)
mse=metrics.mean_squared_error(y_test, gbm_pred) 
    
print(f'GBM MSE: {round(mse,4)}')

GBM MSE: 2113046929.636
