# Reading Data

In [3]:
import numpy as np
import pandas as pd
data = pd.read_csv ("housing.csv")
print (f"data shape: {data.shape}")
data.head()

data shape: (20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Preprocessing

In [4]:
print(data.info())
print (data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952

In [5]:
data = data.dropna()
data.shape

(20433, 10)

In [6]:
# difining train and test data

from sklearn.model_selection import train_test_split
import pandas as pd

data = pd.get_dummies (data, columns= ['ocean_proximity'])

X=data.drop ('median_house_value', axis=1)
y = data ['median_house_value']

x_train, x_test, y_train, y_test = train_test_split (X, y, test_size=0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(16346, 13)
(4087, 13)
(16346,)
(4087,)


In [7]:
# Normalizing data

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train = scaler.fit_transform (x_train)
x_test = scaler.transform (x_test)


# Model training

In [8]:
import xgboost as xgb

model = xgb.XGBRegressor(n_estimators=150, learning_rate=0.1, max_depth=5, subsample=0.6)
model.fit (x_train, y_train)


In [9]:
# predicting y based on x_(ytain and test)

y_pred_train = model.predict (x_train)
y_pred_test = model.predict (x_test)

# Evaluating Metric: RMSE Calculation

In [10]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse_train= np.sqrt(mean_squared_error (y_train, y_pred_train))
rmse_test= np.sqrt(mean_squared_error (y_test, y_pred_test))
print(y_test.mean())
print (rmse_train)
print (rmse_test)

206333.52165402495
40465.43606062413
51029.76870478212


# Hyperparameter Tuning

In [None]:
#Hyperparameter tuning using GridSearch

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8]

}
grid_search = GridSearchCV (xgb.XGBRFRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit (x_train, y_train)

print("Best params:", grid_search.best_estimator_)

KeyboardInterrupt: 

In [None]:
import joblib
joblib.dump (grid_search.best_estimator_, 'best_model.pkl')

['best_model.pkl']

### Evaluating metric after optimization

In [None]:
best_model = grid_search.best_estimator_
y_pred_train_bm = best_model.predict(x_train)
y_pred_test_bm = best_model.predict(x_test)
rmse_train_bm= np.sqrt(mean_squared_error (y_train, y_pred_train_bm))
rmse_test_bm= np.sqrt(mean_squared_error (y_test, y_pred_test_bm))
print(y_test.mean())
print (rmse_train_bm)
print (rmse_test_bm)

206859.55199412772
100928.07896020719
100785.01162156006
