# Importing the Libararies 

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
name= ['crime', 'zone', 'industry', 'charles', 'no', 'rooms', 'age', 'distance', 'radial', 'tax', 'pupil', 'aam', 'lower', 'med_price']
df = pd.read_csv("housing.csv",delim_whitespace=True,names=name)
df.head()

Unnamed: 0,crime,zone,industry,charles,no,rooms,age,distance,radial,tax,pupil,aam,lower,med_price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


# Pipelines

In [3]:
X, y = df.iloc[:,:-1], df.iloc[:,-1]

# RandomForest Regressor with pipeline

In [5]:
rf_pipeline = Pipeline([("st_scaler", StandardScaler()),
                        ("rf_model", RandomForestRegressor())])
scores = cross_val_score(rf_pipeline, X, y, scoring='neg_mean_squared_error', cv=10)
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final RMSE :", final_avg_rmse)

Final RMSE : 4.206616758067607


# XGBoost

In [8]:
y = df.med_price
X = df.drop(['med_price'], axis=1)

In [9]:
X_train,y_train,X_test,y_test = train_test_split(X,y, test_size=0.2,random_state=123)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(404, 13)
(102, 13)
(404,)
(102,)


In [11]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [12]:
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective='reg:linear', n_estimators=10, seed=123)

# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)

# Compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("RMSE: %f" % (rmse))

RMSE: 3.782443


In [13]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

In [14]:
params = {"booster":"gblinear","objective":"reg:linear"}

In [15]:
xg_reg = xgb.train(params = params, dtrain=DM_train, num_boost_round=10)



In [16]:
preds = xg_reg.predict(DM_test)

In [17]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE : %f" % (rmse))

RMSE : 6.120722


In [18]:
# Convert the training and testing sets into DMatrixes: DM_train, DM_test
DM_train = xgb.DMatrix(X_train, y_train)
DM_test =  xgb.DMatrix(X_test, y_test)

# Create the parameter dictionary: params
params = {"booster":"gblinear", "objective":"reg:linear"}

# Train the model: xg_reg
xg_reg = xgb.train(params = params, dtrain=DM_train, num_boost_round=5)

# Predict the labels of the test set: preds
preds = xg_reg.predict(DM_test)

# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test,preds))
print("RMSE: %f" % (rmse))

RMSE: 6.935592


# XGBoost with Pipeline

In [19]:
xgb_pipeline = Pipeline([("st_scaler", StandardScaler()),
                        ("xgb_model", xgb.XGBRegressor())])
scores = cross_val_score(rf_pipeline, X, y, scoring='neg_mean_squared_error', cv=10)
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final RMSE :", final_avg_rmse)

Final RMSE : 4.236029456347665


# Tuning XGBoost hyperparameter in pipeline

In [22]:
xgb_pipeline = Pipeline([("st_scaler", StandardScaler()),
                       ("xgb_model", xgb.XGBRegressor())])
gbm_param_grid = {
    'xgb_model__subsample': np.arange(0.05, 1, .05),
    'xgb_model__max_depth': np.arange(3,20,1),
    'xgb_model__colsample_bytree': np.arange(.1,1.05,.05)
}

In [23]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_pipeline, param_distributions=gbm_param_grid, n_iter=10, scoring='neg_mean_squared_error', cv=4)

In [26]:
randomized_neg_mse.fit(X, y)

In [27]:
print("Best rmse: ", np.sqrt(np.abs(randomized_neg_mse.best_score_)))

Best rmse:  4.685382405013096
