In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from utils.hyper_params import perform_random_search
import json


In [6]:
df= pd.read_csv('./data/train_preprocessed.csv')
scaler = StandardScaler()
X=scaler.fit_transform(df.drop('SalePrice', axis=1))
X_train, X_test, y_train, y_test = train_test_split(df.drop('SalePrice',axis =1), df['SalePrice'], test_size=0.2, random_state=0)
params = perform_random_search(X_train,y_train)

In [7]:
model = XGBRegressor(**params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
print(mean_squared_error(predictions, y_test))
print(r2_score(y_test, y_pred))

811826315.7773973
0.8824435472488403


In [8]:
cv = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

In [9]:
print(scores)
scores.mean()

[0.93299788 0.77586806 0.89375132 0.88200676 0.88745505]


np.float64(0.8744158148765564)

In [12]:
grid_search = pd.read_csv('../xgb-grid-search-results-01.csv')
best=(best:=grid_search.loc[(grid_search["rank_test_score"]==1)]).loc[best['std_score_time'].idxmax()]
params=json.loads(best['params'].replace("'",'"'))
model_best = XGBRegressor(**params)
model_best.fit(X_train, y_train)
y_pred = model_best.predict(X_test)
predictions = [round(value) for value in y_pred]
print(mean_squared_error(predictions, y_test))
print(r2_score(y_test, y_pred))

1103683719.6849315
0.8401814699172974


In [13]:
cv = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_val_score(model_best, X_train, y_train, cv=cv, scoring='r2')
print(scores)
scores.mean()

[0.93282354 0.80984277 0.90231097 0.89117372 0.9031918 ]


np.float64(0.8878685593605041)

The results using random search are slightly better after getting the best params from random search, yet the cross validated score is almost the same. 