<a href="https://colab.research.google.com/github/mervegb/deep-learning/blob/main/xgboost_explanation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# SimpleImputer fills in missing values; with its default settings it uses each column's mean.

# Load the data and drop rows with no target value.
# Chaining (instead of inplace=True) keeps this cell idempotent on re-run.
data = pd.read_csv('./train.csv').dropna(axis=0, subset=['SalePrice'])

y = data.SalePrice  # target variable to predict
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])  # numeric predictors only

# A fixed random_state makes the split, and every score computed from it,
# reproducible across "Restart & Run All".
train_X, test_X, train_y, test_y = train_test_split(
    X.to_numpy(), y.to_numpy(), test_size=0.25, random_state=0
)

# Fit the imputer on the training split only, then reuse those statistics on the
# test split so no information from the test rows leaks into preprocessing.
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [7]:
from xgboost import XGBRegressor

# Boosting trains models sequentially: each new model learns from the errors
# of the ones before it.
#
# XGBoost is a gradient-boosting implementation that
#   - applies L1/L2 regularization to limit overfitting,
#   - handles missing values and supports parallel processing,
# and XGBRegressor is its estimator for regression (continuous targets).

# Baseline model with default hyperparameters.
my_model = XGBRegressor()
my_model.fit(X=train_X, y=train_y, verbose=False)  # train on the training split

In [8]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out split and score the model.
predictions = my_model.predict(test_X)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, but keeping the documented order avoids bugs if the metric is
# later swapped for an asymmetric one.
mae = mean_absolute_error(test_y, predictions)
print("Mean Absolute Error : " + str(mae))

Mean Absolute Error : 17606.895184075343


In [9]:
# XGBoost has a few parameters that strongly affect accuracy and training speed:
#   n_estimators          => how many boosting rounds to run; too few underfits,
#                            too many overfits.
#   early_stopping_rounds => stop training once the validation score has not
#                            improved for this many consecutive rounds.
# A common recipe is a high n_estimators combined with early stopping, so the
# model picks its own stopping point.
#
# early_stopping_rounds is passed to the constructor: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
#
# NOTE(review): the test split doubles as the early-stopping validation set
# here, so any score later computed on test_X is optimistic. A separate
# validation split carved out of the training data would be cleaner.
my_model = XGBRegressor(n_estimators=1000, early_stopping_rounds=5)
my_model.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=False)



In [10]:
# In general a smaller learning rate yields a more accurate XGBoost model, at
# the cost of longer training because more boosting rounds are needed.
#
# early_stopping_rounds is passed to the constructor: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5)
my_model.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=False)



In [None]:
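# On larger datasets you can use parallelism to build models faster.
# n_jobs => number of parallel threads XGBoost uses, e.g. XGBRegressor(n_jobs=4);
#           on small datasets like this one the speed-up is negligible.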