# XGBoost Regression Template

```.zsh
$ conda install -c conda-forge xgboost
```

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

## Read the dataset into Python

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data; as_frame=True asks scikit-learn to
# return the data as pandas objects, exposed under the 'frame' entry.
dbunch = fetch_california_housing(as_frame=True)
df = dbunch.frame

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

## Prepare raw data for XGBoost

### Encode string features

In [None]:
def encode_string_features(df, use_cats=True):
    """Return a copy of `df` with its object-dtype columns encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    use_cats : bool, default True
        If True, object columns are cast to the pandas 'category' dtype.
        If False, each object column is cast to str and replaced by the
        integer codes produced by scikit-learn's LabelEncoder.

    Returns
    -------
    pandas.DataFrame
        The encoded copy; columns of any other dtype are left untouched.
    """
    encoded = df.copy()
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
    for name in object_columns:
        if not use_cats:
            # Imported lazily so the categorical path does not need scikit-learn.
            from sklearn.preprocessing import LabelEncoder
            as_text = encoded[name].astype('str')
            encoded[name] = LabelEncoder().fit_transform(as_text)
        else:
            encoded[name] = encoded[name].astype('category')
    return encoded

# use_cats=False: object columns are label-encoded to integers rather than
# converted to the pandas 'category' dtype.
df = encode_string_features(df, use_cats=False)

### Encode date and timestamp features

In [None]:
def encode_datetime_features(df, datetime_features, datetime_attributes):
    """Return a copy of `df` with numeric columns derived from datetime columns.

    For every (feature, attribute) pair a new column named
    '<feature>_<attribute>' is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    datetime_features : iterable of str
        Names of the columns to expand; each must support the `.dt` accessor.
    datetime_attributes : iterable of str
        Attribute names read from the `.dt` accessor (e.g. 'year'). The
        special name 'days_since_epoch' yields the whole number of days
        elapsed since 1970-01-01.

    Returns
    -------
    pandas.DataFrame
        The copy with the derived columns appended; the source datetime
        columns are kept.
    """
    result = df.copy()
    epoch = pd.Timestamp(year=1970, month=1, day=1)
    for column in datetime_features:
        for attribute in datetime_attributes:
            if attribute != 'days_since_epoch':
                values = getattr(result[column].dt, attribute)
            else:
                values = (result[column] - epoch).dt.days
            result[f'{column}_{attribute}'] = values
    return result

# Names of the datetime columns to expand. Left empty here, so the call
# below adds no columns.
datetime_features = []

# Attributes derived for each datetime column; 'days_since_epoch' is computed
# by encode_datetime_features itself, the rest come from the `.dt` accessor.
datetime_attributes = [
    'year', 'month', 'day', 'quarter',
    'day_of_year', 'day_of_week',
    'days_since_epoch',
]

df = encode_datetime_features(
    df,
    datetime_features=datetime_features,
    datetime_attributes=datetime_attributes,
)

### Transform the target if necessary

## Train and Evaluate the XGBoost regression model

In [None]:
# list(df.columns)

In [None]:
# Model inputs are the feature columns declared by the dataset; the target is
# the first declared target column.
features = dbunch.feature_names
target = dbunch.target_names[0]

### Split the data into training and validation sets

In [None]:
# Temporal Validation Set
def train_test_split_temporal(df, datetime_column, n_test):
    """Split `df` chronologically, holding out the latest `n_test` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    datetime_column : str
        Column whose sort order defines "earlier" and "later".
    n_test : int
        Number of most-recent rows placed in the test frame. Values larger
        than ``len(df)`` put every row in the test frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames, each ordered by `datetime_column`.

    Raises
    ------
    ValueError
        If `n_test` is negative.
    """
    if n_test < 0:
        raise ValueError(f'n_test must be non-negative, got {n_test}')
    # Sort the raw values so the result is a plain array of row positions
    # for .iloc; a stable sort keeps tied timestamps in their original order.
    idx_sort = np.argsort(df[datetime_column].to_numpy(), kind='stable')
    # Slice from the front: a negative slice would invert the split for n_test == 0.
    n_train = max(len(idx_sort) - n_test, 0)
    idx_train, idx_test = idx_sort[:n_train], idx_sort[n_train:]
    return df.iloc[idx_train, :], df.iloc[idx_test, :]


# Random Validation Set
def train_test_split_random(df, n_test):
    """Split `df` at random, holding out `n_test` rows.

    The shuffle is seeded with a fixed value, so repeated calls on frames of
    the same length pick the same row positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    n_test : int
        Number of rows placed in the test frame. Values larger than
        ``len(df)`` put every row in the test frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames, in shuffled order.

    Raises
    ------
    ValueError
        If `n_test` is negative.
    """
    if n_test < 0:
        raise ValueError(f'n_test must be non-negative, got {n_test}')
    # NOTE: this reseeds NumPy's global RNG; kept so that code relying on that
    # reproducibility side effect behaves as before.
    np.random.seed(42)
    idx_sort = np.random.permutation(len(df))
    # Slice from the front: a negative slice would invert the split for n_test == 0.
    n_train = max(len(idx_sort) - n_test, 0)
    idx_train, idx_test = idx_sort[:n_train], idx_sort[n_train:]
    return df.iloc[idx_train, :], df.iloc[idx_test, :]

In [None]:
# Number of rows held out for validation.
n_valid = 1000


def my_train_test_split(d, n_valid):
    """Split `d` into (train, validation) frames via the chosen strategy."""
    # Temporal alternative:
    # return train_test_split_temporal(d, 'date_column', n_valid)
    return train_test_split_random(d, n_valid)


train_df, valid_df = my_train_test_split(df, n_valid)
train_df.shape, valid_df.shape

### Train using xgboost API

In [None]:
# Wrap the train/validation frames in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(train_df[features], label=train_df[target], enable_categorical=True)
dvalid = xgb.DMatrix(valid_df[features], label=valid_df[target], enable_categorical=True)

# default values for important parameters
params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

# Passed to xgb.train, which records the per-round metric history in it.
evals_result = {}
m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    evals_result=evals_result,
    verbose_eval=10,
)

### Train using the sklearn interface

### Evaluate the model and check for overfitting

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between two equally shaped arrays.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Lists, NumPy arrays and pandas Series
        are accepted; values are compared by position.

    Returns
    -------
    numpy floating scalar
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    # Coerce to arrays so plain lists work and pandas objects are compared
    # positionally rather than being aligned on their index labels.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sqrt(np.mean((y_true - y_pred)**2))


# Alias for the metric used by the evaluation cells.
my_eval_metric = root_mean_squared_error
# Score the trained model's predictions against the validation labels.
my_eval_metric(y_true=dvalid.get_label(), y_pred=m.predict(dvalid))

In [None]:
# Plot the per-round RMSE recorded during training for both evaluation sets;
# a widening gap between the curves indicates overfitting.
pd.DataFrame(
    {split: evals_result[split]['rmse'] for split in ('train', 'valid')}
).plot()
plt.xlabel('boosting round')
plt.ylabel('objective');

### Check feature importance

In [None]:
fig, ax = plt.subplots(figsize=(5, 2))
# Importance scores of type 'weight' as reported by the booster, largest first.
feature_importances = pd.Series(m.get_score(importance_type='weight'))
feature_importances = feature_importances.sort_values(ascending=False)
feature_importances.plot.barh(ax=ax)
ax.set_title('Feature Importance');

## Model Iteration

In [None]:
# Explicit feature list and target for this iteration; edit these to experiment.
features = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]
target = 'MedHouseVal'

dtrain = xgb.DMatrix(train_df[features], label=train_df[target], enable_categorical=True)
dvalid = xgb.DMatrix(valid_df[features], label=valid_df[target], enable_categorical=True)

params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

# Report the train/valid metric every 10 rounds.
m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
)

### Feature selection

#### Drop low-importance features

In [None]:
# One descending-sorted importance Series per importance type reported by
# the booster.
feature_importances_weight, feature_importances_cover, feature_importances_gain = (
    pd.Series(m.get_score(importance_type=kind)).sort_values(ascending=False)
    for kind in ('weight', 'cover', 'gain')
)

In [None]:
# Keep only the top-ranked features under one importance type.
# Alternatives:
# features = list(feature_importances_weight[:30].index)
# features = list(feature_importances_cover[:35].index)
features = feature_importances_gain.index[:8].tolist()

dtrain = xgb.DMatrix(train_df[features], label=train_df[target], enable_categorical=True)
dvalid = xgb.DMatrix(valid_df[features], label=valid_df[target], enable_categorical=True)

params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

# Retrain on the reduced feature set, reporting metrics every 10 rounds.
m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
)

#### Drop one feature at a time

In [None]:
# Drop each feature one-at-a-time and record the validation score of a model
# trained without it.

# The training settings are the same for every fit, so define them once
# instead of rebuilding them on each iteration.
params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

scores = []
for i, feature in enumerate(features):
    # Every feature except the i-th.
    drop_one_features = features[:i] + features[i+1:]

    # NOTE: dtrain, dvalid and m are rebound on every pass, so after the loop
    # they refer to the last drop-one fit, not to a model on all features.
    dtrain = xgb.DMatrix(data=train_df[drop_one_features], label=train_df[target], enable_categorical=True)
    dvalid = xgb.DMatrix(data=valid_df[drop_one_features], label=valid_df[target], enable_categorical=True)

    m = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round,
                  evals=[(dtrain, 'train'), (dvalid, 'valid')],
                  verbose_eval=False)
    score = my_eval_metric(dvalid.get_label(), m.predict(dvalid))
    scores.append(score)

# Each row pairs a dropped feature with the validation score obtained without it.
results_df = pd.DataFrame({
    'feature': features,
    'score': scores
})
results_df.sort_values(by='score')

In [None]:
# Feature list to keep after the drop-one experiment; remove entries here to
# test a smaller model.
features = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude',
]

dtrain = xgb.DMatrix(train_df[features], label=train_df[target], enable_categorical=True)
dvalid = xgb.DMatrix(valid_df[features], label=valid_df[target], enable_categorical=True)

params = {
    'learning_rate': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

# Rebuild the matrices and model, since the preceding loop rebinds them.
m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
)

### Tune the XGBoost hyperparameters

In [None]:
# Tuned tree parameters: deeper trees, a larger min_child_weight and row
# subsampling, at the original learning rate.
params = {
    'learning_rate': 0.3,
    'max_depth': 10,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50

m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=10,
)

In [None]:
# Divide the learning rate by `multiplier` and multiply the number of
# boosting rounds by the same factor.
multiplier = 50
params = {
    'learning_rate': 0.3/multiplier,
    'max_depth': 10,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bynode': 1,
    'objective': 'reg:squarederror',
}
num_boost_round = 50*multiplier

# Report metrics every 200 rounds.
m = xgb.train(
    params,
    dtrain,
    num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    verbose_eval=200,
)

In [None]:
# Score the most recently trained model `m` on the validation set.
my_eval_metric(dvalid.get_label(), m.predict(dvalid))