# XGBoost Regression Template

```zsh
$ conda install -c conda-forge xgboost
```

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

## Read dataset into Python

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data with pandas containers (`as_frame=True`)
# so the full table and the column names come from the same object.
dbunch = fetch_california_housing(as_frame=True)
df = dbunch.frame

# Column roles: the first listed target name, and all feature names.
target = dbunch.target_names[0]
features = dbunch.feature_names

# Quick look at column dtypes and non-null counts.
df.info()

In [None]:
from sklearn.model_selection import train_test_split 

# Hold out a fixed number of rows for validation (an integer `test_size`
# is passed straight to `train_test_split`); the seed keeps the split repeatable.
n_valid = 2000
train_df, valid_df = train_test_split(
    df,
    test_size=n_valid,
    random_state=42,
)
train_df.shape, valid_df.shape

## Training

### `train` function

In [None]:
# Booster settings: default values for the most important parameters.
params = dict(
    learning_rate=0.3,
    max_depth=6,
    min_child_weight=1,
    subsample=1,
    colsample_bynode=1,
    objective='reg:squarederror',
)
num_boost_round = 50

# Wrap each split in a DMatrix for the native training API.
dtrain, dvalid = (
    xgb.DMatrix(data=split[features], label=split[target], enable_categorical=False)
    for split in (train_df, valid_df)
)

# `evals_result` is handed to `xgb.train`, which records the evaluation
# history of every dataset listed in `evals` into it.
evals_result = {}
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtrain, 'train'), (dvalid, 'valid')],
    evals_result=evals_result,
    verbose_eval=10,
)

In [None]:
# Learning curves: per-round RMSE on the training and validation sets, read
# from the history that `xgb.train` stored in `evals_result`.
pd.DataFrame({
    'train': evals_result['train']['rmse'],
    'valid': evals_result['valid']['rmse'],
}).plot()
plt.xlabel('boosting round')
# The plotted values are the RMSE evaluation metric, so label the axis as such
# rather than as the training objective.
plt.ylabel('RMSE');

### `XGBRegressor` wrapper

In [None]:
# Booster settings for the scikit-learn style estimator.
params = dict(
    learning_rate=0.3,
    max_depth=6,
    min_child_weight=1,
    subsample=1,
    colsample_bynode=1,
    objective='reg:squarederror',
)
num_boost_round = 50

# With the wrapper, the number of boosting rounds is passed as `n_estimators`.
reg = xgb.XGBRegressor(n_estimators=num_boost_round, **params)
reg.fit(
    train_df[features],
    train_df[target],
    # Evaluate on the training split first, then on the validation split.
    eval_set=[(split[features], split[target]) for split in (train_df, valid_df)],
    verbose=10,
);

# If the native Booster object is needed: m = reg.get_booster()

## Feature Importance

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

# Score with negated RMSE so that "greater is better" holds, which is what
# `permutation_importance` assumes: importance = baseline score - permuted
# score. With this scorer that difference equals the increase in RMSE caused
# by shuffling a feature, so important features get positive values.
# (A plain RMSE scorer built with `make_scorer` defaults to greater-is-better
# and flips the sign; in addition, `mean_squared_error(squared=False)` is
# rejected by recent scikit-learn releases.)
permu_result = permutation_importance(reg, valid_df[features], valid_df[target],
                                      n_repeats=30, random_state=0,
                                      scoring='neg_root_mean_squared_error')

permu_imp = pd.Series(permu_result['importances_mean'], index=features)
# Ascending sort puts the most important feature at the top of the
# horizontal bar chart (barh draws the first row at the bottom).
permu_imp.sort_values().plot.barh()
plt.title('Permutation Importance on Out-of-Sample Set')
plt.xlabel('increase in RMSE');


## Partial Dependence

In [None]:
from sklearn.inspection import PartialDependenceDisplay

# One-way partial dependence on median income, plus a two-way plot over the
# (Longitude, Latitude) pair, evaluated on the validation rows.
PartialDependenceDisplay.from_estimator(
    estimator=reg,
    X=valid_df[features],
    features=['MedInc', ('Longitude', 'Latitude')],
);