In [1]:
# Let's get into XGBoost :)

# XGBoost is the leading model for working with data
# XGBoost models focus heavily on model tuning for peak accuracy
    # XGBoost is an implementation of the Gradient Boosted Decicion Tree algorithm
    # What are Gradient Boosted Deicion Trees?
        # A model that goes through cycles that repeatedly builds new models and combines them into an ensemble model
        
            # Cycles: Naive Model ====> Calculate Errors ======> Build Model Predicting Errors ======> Add Last Model Ensemble ======> Calculate Errors
            
        # What we're doing here is:
            # 1. we're calculating the errors for each observation in the dataset
            # 2. We then build a new model predicting those errors
            # 3. We add those predictors to the "ensemble" of models then repeat

    # To make a prediction, we add the predictions from each previous model. We use these to continuously fine tune our model to calculate new errors with new data to build the next one
    # XGBoost requires One key factor though:
        # We need base predictions to start the cycle!!
        # Initial predictions can be naive, and wildly inaccurate, this will kickstart the process and get fine tuned throughout it
        
# Let's get started!

In [21]:
# Initial Setup
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

# Import our dataset and drop missing values from 'Withdrew' column
df = pd.read_csv('./datasets/withdrawl_data.cvs')
df.dropna(axis=0, subset=['Withdrew'], inplace=True)

# Update 'No/Yes' object to '0/1' int64
df.update(df.Withdrew.replace('Yes',1))
df.update(df.Withdrew.replace('No',0))

# Define x and y of
y = df.Withdrew
x = df.drop(['Withdrew'], axis=1).select_dtypes(exclude=['object'])

# Split our datasets 
model_x, test_x, model_y, test_y = train_test_split(x.as_matrix(), y.as_matrix(), test_size=0.25)

# Impute the data
    # .fit_transform fits the data to the model, then transforms it
    # .transorm just imputes all missing values in the data
my_imputer = Imputer()
model_x = my_imputer.fit_transform(model_x)
test_x = my_imputer.transform(test_x)



In [37]:
# Now lets build and fit the model with XGBoost
from xgboost import XGBRegressor

# Define Model
xgb_model = XGBRegressor()

# Fit the model
xgb_model.fit(model_x, model_y, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [41]:
# Now let's make some predictions with our model
predictions = xgb_model.predict(test_x)
predictions

array([ 0.03286496,  0.11411265,  0.144741  ,  0.47576138,  0.09272575,
        0.05374989,  0.4620709 ,  0.0028688 ,  0.46653074,  0.9864986 ,
        0.98769724,  0.63574755, -0.07120657,  0.9812927 ,  0.97256005,
        0.9879961 ,  0.05758655,  0.60597897,  1.0065932 ,  1.0424104 ,
        0.70486283,  0.05088085,  0.5135442 ], dtype=float32)

In [56]:
# Now let's evaluate our model using MAE
from sklearn.metrics import mean_absolute_error

print("MAE: " + str(mean_absolute_error(predictions, test_y)))

MAE: 0.3474625854388527


In [51]:
# MODEL TUNING: Now let's fine tune our model to increase its accuracy and training speed

# Parameters to know
    # 1. n_estimators
        # - specifies the # of times to go through the model cycle
        # - n_estimators help find the sweet spot between underfitting and overfitting
            # underfitting: inaccurate predictions on model data and new data
            # overfitting: highly accurate predictions on model data, inaccurate on new data
            # - n_estimators=1000
            
    # 2. early_stopping_rounds
        # - is a way to automatically find the ideal n_estimators
        # - This causes a model to automatically stop iterating when validation scores stop improving
        # - it's smart enough to stop for n_estimators and smart enough to find the optimal time
        # - specify the number of rounds to iterate through a cycle
            # - early_stopping_rounds=5
                # stop after 5 rounds of iterating
                
    # 3. learning_rate
        # - Instead of adding the predictions from each component, multiple them by a small number before adding them
        # - This reduces the model's possibility of overfitting
        # - You can set a high level of n_estimators without overfitting
        # - A small learning rate will yield more XGBoost models

    # 4. n_jobs
        # - n_jobs will help make building models parallel so models build faster
        # - on large datasets, small datasets won't really be impacted much

In [73]:
# Let's re-create our fit by adding n_estimates, and early_stopping rounds

# Now lets build and fit the model with XGBoost
from xgboost import XGBRegressor

# Define Model with n_estimators and s learning_rate
new_xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=1)

# Fit the model with early_stopping_rounds
# eval_set allows you to evaluate your test data against this fit
xgb_model.fit(model_x, model_y, early_stopping_rounds=5, eval_set=[(test_x,test_y)],verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [79]:
# Now let's make some predictions with our model
# Fit the model
new_xgb_model.fit(model_x, model_y, verbose=False)
new_predictions = new_xgb_model.predict(test_x)
print("MAE: " + str(mean_absolute_error(new_predictions, test_y)))

MAE: 0.3379264888556107
