# Installation

Here we follow the instructions from https://xgboost.readthedocs.io/en/latest/build.html

1. git clone --recursive https://github.com/dmlc/xgboost
2. cd xgboost
3. mkdir build
4. cd build
5. cmake .. -DUSE_CUDA=ON # build with CUDA support because I want to use the GPU to speed up training
6. make -j
7. cd ../python-package
8. python setup.py install

# Uninstall 

1. rm /home/fanta/.local/lib/python3.6/site-packages/xgboost-0.7-py3.5.egg 
2. rm -r xgboost

# Template

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.model_selection import ShuffleSplit, GridSearchCV

def PCA_transformation(X_train, X_test, categorial_columns, n_components=24):
    """Replace the non-categorical features with standard-deviation-scaled PCA components.

    The train and test frames are stacked (the PCA is fitted on both
    together), the columns named in ``categorial_columns`` are set aside,
    and the remaining columns are projected with PCA. Every component is
    then divided by its own standard deviation, and the categorical columns
    are placed back in front of the components.

    Args:
        X_train: training features as a pandas DataFrame.
        X_test: test features; stacked below ``X_train`` before fitting.
        categorial_columns: labels of the columns that bypass the PCA and
            are copied through unchanged.
        n_components: forwarded to ``PCA(n_components=...)``. The default
            of 24 is the value the original author reported as giving a
            cumulative explained variance ratio above 0.99999 on their data.

    Returns:
        A ``(train, test)`` tuple of DataFrames holding the categorical
        columns followed by ``Feature_00``, ``Feature_01``, ... The first
        frame has the first ``len(X_train)`` rows, the second the rest.
        Both carry the positional index of the stacked frame, not the
        original indices of the inputs.
    """
    pca = PCA(n_components=n_components)
    n_train = len(X_train)

    # ignore_index gives the stacked frame a fresh 0..n-1 index, which is
    # what lets the axis=1 concat below line up with the PCA output.
    X = pd.concat((X_train, X_test), ignore_index=True)

    X_apply = X.drop(labels=categorial_columns, axis=1)

    X_pca = pca.fit_transform(X_apply)

    # Scale each component by its standard deviation over all stacked rows.
    X_pca_norm = X_pca / X_pca.std(axis=0)

    # Name the columns after the number of components actually produced, so
    # the labels can never disagree with n_components.
    component_names = ["Feature_{:02d}".format(i) for i in range(X_pca.shape[1])]
    df_pca = pd.DataFrame(data=X_pca_norm, columns=component_names)

    df_combined = pd.concat([X[categorial_columns], df_pca], axis=1)

    return df_combined.iloc[:n_train], df_combined.iloc[n_train:]

def Cross_Validation(model, X, y, params, n_splits=5, test_size=0.2, random_state=42, scoring='neg_mean_squared_error', output_model_name="models/myModel"):
    """Grid-search ``params`` for ``model`` with shuffle-split CV and save the fitted search.

    Args:
        model: estimator handed to ``GridSearchCV``.
        X: feature matrix passed to ``fit``.
        y: target values passed to ``fit``.
        params: parameter grid (``param_grid`` of ``GridSearchCV``).
        n_splits: number of shuffle-split iterations.
        test_size: size of the held-out part in every split.
        random_state: seed for the ``ShuffleSplit`` shuffling.
        scoring: scoring name forwarded to ``GridSearchCV``.
        output_model_name: destination the fitted search is dumped to with
            ``joblib.dump``.

    Returns:
        The fitted ``GridSearchCV`` object (the same object that is dumped).
    """
    import os

    # Make sure the destination folder exists *before* the search starts:
    # the dump happens only after every fit has finished, so a missing
    # directory would otherwise throw away the whole run. Only path-like
    # destinations are inspected; anything else is left to joblib as before.
    if isinstance(output_model_name, str) or hasattr(output_model_name, "__fspath__"):
        output_dir = os.path.dirname(output_model_name)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    regression_cv = GridSearchCV(model, param_grid=params, cv=cv, scoring=scoring, verbose=2)

    regression_cv.fit(X, y)

    # Report the best cross-validated score and the parameters that achieved it.
    print(regression_cv.best_score_)
    print(regression_cv.best_params_)

    joblib.dump(regression_cv, output_model_name)

    return regression_cv

def XGBRegressor_Fitting(X, y, params, scoring='neg_mean_squared_error', output_model_name='models/XGBRegressor'):
    """Grid-search an XGBoost regressor over ``params`` via ``Cross_Validation``.

    Args:
        X: feature matrix.
        y: target values.
        params: parameter grid to search.
        scoring: scoring name forwarded to ``Cross_Validation``.
        output_model_name: destination forwarded to ``Cross_Validation``.
    """
    print("Starting XGBoost regression \n")

    # Fixed cross-validation settings used for every XGBoost search.
    cv_settings = dict(n_splits=5, test_size=0.2, random_state=42)

    # 'gpu_hist' selects XGBoost's GPU tree builder.
    gpu_model = XGBRegressor(tree_method='gpu_hist')

    Cross_Validation(
        gpu_model,
        X,
        y,
        params,
        scoring=scoring,
        output_model_name=output_model_name,
        **cv_settings
    )

if __name__ == '__main__':

    # ---- Load the data ----
    # The last column of the training CSV is used as the target; all the
    # columns before it are the features.
    X_train = pd.read_csv("KNN_minkowski_features.csv", index_col=0)
    target_column = X_train.columns[-1]
    y_train = X_train[target_column]
    X_train = X_train.drop(target_column, axis=1)

    X_test = pd.read_csv("KNN_minkowski_features_test.csv", index_col=0)

    # ---- PCA transformation to reduce the dimensionality ----
    # The three id-like columns are kept as they are; everything else is
    # replaced by PCA components.
    passthrough_columns = ['shop_id', 'item_id', 'cats']
    X_train_pca, X_test_pca = PCA_transformation(X_train, X_test, passthrough_columns)

    # ---- Hyper-parameter grid ----
    params = {
        'n_estimators': np.arange(150, 301, 50),                 # 150, 200, 250, 300
        'learning_rate': np.logspace(-3, -1, 3),                 # 1e-3, 1e-2, 1e-1
        'gamma': np.logspace(0, 1, 4),                           # 4 values from 1 to 10
        'max_depth': [7],
        'min_child_weight': np.linspace(1, 17, 5).astype(int),   # 1, 5, 9, 13, 17
        'colsample_bytree': [0.7],
        'subsample': [0.8],
    }

    # ---- Run the grid search and save the result ----
    XGBRegressor_Fitting(
        X_train_pca,
        y_train,
        params,
        output_model_name="XGBRegressor_KNN_minkowski_Meta_PCA",
    )