# XG BOOST
- XGBoost is an implementation of gradient-boosting decision trees. 
- XGBoost is designed for speed, ease of use, and performance on large datasets

In [320]:
# First, put this prompt: "conda install -c conda-forge py-xgboost" in anaconda to download xgboost package
# install xgboost in jupyter
!pip install xgboost



In [361]:
# import the classifier from the xgboost package
from xgboost import XGBClassifier
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [363]:
#load dataset
#data = np.loadtxt("uniform_small_d_1.tex")
#data = np.loadtxt("uniform_large_d_1.tex")
#data = np.loadtxt("gaussian_small_d_1.tex")
data = np.loadtxt("gaussian_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,4.645936,4.44442,5.882756,6.699313,3.362616,5.624176,4.552078,2.622931,6.745673,5.324437,...,5.005013,5.525472,5.835458,4.334682,5.882338,4.320398,7.020972,4.373049,6.877316,1.0
1,6.685283,6.17486,5.757977,6.520946,5.247406,7.182799,7.582562,5.345074,6.239722,5.959306,...,5.936925,7.035587,5.807185,6.260498,5.713241,4.741151,8.523618,7.544684,6.321774,0.0
2,5.833938,5.480186,4.660813,2.640568,4.991246,5.329018,4.527029,4.486931,5.577468,4.682285,...,3.801542,3.500481,5.433775,4.678873,5.093956,4.844797,5.067531,5.539606,4.852789,1.0
3,3.84069,3.33242,4.881624,4.554784,4.809637,2.666808,5.156898,4.46481,5.552007,4.911071,...,5.185953,5.964207,5.399361,3.848238,5.883973,5.656945,4.562846,4.012647,6.632066,1.0
4,5.44389,5.533707,5.561488,4.913582,5.843202,5.324853,5.74933,5.860838,5.765502,5.11737,...,3.1634,5.900974,4.625896,5.144522,4.693454,6.53456,4.659843,4.017394,6.300626,1.0


In [365]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(25):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,5.0,4.0,6.0,7.0,3.0,6.0,5.0,3.0,7.0,5.0,...,5.005013,5.525472,5.835458,4.334682,5.882338,4.320398,7.020972,4.373049,6.877316,1.0
1,7.0,6.0,6.0,7.0,5.0,7.0,8.0,5.0,6.0,6.0,...,5.936925,7.035587,5.807185,6.260498,5.713241,4.741151,8.523618,7.544684,6.321774,0.0
2,6.0,5.0,5.0,3.0,5.0,5.0,5.0,4.0,6.0,5.0,...,3.801542,3.500481,5.433775,4.678873,5.093956,4.844797,5.067531,5.539606,4.852789,1.0
3,4.0,3.0,5.0,5.0,5.0,3.0,5.0,4.0,6.0,5.0,...,5.185953,5.964207,5.399361,3.848238,5.883973,5.656945,4.562846,4.012647,6.632066,1.0
4,5.0,6.0,6.0,5.0,6.0,5.0,6.0,6.0,6.0,5.0,...,3.1634,5.900974,4.625896,5.144522,4.693454,6.53456,4.659843,4.017394,6.300626,1.0


In [367]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [369]:
#save training data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)

In [371]:
# create model instance
# n_estimators: number of trees(estimators) the model uses --> the more used, the more accurate the model is
# max_depth: maximum depth of tree --> higher number makes model more complex, but too high can cause overfitting
# learning_rate: quantifies each tree's contribution to total prediction --> lower number takes longer, but can lead to better generalization
# objective: binary:logistic outputs probabilities. if classification is wanted, use binary:hinge
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', enable_categorical = 'True')

# fit model with the training data
bst.fit(X_train, y_train)

# make predictions for the test dataset
preds = bst.predict(X_test)

# print predictions
print(preds)

# print model Accuracy (how often the classifier is correct)
print("Accuracy:",metrics.accuracy_score(y_test, preds))

[1 0 1 1 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0
 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 1]
Accuracy: 0.9
