# XG BOOST
- XGBoost is an implementation of gradient-boosting decision trees. 
- XGBoost is designed for speed, ease of use, and performance on large datasets

In [89]:
# First, put this prompt: "conda install -c conda-forge py-xgboost" in anaconda to download xgboost package
# install xgboost in jupyter
!pip install xgboost



In [90]:
# import the classifier from the xgboost package
from xgboost import XGBClassifier
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [91]:
#load dataset
data = np.loadtxt("uniform_small_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
#print(df_table)

In [92]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(26):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.20028,0.241395,0.898875,0.281621,0.395712,0.43372,0.53292,0.442125,0.8778,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.980781,0.912337,0.435496,0.462082,0.492288,0.808459,0.560076,0.299746,0.875868,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.667864,0.9624,0.65785,1.033765,0.785822,0.711929,0.617504,1.288232,0.621426,1.0
3,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.140005,0.176484,0.300608,0.312733,0.961515,0.7965,0.419527,0.588289,0.339199,0.0
4,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.521792,1.19901,1.065463,0.438707,1.295819,1.231076,0.803302,1.130015,0.616849,1.0


In [93]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [94]:
#save training data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)

In [95]:
# create model instance
# n_estimators: number of trees(estimators) the model uses --> the more used, the more accurate the model is
# max_depth: maximum depth of tree --> higher number makes model more complex, but too high can cause overfitting
# learning_rate: quantifies each tree's contribution to total prediction --> lower number takes longer, but can lead to better generalization
# objective: binary:logistic outputs probabilities. if classification is wanted, use binary:hinge
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', enable_categorical = 'True')

# fit model with the training data
bst.fit(X_train, y_train)

# make predictions for the test dataset
preds = bst.predict(X_test)

# print predictions
print(preds)

# print model Accuracy (how often the classifier is correct)
print("Accuracy:",metrics.accuracy_score(y_test, preds))

[0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 1
 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 1 0 0 1 0
 1 1 0 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 0 0 0 0 0 1 1 0]
Accuracy: 0.98
