# XG BOOST
- XGBoost is an implementation of gradient-boosting decision trees. 
- XGBoost is designed for speed, ease of use, and performance on large datasets

In [48]:
# First, put this prompt: "conda install -c conda-forge py-xgboost" in anaconda to download xgboost package
# install xgboost in jupyter
!pip install xgboost



In [49]:
# import the classifier from the xgboost package
from xgboost import XGBClassifier
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [50]:
#load dataset
data = np.loadtxt("uniform_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
#print(df_table)

In [51]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(26):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,2.0,3.0,2.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0,...,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,3.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,...,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,...,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [52]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [53]:
#save training data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)

In [54]:
# create model instance
# n_estimators: number of trees(estimators) the model uses --> the more used, the more accurate the model is
# max_depth: maximum depth of tree --> higher number makes model more complex, but too high can cause overfitting
# learning_rate: quantifies each tree's contribution to total prediction --> lower number takes longer, but can lead to better generalization
# objective: binary:logistic outputs probabilities. if classification is wanted, use binary:hinge
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic', enable_categorical = 'True')

# fit model with the training data
bst.fit(X_train, y_train)

# make predictions for the test dataset
preds = bst.predict(X_test)

# print predictions
print(preds)

# print model Accuracy (how often the classifier is correct)
print("Accuracy:",metrics.accuracy_score(y_test, preds))

[0 0 1 0 0 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0
 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 1 1 0
 1 0 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 1 1 1 0]
Accuracy: 1.0
