In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20; fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

### 1. Read data

In [None]:
# Training data; its TripType column is the prediction target used below.
train = pd.read_csv('Dataset/tree_train_data.csv')

In [None]:
# Sanity check: (rows, columns) of the loaded training frame.
train.shape

### 2. Re-encode the class labels

In [None]:
# XGB needs target classes to be in [0, num_classes)
def reclass(df):
    """Re-encode ``df.TripType`` as consecutive integers starting at 0.

    XGBoost needs target classes to lie in ``[0, num_classes)``. Each distinct
    original label is replaced by its rank among the sorted distinct labels,
    so the smallest label becomes 0, the next one 1, and so on.

    The number of classes is derived from the data instead of being capped at
    38, so a frame with more distinct labels no longer raises ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TripType`` column. The column is overwritten
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # enumerate() numbers every distinct label, however many there are.
    sorted_labels = np.sort(df['TripType'].unique())
    typeDict = {real_type: n for n, real_type in enumerate(sorted_labels)}
    df['TripType'] = df['TripType'].map(typeDict)
    return df

In [None]:
# reclass() overwrites train.TripType in place and returns the same frame.
train = reclass(train)

### 3. Split the data 80:20 into training and hold-out sets

In [None]:
def splitData(df, testsize = 0.2, randseed = 0):
    """Separate features from the ``TripType`` target and split both randomly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TripType`` column; it is not modified.
    testsize : float
        Passed to ``train_test_split`` as ``test_size`` (default 0.2).
    randseed : int
        Passed to ``train_test_split`` as ``random_state`` (default 0).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(['TripType'], axis=1)
    target = df['TripType']
    pieces = train_test_split(
        features,
        target,
        test_size=testsize,
        random_state=randseed,
    )
    features_train, features_test, target_train, target_test = pieces
    return features_train, features_test, target_train, target_test

In [None]:
# Uses splitData's defaults: test_size 0.2, random_state 0.
X1_train, X1_test, y1_train, y1_test = splitData(train)

### 4. Save to binary buffer

In [None]:
# The data is stored in a DMatrix object.
def saveBuffer(X, y, name):
    """Write ``X`` and ``y`` to ``Dataset/<name>.buffer`` as an xgboost DMatrix.

    Both inputs are converted with ``np.array`` before the DMatrix is built;
    ``y`` is attached as the label.
    """
    feature_array = np.array(X)
    label_array = np.array(y)
    matrix = xgb.DMatrix(feature_array, label=label_array)
    buffer_path = 'Dataset/' + name + '.buffer'
    matrix.save_binary(buffer_path)

In [None]:
# Writes Dataset/xgboost_train.buffer and Dataset/xgboost_test.buffer.
# 'xgboost_test' is the hold-out part of the training file produced by
# splitData, not the separate tree_test_data.csv.
saveBuffer(X1_train, y1_train, 'xgboost_train')
saveBuffer(X1_test, y1_test, 'xgboost_test')

### 5. Prepare the test data and save it to a binary buffer

In [None]:
# Test-set features; nothing in this notebook reads a TripType column from it.
test = pd.read_csv('Dataset/tree_test_data.csv')

In [None]:
# Add an all-zero 'HEALTH AND BEAUTY AIDS' column at position 37.
# NOTE(review): presumably the test CSV lacks this feature and 37 is its
# position among the training features -- confirm against the training columns.
# The membership check keeps this cell re-runnable: DataFrame.insert raises
# ValueError when the column already exists.
if 'HEALTH AND BEAUTY AIDS' not in test.columns:
    test.insert(37, column='HEALTH AND BEAUTY AIDS', value=0)

In [None]:
# Persist the test frame with the added column.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra leading column -- confirm that readers of real_test.csv expect it.
test.to_csv('Dataset/real_test.csv')

In [None]:
# DMatrix built from the whole test frame as a plain array; no label is attached.
dtest = xgb.DMatrix(np.array(test))

In [None]:
# Store the test DMatrix alongside the other buffers in Dataset/.
dtest.save_binary('Dataset/dtest.buffer')