In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import xgboost as xgb
import numpy as np
import pandas as pd

In [31]:
MAX_WORDS = 10000

In [32]:
train_data = pd.read_csv('../Data/train.csv')

In [33]:
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_WORDS)
tfidf_vectorizer.fit(train_data.text)

X_train = tfidf_vectorizer.transform(train_data.text)
y_train = train_data.label

In [34]:
EARLY_STOPPING_ROUNDS = 50
MAX_XGBOOST_ROUND = 1500
NUMBER_OF_CLASSES = 2

xgb_params = {'max_depth': 5,  # the maximum depth of each tree
              'eta': 0.2,  # the training step for each iteration
              'objective': 'multi:softmax',
              'num_class': NUMBER_OF_CLASSES,
              'min_child_weight': 2,
              'subsample': 1,
              'colsample_bytree': 0.5,
              'eval_metric': ['mlogloss', 'merror']}

k_fold = KFold(n_splits=2, random_state=1, shuffle=True)
result = xgb.cv(xgb_params, xgb.DMatrix(X_train, label=y_train), 
                num_boost_round=MAX_XGBOOST_ROUND, folds=k_fold,    
                early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose_eval=False)

In [35]:
print(result)

     train-mlogloss-mean  train-mlogloss-std  train-merror-mean  \
0               0.607691            0.001716           0.172663   
1               0.552424            0.003930           0.156258   
2               0.509456            0.002113           0.152660   
3               0.473832            0.000876           0.146535   
4               0.445886            0.001289           0.139896   
..                   ...                 ...                ...   
170             0.104938            0.001047           0.016106   
171             0.104468            0.001069           0.015763   
172             0.104037            0.001025           0.015506   
173             0.103434            0.001257           0.014992   
174             0.102711            0.001405           0.014564   

     train-merror-std  test-mlogloss-mean  test-mlogloss-std  \
0            0.005183            0.611897           0.000552   
1            0.000086            0.559833           0.002870   
2  

In [36]:
clf = xgb.train(xgb_params, xgb.DMatrix(X_train, label=y_train), 
                num_boost_round=MAX_XGBOOST_ROUND,    
                early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose_eval=False)

TypeError: train() got an unexpected keyword argument 'folds'

In [None]:
test_data = pd.read_csv('../Data/test.csv')

X_test = tfidf_vectorizer.transform(test_data.text)
y_test = test_data.label

predicted = clf.predict(X_test)
print(f'Accuracy: {np.mean(predicted == y_test) * 100}%')