In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
import xgboost as xgb
import numpy as np
import pandas as pd

In [39]:
MAX_WORDS = 10000

In [41]:
train_data = pd.read_csv('../Data/train.csv')

In [42]:
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_WORDS)
tfidf_vectorizer.fit(train_data.text)

X_train = tfidf_vectorizer.transform(train_data.text)
y_train = train_data.label

In [43]:
EARLY_STOPPING_ROUNDS = 50
MAX_XGBOOST_ROUND = 1500
NUMBER_OF_CLASSES = 2

xgb_params = {'max_depth': 5,  # the maximum depth of each tree
              'eta': 0.2,  # the training step for each iteration
              'objective': 'multi:softmax',
              'num_class': NUMBER_OF_CLASSES,
              'min_child_weight': 2,
              'subsample': 1,
              'colsample_bytree': 0.5,
              'eval_metric': ['mlogloss', 'merror']}

k_fold = KFold(n_splits=2, random_state=1, shuffle=True)
result = xgb.cv(xgb_params, xgb.DMatrix(X_train, label=y_train), 
                num_boost_round=MAX_XGBOOST_ROUND, folds=k_fold,    
                early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose_eval=False)

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=True)

In [45]:
train_evals_result = {}
clf = xgb.train(xgb_params, xgb.DMatrix(X_train, label=y_train), num_boost_round=MAX_XGBOOST_ROUND,
                evals=[(xgb.DMatrix(X_train, label=y_train), 'TRAIN'), (xgb.DMatrix(X_val, label=y_val), 'VALID')],
                early_stopping_rounds=EARLY_STOPPING_ROUNDS, evals_result=train_evals_result)

[0]	TRAIN-mlogloss:0.60930	TRAIN-merror:0.17547	VALID-mlogloss:0.61214	VALID-merror:0.18630
[1]	TRAIN-mlogloss:0.55197	TRAIN-merror:0.16310	VALID-mlogloss:0.55677	VALID-merror:0.17088
[2]	TRAIN-mlogloss:0.51090	TRAIN-merror:0.15806	VALID-mlogloss:0.51860	VALID-merror:0.16917
[3]	TRAIN-mlogloss:0.47883	TRAIN-merror:0.14644	VALID-mlogloss:0.48810	VALID-merror:0.15696
[4]	TRAIN-mlogloss:0.45021	TRAIN-merror:0.14602	VALID-mlogloss:0.46212	VALID-merror:0.15717
[5]	TRAIN-mlogloss:0.42752	TRAIN-merror:0.13884	VALID-mlogloss:0.44208	VALID-merror:0.15589
[6]	TRAIN-mlogloss:0.40631	TRAIN-merror:0.13648	VALID-mlogloss:0.42350	VALID-merror:0.14946
[7]	TRAIN-mlogloss:0.39002	TRAIN-merror:0.13327	VALID-mlogloss:0.40981	VALID-merror:0.15075
[8]	TRAIN-mlogloss:0.37557	TRAIN-merror:0.13199	VALID-mlogloss:0.39721	VALID-merror:0.15011
[9]	TRAIN-mlogloss:0.36395	TRAIN-merror:0.12824	VALID-mlogloss:0.38717	VALID-merror:0.14540
[10]	TRAIN-mlogloss:0.35397	TRAIN-merror:0.12642	VALID-mlogloss:0.37897	VALID-me

In [46]:
test_data = pd.read_csv('../Data/test.csv')

X_test = tfidf_vectorizer.transform(test_data.text)
y_test = test_data.label

predicted = clf.predict(xgb.DMatrix(X_test, label=y_test))
print(f'Accuracy: {np.mean(predicted == y_test) * 100}%')

Accuracy: 88.3011583011583%
