In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
TRAINING_SAMPLES = 15000
VALIDATION_SAMPLES = 6000
MAX_WORDS = 10000

In [3]:
train_data = pd.read_csv('../Data/train.csv')

nan_idx = train_data[pd.isnull(train_data.text)].index.tolist()
train_data.loc[nan_idx, 'text'] = ' '

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_WORDS)
tfidf_vectorizer.fit(train_data.text)

data = tfidf_vectorizer.transform(train_data.text)
labels = train_data.label

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(data, labels, test_size=0.2, shuffle=True)

In [6]:
import xgboost as xgb

EARLY_STOPPING_ROUNDS = 50
MAX_XGBOOST_ROUND = 1500
NUMBER_OF_CLASSES = 2

xgb_params = {'max_depth': 5,  # the maximum depth of each tree
              'eta': 0.2,  # the training step for each iteration
              'objective': 'multi:softmax',
              'num_class': NUMBER_OF_CLASSES,
              'min_child_weight': 2,
              'subsample': 1,
              'colsample_bytree': 0.5,
              'eval_metric': ['mlogloss', 'merror']}

train_evals_result = {}
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
trained_model = xgb.train(xgb_params, dtrain, num_boost_round=MAX_XGBOOST_ROUND,
                          evals=[(dtrain, 'TRAIN'), (dval, 'VALID')],
                          early_stopping_rounds=EARLY_STOPPING_ROUNDS, evals_result=train_evals_result,
                          verbose_eval=False)

.02209	TRAIN-merror:0.00102	VALID-mlogloss:0.07657	VALID-merror:0.02740
[224]	TRAIN-mlogloss:0.02179	TRAIN-merror:0.00096	VALID-mlogloss:0.07655	VALID-merror:0.02668
[225]	TRAIN-mlogloss:0.02164	TRAIN-merror:0.00090	VALID-mlogloss:0.07651	VALID-merror:0.02740
[226]	TRAIN-mlogloss:0.02154	TRAIN-merror:0.00090	VALID-mlogloss:0.07641	VALID-merror:0.02716
[227]	TRAIN-mlogloss:0.02141	TRAIN-merror:0.00090	VALID-mlogloss:0.07626	VALID-merror:0.02813
[228]	TRAIN-mlogloss:0.02130	TRAIN-merror:0.00090	VALID-mlogloss:0.07614	VALID-merror:0.02813
[229]	TRAIN-mlogloss:0.02110	TRAIN-merror:0.00084	VALID-mlogloss:0.07585	VALID-merror:0.02837
[230]	TRAIN-mlogloss:0.02096	TRAIN-merror:0.00078	VALID-mlogloss:0.07579	VALID-merror:0.02813
[231]	TRAIN-mlogloss:0.02076	TRAIN-merror:0.00078	VALID-mlogloss:0.07561	VALID-merror:0.02789
[232]	TRAIN-mlogloss:0.02061	TRAIN-merror:0.00084	VALID-mlogloss:0.07546	VALID-merror:0.02740
[233]	TRAIN-mlogloss:0.02045	TRAIN-merror:0.00078	VALID-mlogloss:0.07544	VALID-mer

In [7]:
test_data = pd.read_csv('../Data/test.csv')

nan_idx = test_data[pd.isnull(test_data.text)].index.tolist()
test_data.loc[nan_idx, 'text'] = ' '

x_test = tfidf_vectorizer.transform(test_data.text)
y_test = test_data.label

predicted = trained_model.predict(xgb.DMatrix(x_test, label=y_test))

print(f'Accuracy: {np.mean(predicted == y_test) * 100}%')

Accuracy: 63.57692307692307%
