In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
TRAINING_SAMPLES = 15000
VALIDATION_SAMPLES = 6000
MAX_WORDS = 10000

In [4]:
train_data = pd.read_csv('../Data/train.csv')

nan_idx = train_data[pd.isnull(train_data.text)].index.tolist()
train_data.loc[nan_idx, 'text'] = ' '

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_WORDS)
tfidf_vectorizer.fit(train_data.text)

x_train = tfidf_vectorizer.transform(train_data.text[:TRAINING_SAMPLES])
y_train = train_data.label[:TRAINING_SAMPLES]
x_val = tfidf_vectorizer.transform(train_data.text[TRAINING_SAMPLES : TRAINING_SAMPLES + VALIDATION_SAMPLES])
y_val = train_data.label[TRAINING_SAMPLES : TRAINING_SAMPLES + VALIDATION_SAMPLES]

In [6]:
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)

x_train = x_train[indices]
y_train = y_train[indices]

In [9]:
xgb = XGBClassifier()

xgb.fit(x_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
test_data = pd.read_csv('../Data/test.csv')

nan_idx = test_data[pd.isnull(test_data.text)].index.tolist()
test_data.loc[nan_idx, 'text'] = ' '

x_test = tfidf_vectorizer.transform(test_data.text)
y_test = test_data.label

predicted = xgb.predict(x_test)
print(f'Accuracy: {np.mean(predicted == y_test) * 100}%')

Accuracy: 63.653846153846146%
