# XGBoost Model using sentence-BERT plus keyword vectorisation

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import seaborn as sns

from xgboost import XGBClassifier

import sys
sys.path.append("../")

from src.data import get_data
from src.preprocessing import TextToFeatures
from src.tuning import parameter_search

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/neperiana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/neperiana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Load the data through the project helper.
#   X     - training frame; the 'clean_text' and 'keyword' columns are vectorised below
#   y     - training target passed to clf.fit (presumably binary 0/1 - TODO confirm in src.data)
#   X_sub - frame the final model predicts on; its 'id' column goes into the submission file
X, y, X_sub = get_data()

In [10]:
# Hyperparameter grid for the search: 3 x 3 x 2 = 18 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 7, 12],
    colsample_bytree=[0.75, 1.0],
)

# Search the grid for XGBClassifier on sentence-BERT features, using 3 splits.
results = parameter_search(
    X=X,
    y=y,
    model_class=XGBClassifier,
    param_grid=param_grid,
    vect_type='sentence-BERT',
    n_splits=3,
)

Searching best params for XGBClassifier...
No folds = 3

Fold 1/3
Searching across 18 candidates
..................
Fold 2/3
Searching across 18 candidates
..................
Fold 3/3
Searching across 18 candidates
..................

In [11]:
# Show the ten candidates with the highest F1 score.
results.sort_values('f1', ascending=False).head(10)

Unnamed: 0,n_estimators,max_depth,colsample_bytree,tn,tp,f1,auc
17,300,12,1.0,0.843626,0.658836,0.705136,0.751231
14,300,7,0.75,0.836027,0.663118,0.703998,0.749572
8,200,7,0.75,0.835797,0.662812,0.703723,0.749304
16,300,12,0.75,0.843165,0.653027,0.700783,0.748096
11,200,12,1.0,0.840863,0.65425,0.700719,0.747556
15,300,7,1.0,0.835104,0.657,0.699764,0.746052
2,100,7,0.75,0.834646,0.656696,0.699339,0.745671
10,200,12,0.75,0.840862,0.650888,0.698136,0.745875
9,200,7,1.0,0.834183,0.651802,0.695644,0.742993
1,100,3,1.0,0.771314,0.6943,0.694322,0.732807


In [12]:
# Hyperparameters: the top row by F1 in the grid-search results shown above.
# NOTE(review): that search was run with vect_type='sentence-BERT' only; the
# keyword features added below do not appear to have been part of the tuning.
n_estimators = 300
max_depth = 12
colsample_bytree = 1.0

# Train on the whole training set. Weight the positive class by the
# negative/positive ratio to counter class imbalance (assumes y is a binary
# 0/1 label vector, so its sum is the number of positives).
scale_pos_weight = np.sum(y == 0) / np.sum(y)

# Extract text features - BERT.
# The vectoriser is fitted on the training text only and then applied to the
# submission text, so both matrices share the same feature space.
BERT_vectorizer = TextToFeatures(type='sentence-BERT')
X_dtm_a = BERT_vectorizer.fit_transform(X['clean_text']).to_numpy()
X_sub_dtm_a = BERT_vectorizer.transform(X_sub['clean_text']).to_numpy()

# Extract text features - keywords (count vectorisation, fitted on train only).
keyword_vectorizer = TextToFeatures(type='count-vec')
X_dtm_b = keyword_vectorizer.fit_transform(X['keyword'])
X_sub_dtm_b = keyword_vectorizer.transform(X_sub['keyword'])

# Join the two feature blocks column-wise.
X_dtm = np.concatenate([X_dtm_a, X_dtm_b], axis=1)
X_sub_dtm = np.concatenate([X_sub_dtm_a, X_sub_dtm_b], axis=1)

# The model can only score the submission set if both matrices have the same columns.
assert X_dtm.shape[1] == X_sub_dtm.shape[1], "train/submission feature width mismatch"

# Train model
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    colsample_bytree=colsample_bytree,
    scale_pos_weight=scale_pos_weight,
)
clf.fit(X_dtm, y)

# Get predictions and report the share of rows predicted as the positive class.
y_sub_pred = clf.predict(X_sub_dtm)
print('Proportion of target:', y_sub_pred.mean())

Proportion of target: 0.3748084584737971


In [13]:
# Build the submission frame from a fresh selection instead of adding a column
# to X_sub in place: the cell stays idempotent and X_sub keeps its loaded shape.
submission_set = X_sub[['id']].assign(target=y_sub_pred)
submission_set.to_csv('../data/submissions/05-xgboost-model-BERT-plus-keywords.csv', index=False)

In [14]:
# Upload the submission CSV to the 'nlp-getting-started' competition through the Kaggle CLI.
# The recorded output shows a huggingface/tokenizers warning about the process being
# forked by this shell call; per that warning, setting TOKENIZERS_PARALLELISM
# explicitly avoids it.
!kaggle competitions submit -c nlp-getting-started -f ../data/submissions/05-xgboost-model-BERT-plus-keywords.csv -m "xgboost BERT plus keywords"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████████████| 22.2k/22.2k [00:01<00:00, 16.1kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets