# XGBoost Model using sentence-BERT

In [1]:
# Enable IPython's autoreload extension in mode 2: modules (e.g. the local `src`
# package imported below) are reloaded before each cell executes, so edits made to
# them on disk are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import seaborn as sns

from xgboost import XGBClassifier

import sys
sys.path.append("../")

from src.data import get_data
from src.preprocessing import TextToFeatures
from src.tuning import parameter_search

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/neperiana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/neperiana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Load the data through the project helper. Judging by how the results are used
# further down: X is indexed by a 'clean_text' column, y holds the labels (compared
# against 0 when computing the class ratio), and X_sub is the submission set with
# 'id' and 'clean_text' columns.
# NOTE(review): exact types/schema are defined in src.data.get_data — confirm there.
X, y, X_sub = get_data()

In [5]:
# Hyper-parameter grid for the XGBoost classifier:
# 4 x 4 x 2 = 32 candidate combinations per fold.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 7, 12, 20],
    colsample_bytree=[0.75, 1.0],
)

# Evaluate every candidate with the project's search helper, using 3 splits and
# sentence-BERT text features; the returned table is inspected in the next cell.
results = parameter_search(
    X=X,
    y=y,
    model_class=XGBClassifier,
    param_grid=param_grid,
    vect_type='sentence-BERT',
    n_splits=3,
)

Searching best params for XGBClassifier...
No folds = 3

Fold 1/3
Searching across 32 candidates
................................
Fold 2/3
Searching across 32 candidates
................................
Fold 3/3
Searching across 32 candidates
................................

In [6]:
# Ten best candidates, ranked by F1 score (highest first).
results.sort_values('f1', ascending=False).head(10)

Unnamed: 0,n_estimators,max_depth,colsample_bytree,tn,tp,f1,auc
26,400,7,0.75,0.838789,0.670457,0.710343,0.754623
1,100,3,1.0,0.791115,0.702857,0.70925,0.746986
19,300,7,1.0,0.841323,0.665868,0.708916,0.753596
29,400,12,1.0,0.844776,0.663116,0.708564,0.753946
16,300,3,0.75,0.808618,0.68941,0.708505,0.749014
21,300,12,1.0,0.846848,0.661281,0.708344,0.754065
10,200,7,0.75,0.837408,0.668318,0.708245,0.752863
18,300,7,0.75,0.838098,0.667095,0.707659,0.752596
27,400,7,1.0,0.840402,0.664034,0.707174,0.752218
13,200,12,1.0,0.846619,0.659141,0.706815,0.75288


In [8]:
# Params
n_estimators = 400
max_depth = 7
colsample_bytree = .75

# Train with whole model
scale_pos_weight = sum(np.where(y==0, 1, 0)) / sum(y)

# Extract text features
vectoriser = TextToFeatures(type='sentence-BERT')
X_dtm = vectoriser.fit_transform(X['clean_text'])
X_sub_dtm = vectoriser.fit_transform(X_sub['clean_text'])

# Train model
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    colsample_bytree=colsample_bytree,
    scale_pos_weight=scale_pos_weight,
)
clf.fit(X_dtm, y)

# Get predictions
y_sub_pred = clf.predict(X_sub_dtm)
print('Proportion of target:', y_sub_pred.sum()/len(y_sub_pred))

Proportion of target: 0.3797119215445909


In [9]:
# Attach the predicted labels to the submission frame
X_sub['target'] = y_sub_pred

# Keep only the id and prediction columns and write them out without the index
submission_set = X_sub.loc[:, ['id', 'target']]
submission_set.to_csv(
    path_or_buf='../data/submissions/04-xgboost-model-BERT.csv',
    index=False,
)

In [10]:
# Kaggle CLI command that uploads the CSV written in the previous cell.
# Presumably left commented out so that re-running the notebook does not submit again;
# the output recorded under this cell shows a run in which the command was active.
# !kaggle competitions submit -c nlp-getting-started -f ../data/submissions/04-xgboost-model-BERT.csv -m "xgboost BERT"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████████████| 22.2k/22.2k [00:01<00:00, 17.3kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets