# XGBoost Model

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the project's own code (the `src` package imported below) take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [37]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import matplotlib.ticker as mtick
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

import sys
sys.path.append("../")

from src.data import get_data
from src.tuning import parameter_search

In [5]:
# Load the data through the project's get_data helper.
# NOTE(review): judging by later cells, X and X_sub look like DataFrames with a
# 'clean_text' column (X_sub also carries 'id'), and y holds the 0/1 target —
# confirm against src.data.
X, y, X_sub = get_data()

In [43]:
# Candidate values for the search: 3 * 4 * 4 * 2 = 96 combinations per fold.
# NOTE(review): 'max_features' is presumably the TF-IDF vocabulary size rather
# than an XGBoost setting (the final-training cell passes it to
# TfidfVectorizer) — confirm in src.tuning.
param_grid = dict(
    max_features=[400, 500, 600],
    n_estimators=[100, 200, 300, 400],
    max_depth=[7, 12, 20, 30],
    colsample_bytree=[0.75, 1.0],
)

# Evaluate every combination with the project's search helper using 3 splits;
# the returned table is inspected in the next cell.
results = parameter_search(
    X=X,
    y=y,
    model_class=XGBClassifier,
    param_grid=param_grid,
    n_splits=3,
)

Searching best params for XGBClassifier...
No folds = 3

Fold 1/3
Searching across 96 candidates
................................................................................................
Fold 2/3
Searching across 96 candidates
................................................................................................
Fold 3/3
Searching across 96 candidates
................................................................................................

In [45]:
# Display the ten parameter combinations with the highest F1 score.
results.sort_values('f1', ascending=False).head(10)

Unnamed: 0,max_features,n_estimators,max_depth,colsample_bytree,tn,tp,f1,auc
74,600,200,12,0.75,0.828198,0.501691,0.579893,0.664944
75,600,200,12,1.0,0.824742,0.501998,0.578393,0.66337
82,600,300,12,0.75,0.818294,0.503524,0.577016,0.660909
76,600,200,20,0.75,0.817604,0.503832,0.576857,0.660718
71,600,100,30,1.0,0.824513,0.499245,0.576229,0.661879
91,600,400,12,1.0,0.809772,0.505664,0.575092,0.657718
88,600,400,7,0.75,0.820828,0.499549,0.574952,0.660188
83,600,300,12,1.0,0.818754,0.500467,0.574824,0.65961
90,600,400,12,0.75,0.812997,0.501691,0.573237,0.657344
80,600,300,7,0.75,0.82935,0.492519,0.572903,0.660935


In [46]:
# Params
max_features = 600
n_estimators = 200
max_depth = 12
colsample_bytree = .75

# Train with whole model
scale_pos_weight = sum(np.where(y==0, 1, 0)) / sum(y)

# Extract text features
vectorizer = TfidfVectorizer(max_features=max_features) 
X_dtm = vectorizer.fit_transform(X['clean_text'])
X_sub_dtm = vectorizer.transform(X_sub['clean_text'])

# Train model
clf = XGBClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    colsample_bytree=colsample_bytree,
    scale_pos_weight=scale_pos_weight,
)
clf.fit(X_dtm.toarray(), y)

# Get predictions
y_sub_pred = clf.predict(X_sub_dtm.toarray())
print('Proportion of target:', y_sub_pred.sum()/len(y_sub_pred))

Proportion of target: 0.39166411277965063


In [47]:
# Attach the predictions to the submission frame, keep only the two columns
# written to the submission file, and save them without the index.
submission_path = '../data/submissions/03-xgboost-model.csv'
X_sub['target'] = y_sub_pred
submission_set = X_sub.loc[:, ['id', 'target']]
submission_set.to_csv(submission_path, index=False)

In [48]:
# Upload the CSV written above to the "nlp-getting-started" competition using
# the Kaggle command-line tool; -m sets the submission message.
# NOTE: this runs on every execution of the cell, so re-running the notebook
# sends another submission.
!kaggle competitions submit -c nlp-getting-started -f ../data/submissions/03-xgboost-model.csv -m "xgboost tfidf tuned even more"

100%|██████████████████████████████████████| 22.2k/22.2k [00:01<00:00, 13.8kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets