# histogram gradient boosting classifier model

faster implementation of gradient boosting classifier

- assume training data in filepath
    + `data/processed/train.csv`
- assume testing data in filepath
    + `data/processed/test.csv`
- assume both have same structure:
    + first column is the row index (read with `index_col=0`)
    + second column is query
    + third column is category
- assume input strings are 'clean':
    + in lower case
    + punctuation removed (stop words are retained)
    + words separated by spaces (no padding)
- output:
    + `models/histogramgradientbosting.pckl`

In [1]:
import numpy as np
import pandas as pd
import pickle
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier # sklearn 0.21+
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC #(setting multi_class=”crammer_singer”)

# Input/output locations. The '../' prefix makes them relative to the current
# working directory (presumably the folder that holds this notebook).
split_training_data_filepath = '../data/processed/train.csv'
split_testing_data_filepath  = '../data/processed/test.csv'
# NOTE(review): no cell visible here writes to model_filepath, and the file name
# spells "bosting" -- confirm the intended name before a model is saved to it.
model_filepath = '../models/histogramgradientbosting.pckl'

The scikit-learn version is 0.23.2.


In [2]:
# Load the training split only; the first CSV column is used as the row index.
# The category ids are class labels, not quantities, so the column is stored as
# a categorical rather than left as a plain number.
df_train = (
    pd.read_csv(split_training_data_filepath, index_col=0)
    .astype({'category': 'category'})
)
display(df_train.sample(3))

Unnamed: 0,query,category
385863,victorian fabric,1072
602376,career change at 50 uk,1297
523565,old mobile homes for sale,915


In [3]:
# Text features: binary TF-IDF over the words of each query.
# No stop-word filtering is applied (stop_words is left at its default).
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    ngram_range=(1, 1),  # unigrams only; widen to (1, 2) or (1, 3) for bi-/trigrams
    min_df=4,            # drop terms that occur in fewer than 4 queries
    max_df=0.80,         # drop terms that occur in more than 80% of queries
    binary=True,         # count term occurrence in each query only once
)

# Column transformer.
# - The selector must be the scalar 'query' (not the list ['query']): a scalar
#   hands the vectoriser a 1-D sequence of strings, whereas a list hands it a
#   one-column 2-D frame, which TfidfVectorizer cannot tokenise.
# - sparse_threshold=0 forces dense output, because
#   HistGradientBoostingClassifier does not accept sparse matrices.
#   The dense matrix is large: fit `pipe` on a subsample if memory is limited.
all_transforms = make_column_transformer(
    (tfidf, 'query'),
    sparse_threshold=0,
)

# Classify with histogram-based gradient boosting.
clf_hgbc = HistGradientBoostingClassifier(
    max_depth=8,
    max_iter=20,
    tol=1e-4,
    random_state=0,  # fixed seed so binning subsample / early-stopping split are repeatable
)
pipe = make_pipeline(all_transforms, clf_hgbc)


In [4]:
%%time
# Cross-validation is currently disabled: every statement below is commented
# out, so this cell only times an empty body.
# NOTE(review): `clf_svc` is not defined in any cell visible here -- presumably
# left over from an SVM variant of this notebook; confirm before re-enabling.
# scores = cross_val_score(pipe, df_train['query'], df_train['category'], cv=10, scoring='accuracy')
# scores = cross_val_score(clf_svc, X, y, cv=5, scoring='f1_macro')
# scores = cross_val_score(clf_svc, X, df_train['category'], cv=2, scoring='accuracy')
# cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
# print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [5]:
%%time
# Fit the TF-IDF vocabulary on the training queries and convert the sparse
# result to a dense array.
# NOTE: the dense array stores every zero explicitly. For the (485458, 22367)
# shape recorded from a previous run that is roughly 87 GB, assuming the
# vectoriser's default float64 dtype.
X = tfidf.fit_transform(df_train['query']).toarray()
display(X.shape)
X

(485458, 22367)

CPU times: user 3.31 s, sys: 1.12 s, total: 4.43 s
Wall time: 4.43 s


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
%%time
# train model
# Fits the classifier directly on the dense matrix X and the categorical label
# column; the `pipe` object is not used by this cell.
clf_hgbc.fit(X, df_train['category'])


Training on the dense matrix above leads to out-of-memory errors.

In [None]:
%%time
# check model
df_test = pd.read_csv(split_testing_data_filepath, index_col=0)
X_test = tfidf.transform(df_test['query'])
print('read in test data', X_test.shape)
y_predicted = clf_hgbc.predict(X_test)
print('computed', len(y_predicted), 'predictions of test data')

In [None]:
# Compare predictions with the true labels once, element-wise, and reuse the
# result for every report line. The vectorised .sum() avoids iterating the
# Series in Python three separate times.
is_correct = (y_predicted == df_test['category'])
n_correct = is_correct.sum()
n_incorrect = (~is_correct).sum()

print('number of correct test predictions:', n_correct)
print('number of incorrect predictions:', n_incorrect)
print('ratio of correct test predictions:', round(n_correct/len(df_test),3))
print('')