# naive bayes classifier model

naive Bayes text classifier, intended as a faster alternative to the gradient boosting classifier

- assume training data in filepath
    + `data/processed/train.csv`
- assume testing data in filepath
    + `data/processed/test.csv`
- assume both have same structure:
    + first column (after the unnamed row index) is query
    + second column is category
- assume input strings are 'clean':
    + in lower case
    + punctuation removed (stop words included)
    + words separated by spaces (no padding)
- output:
    + `models/naivebayes.pckl`

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn.pipeline import make_pipeline

# input: the pre-split training / testing CSV files (relative paths)
split_training_data_filepath, split_testing_data_filepath = (
    '../data/processed/train.csv',
    '../data/processed/test.csv',
)
# output: where the pickled classifier is written
model_filepath = '../models/naivebayes.pckl'

In [2]:
# Load the training split only; the first CSV column is used as the row index.
# The class label is stored as a pandas Categorical so that it is treated as a
# nominal label rather than as a numerical / ordinal quantity.
df_train = (
    pd.read_csv(split_training_data_filepath, index_col=0)
      .assign(category=lambda frame: pd.Categorical(frame['category']))
)
# quick visual sanity check on three random rows
display(df_train.sample(n=3))

Unnamed: 0,query,category
203039,prosecco,37
243065,wooden curtain pole,318
301292,morley fire alarm,180


In [3]:
# transform the text field into TF-IDF features
tfidf = TfidfVectorizer(
#     stop_words=stop_words,
    strip_accents='ascii',
    ngram_range=(1, 1),  # consider unigrams/bigrams/trigrams?
    min_df=10,           # drop terms that appear in fewer than 10 queries
    max_df=0.80,         # drop terms that appear in more than 80% of the queries
    binary=True,         # count term occurrence in each query only once
)
# column transformer
# The column is selected with a scalar string (not a list) so that the
# vectoriser receives a 1-D column of strings; a list selector would hand it a
# 2-D frame, which TfidfVectorizer cannot tokenise.
all_transforms = make_column_transformer(
    (tfidf, 'query')
)

# classify using multinomial naive Bayes:
# TF-IDF weights are non-negative fractional values, which MultinomialNB handles
# and which it can consume as a sparse matrix. CategoricalNB expects
# integer-encoded categorical features and rejects sparse input, so it is not a
# fit for these features. The variable name is kept for the cells below.
clf_cnb = MultinomialNB()
# `pipe` expects a DataFrame that contains a `query` column
pipe = make_pipeline(all_transforms, clf_cnb)

In [4]:
%%time
# Cross-validation is currently disabled: every call below is commented out, so
# this cell only times an empty body.
# NOTE(review): `clf_svc` is not defined anywhere in the visible notebook and `X`
# is only created in a later cell -- presumably leftovers from another
# experiment; confirm before re-enabling any of these lines.
# scores = cross_val_score(pipe, df_train['query'], df_train['category'], cv=10, scoring='accuracy')
# scores = cross_val_score(clf_svc, X, y, cv=5, scoring='f1_macro')
# scores = cross_val_score(clf_svc, X, df_train['category'], cv=2, scoring='accuracy')
# cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
# print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs


In [5]:
%%time
X = tfidf.fit_transform(df_train['query'])
display(X.shape)
X

(485458, 12651)

CPU times: user 2.43 s, sys: 34.1 ms, total: 2.47 s
Wall time: 2.46 s


<485458x12651 sparse matrix of type '<class 'numpy.float64'>'
	with 1385122 stored elements in Compressed Sparse Row format>

In [None]:
%%time
# train model
clf_cnb.fit(X.toarray(), df_train['category'])
with open(model_filepath, 'wb') as filepath:
    pickle.dump(clf_cnb, filepath)
print('trained gbm model and exported to', model_filepath)

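Fitting `CategoricalNB` on the densified matrix leads to time-out errors (note that the dense `X.toarray()` matrix of 485458 × 12651 float64 values needs roughly 49 GB of memory).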

In [None]:
%%time
# check model
df_test = pd.read_csv(split_testing_data_filepath, index_col=0)
X_test = tfidf.transform(df_test['query'])
print('read in test data', X_test.shape)
y_predicted = clf_cnb.predict(X_test)
print('computed', len(y_predicted), 'predictions of test data')
print('number of correct test predictions:', sum(y_predicted == df_test['category']))
print('number of incorrect predictions:', sum(y_predicted != df_test['category']))
print('ratio of correct test predictions:', round(sum(y_predicted == df_test['category'])/len(df_test),3))
print('')