In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from utils import *

In [3]:
# Pull the first element of each row's first field as the model input and
# the row's second field as its label.
X_train = list(map(lambda row: row[0][0], train))
y_train = list(map(lambda row: row[1], train))

In [4]:
# Hold out part of the labelled data for evaluation; the fixed seed keeps the
# partition reproducible. NOTE: this rebinds X_train / y_train, so re-running
# the cell without re-running the previous one splits the already-split data.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    random_state=1234,
)
tuple(map(len, (X_train, X_test, y_train, y_test)))

(114359, 38120, 114359, 38120)

In [5]:
# Baseline: TF-IDF with default settings on the raw text, followed by a
# multinomial naive Bayes classifier with light smoothing.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

# `fit` returns the fitted pipeline, so `model` and `clf` are the same object.
model = clf.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Predicted labels for the held-out raw-text split.
y_pred = model.predict(X_test)

In [7]:
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[14903  1319   353   694   174]
 [ 2591  5522   570   242   153]
 [ 1329  1210  1861   143    74]
 [ 2211   477   134  1689    83]
 [  938   473    88   137   752]]


In [8]:
# Per-class precision / recall / F1 for the baseline model.
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.68      0.85      0.76     17443
          1       0.61      0.61      0.61      9078
          2       0.62      0.40      0.49      4617
          3       0.58      0.37      0.45      4594
          4       0.61      0.31      0.42      2388

avg / total       0.64      0.65      0.63     38120



In [9]:
# Tokenize every document, keeping each row's label next to its tokens.
# NOTE(review): `test_docs` is not referenced again in the visible cells.
train_docs = [
    (tokenize(sample[0][0]), sample[1])
    for sample in train
]
test_docs = [
    (tokenize(sample[0][0]), sample[1])
    for sample in test
]

In [10]:
# Re-join each token list into one space-separated string so it can be fed
# to TfidfVectorizer like ordinary text.
X_train_tokens = [' '.join(tokens) for tokens, _label in train_docs]

In [11]:
# Split the token strings together with their labels. Previously only the
# features were split here and the labels were taken from the earlier raw-text
# split, which stays aligned only as long as both splits use the same seed and
# the same number of rows. Passing both arrays removes that hidden coupling;
# with the same seed and row count the resulting y_train / y_test should match
# the ones produced by the raw-text split.
# If this cell is re-run after X_train_tokens has already been replaced by its
# training portion, the length mismatch makes train_test_split raise instead
# of silently re-splitting.
X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(
    X_train_tokens,
    [row[1] for row in train_docs],
    random_state=1234,
)

In [12]:
clf2 = Pipeline([
    ('vect', TfidfVectorizer(min_df=10, ngram_range=(1, 3))), 
    ('clf', MultinomialNB(alpha=0.001)),    
])

model2 = clf.fit(X_train_tokens, y_train)
model2

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])

In [13]:
y_pred2 = model.predict(X_test_tokens)

In [14]:
# Rows are true classes, columns are predicted classes (tokenized-text model).
print(confusion_matrix(y_true=y_test, y_pred=y_pred2))

[[15893   945   202   329    74]
 [ 2377  6127   356   137    81]
 [ 1339  1143  2030    74    31]
 [ 2514   364    66  1614    36]
 [ 1069   452    67    72   728]]


In [15]:
# Per-class precision / recall / F1 for the tokenized-text model.
print(classification_report(y_true=y_test, y_pred=y_pred2))

             precision    recall  f1-score   support

          0       0.69      0.91      0.78     17443
          1       0.68      0.67      0.68      9078
          2       0.75      0.44      0.55      4617
          3       0.73      0.35      0.47      4594
          4       0.77      0.30      0.44      2388

avg / total       0.70      0.69      0.67     38120



### After filtering

In [2]:
# Load the labelled training rows and the test rows, in that order.
train3, test3 = map(
    load_pickle,
    ('../train_labeled_0502.pickle', '../test_0502.pickle'),
)

In [3]:
%%time
train_docs3 = [(tokenize(row[0][0]), row[1]) for row in train3]
test_docs3 = [(tokenize(row[0])) for row in test3]

CPU times: user 3min 1s, sys: 852 ms, total: 3min 2s
Wall time: 2min 50s


In [4]:
# Re-join token lists into space-separated strings and pull out the labels.
X_train3 = [' '.join(tokens) for tokens, _label in train_docs3]
# `test3` is rebound here: from the loaded rows to their joined token strings.
test3 = [' '.join(tokens) for tokens in test_docs3]
y_train3 = [label for _tokens, label in train_docs3]

In [5]:
# Hold out part of the filtered data for evaluation (fixed seed for a
# reproducible partition). NOTE: this rebinds X_train3 / y_train3, so
# re-running the cell on its own splits the already-split data again.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_train3,
    y_train3,
    random_state=1234,
)
tuple(map(len, (X_train3, X_test3, y_train3, y_test3)))

(112314, 37438, 112314, 37438)

In [24]:
# Filtered-data model: 1- to 3-gram TF-IDF limited to terms appearing in at
# least 10 documents, followed by multinomial naive Bayes with alpha=0.001.
clf3 = Pipeline(steps=[
    ('vect', TfidfVectorizer(min_df=10, ngram_range=(1, 3))),
    ('clf', MultinomialNB(alpha=0.001)),
])

# `fit` returns the fitted pipeline, so `model3` and `clf3` are the same object.
model3 = clf3.fit(X_train3, y_train3)
model3

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])

In [25]:
# Predicted labels for the held-out part of the filtered data.
y_pred3 = model3.predict(X=X_test3)

In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# Rows are true classes, columns are predicted classes (filtered-data model).
print(confusion_matrix(y_true=y_test3, y_pred=y_pred3))

[[14768   848   271   693   160]
 [ 1235  7048   396   222   171]
 [  597   907  2953   112    48]
 [ 1183   411   123  2876    56]
 [  464   391    82   119  1304]]


In [27]:
# Per-class precision / recall / F1 for the filtered-data model.
print(classification_report(y_true=y_test3, y_pred=y_pred3))

             precision    recall  f1-score   support

          0       0.81      0.88      0.84     16740
          1       0.73      0.78      0.75      9072
          2       0.77      0.64      0.70      4617
          3       0.72      0.62      0.66      4649
          4       0.75      0.55      0.64      2360

avg / total       0.77      0.77      0.77     37438



In [28]:
# Persist the fitted filtered-data pipeline (vectorizer + classifier) to disk.
# NOTE(review): the file name carries the tag 0501 while the input pickles are
# tagged 0502 -- confirm this is intentional and not a stale name.
save_pickle('../model/mnb_0501.pickle', model3)