In [1]:
import pandas as pd
import numpy as np

from joblib import dump, load

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv("../data/clean_complaints.csv")

In [3]:
df['complaint_text'].value_counts()
print(df['complaint_text'].isnull().sum()) 
df = df.dropna(subset=['complaint_text'])

1


In [4]:
X = df[["complaint_text"]]
y = df["Issue"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [5]:
vect = CountVectorizer()

X_train_vec = vect.fit_transform(X_train['complaint_text'])
X_test_vec = vect.transform(X_test['complaint_text'])

In [6]:
nb = MultinomialNB().fit(X_train_vec, y_train)

y_pred = nb.predict(X_test_vec)

In [7]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8201294732791597
[[11655  2015   469  3834   318]
 [  595  4493    46    94    83]
 [   60    55  2814   112    46]
 [ 4788   807   762 49276  1693]
 [   33    39     8    36  4227]]


In [8]:
vect = CountVectorizer()
clf = MultinomialNB()

pipe = Pipeline([("vect", vect), ("clf", clf)])

param_grid = {
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

In [9]:
# rs = RandomizedSearchCV(estimator = pipe, param_distributions = param_grid, verbose = 2, n_jobs = -1)
# rs.fit(X_train['complaint_text'], y_train)


In [10]:
print(rs.best_params_)
print(rs.best_score_)

NameError: name 'rs' is not defined

In [None]:
y_pred = rs.best_estimator_.predict(X_test['complaint_text'])

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8673351592385522
[[13434   276    21  4432   128]
 [ 2139  2861     6   189   116]
 [  468    13  2232   299    75]
 [ 2867     7    16 53962   474]
 [   74     6     1   115  4147]]


In [None]:
vect1 = CountVectorizer()
vect2 = TfidfVectorizer()
clf = MultinomialNB()

pipe = Pipeline([("vect", vect1), ("clf", clf)])

param_grid = {
    'vect': [vect1, vect2],
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

In [None]:
rs = RandomizedSearchCV(estimator = pipe, param_distributions = param_grid, verbose = 2, n_jobs = -1)
rs.fit(X_train['complaint_text'], y_train)

In [None]:
print(rs.best_params_)
print(rs.best_score_)

{'vect__ngram_range': (1, 3), 'vect__min_df': 10, 'vect': TfidfVectorizer(), 'clf__fit_prior': False}
0.8517993099973937


In [None]:
y_pred = rs.best_estimator_.predict(X_test['complaint_text'])

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8506530251929649
[[14624   381    27  3150   109]
 [ 1869  3270     6   104    62]
 [  673    22  2170   178    44]
 [ 5654    26    14 51012   620]
 [  160    19     2    76  4086]]
