In [15]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn import svm
import seaborn as sns
import scikitplot as skplt
import joblib
from scipy.stats import uniform as sp_rand

In [2]:
cnb = joblib.load('../models/cnb.pkl')

In [5]:
df = pd.read_csv('../data/The-Office-Lines-V3.csv', encoding='latin-1')

In [6]:
df['line'] = df['line'].str.lower()
speakers = []
for i in df.speaker.value_counts().head(3).index:
    speakers.append(i)

In [7]:
df = df[df.speaker.str.match('|'.join(speakers))]

In [10]:
X = df.line
y = df.speaker
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(X_train)

In [12]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [13]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [16]:
param_grid = {'alpha': sp_rand()}

In [26]:
rsearch = RandomizedSearchCV(estimator=cnb, param_distributions=param_grid, n_iter=1000)
search = rsearch.fit(X_train_tfidf, y_train)



In [27]:
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

0.4566098195704047
0.9994217547431267
