In [2]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.naive_bayes import MultinomialNB

In [3]:
data = pd.read_csv('simpsons.csv')
data = data.dropna()
data.head(5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
data_subset = data.loc[data["raw_character_text"].isin(["Lisa Simpson", "Bart Simpson"])]
data_subset.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [9]:
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object

text = data_subset['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode

vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text

feature_names = vect.get_feature_names() #Get the words from the vocabulary

In [10]:
docu_feat = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(docu_feat[0:500,0:500]) #Let's print a little part of the matrix: the first 50 words & documents

  (23, 424)	1
  (38, 325)	1
  (43, 266)	1
  (61, 269)	1
  (72, 356)	1
  (78, 264)	1
  (80, 304)	1
  (96, 192)	1
  (98, 396)	1
  (149, 328)	1
  (154, 325)	1
  (155, 451)	1
  (161, 325)	1
  (162, 325)	1
  (184, 461)	1
  (205, 325)	1
  (208, 397)	1
  (229, 270)	1
  (235, 404)	1
  (256, 325)	1
  (284, 325)	1
  (291, 493)	1
  (292, 163)	1
  (315, 300)	1
  (318, 281)	1
  (353, 450)	1
  (355, 397)	1
  (359, 449)	1
  (363, 24)	1
  (363, 449)	1
  (381, 129)	1
  (382, 325)	1
  (383, 70)	1
  (389, 38)	1
  (389, 91)	1
  (391, 446)	1
  (393, 126)	1
  (405, 52)	1
  (405, 319)	1
  (405, 343)	1
  (408, 449)	1
  (414, 196)	1
  (422, 360)	1
  (457, 304)	1


In [12]:
from sklearn.model_selection import train_test_split

y = data_subset['raw_character_text'] # defining the target variable (dependent variable) as y
X = docu_feat
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) #test_size=0.3 indicates the percentage of the data that should be held over for testing

In [21]:
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB()
print(clf.predict(X))

['Lisa Simpson' 'Bart Simpson' 'Bart Simpson' ... 'Lisa Simpson'
 'Lisa Simpson' 'Lisa Simpson']


In [24]:
from sklearn.metrics import confusion_matrix

y_test_pred = clf.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix" on the test set
cm

array([[3467,  466],
       [1064, 2129]])

In [27]:
y_test.value_counts()

Bart Simpson    3933
Lisa Simpson    3193
Name: raw_character_text, dtype: int64

In [25]:
conf_matrix = pd.DataFrame(cm, index=['Bart', 'Lisa'], columns = ['Bart_p', 'Lisa_p']) 
conf_matrix

Unnamed: 0,Bart_p,Lisa_p
Bart,3467,466
Lisa,1064,2129


In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

Bart Simpson       0.77      0.88      0.82      3933
Lisa Simpson       0.82      0.67      0.74      3193

    accuracy                           0.79      7126
   macro avg       0.79      0.77      0.78      7126
weighted avg       0.79      0.79      0.78      7126



The accuracy is 79%, which is not great considering there are only two categories. What is we guessed the same catagory all the time?

In [31]:
data_subset['raw_character_text'].value_counts(normalize=True)

Bart Simpson    0.547135
Lisa Simpson    0.452865
Name: raw_character_text, dtype: float64

So we're doing about 9,4% better than when we would guess Bart all the time. Not great but to be expected of short lines of dialogue. 