In [24]:
import pandas as pd

In [25]:
df = pd.read_csv('simpsons.csv')

In [26]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [33]:
df = df.loc[df['raw_character_text'].isin(['Lisa Simpson', 'Bart Simpson'])]
df.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [34]:
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object
 
text = df['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode

vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
vect
feature_names = vect.get_feature_names() #Get the words from the vocabulary
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [38]:
docu_feat = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(docu_feat[0:500,0:500]) #Let's print a little part of the matrix: the first 50 words & documents

(24, 424)	1
  (40, 325)	1
  (45, 266)	1
  (63, 269)	1
  (74, 356)	1
  (80, 264)	1
  (82, 304)	1
  (98, 192)	1
  (100, 396)	1
  (151, 328)	1
  (156, 325)	1
  (157, 451)	1
  (163, 325)	1
  (164, 325)	1
  (186, 461)	1
  (207, 325)	1
  (210, 397)	1
  (231, 270)	1
  (237, 404)	1
  (259, 325)	1
  (287, 325)	1
  (294, 493)	1
  (295, 163)	1
  (318, 300)	1
  (321, 281)	1
  (356, 450)	1
  (358, 397)	1
  (362, 449)	1
  (366, 24)	1
  (366, 449)	1
  (386, 129)	1
  (387, 325)	1
  (388, 70)	1
  (394, 38)	1
  (394, 91)	1
  (396, 446)	1
  (398, 126)	1
  (410, 52)	1
  (410, 319)	1
  (410, 343)	1
  (413, 449)	1
  (419, 196)	1
  (428, 360)	1
  (464, 304)	1


In [39]:
#Make a regular matrix out of docu_feat, make it into a DataFrame and concatenate it along the columns
rev_words = pd.concat([df, pd.DataFrame(docu_feat.toarray())], axis=1)
rev_words.head(1)

Unnamed: 0,raw_character_text,spoken_words,0,1,2,3,4,5,6,7,...,14248,14249,14250,14251,14252,14253,14254,14255,14256,14257
0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
#Relabeling the columns. feature_names contains the words in the text. I've used the v (variable) + underscore to distinguish from the words like 'hotel' in the text
#Hard-coding the names like this is not really good practice (better would be some operation on the dataframe), but it's a lot clearer.

rev_words.columns = ['v_raw_character_text', 'v_spoken_words'] + feature_names
rev_words.head()

Unnamed: 0,v_raw_character_text,v_spoken_words,000,007,10,1000,10201,108,1094,11,...,zork,zorrinid,zuckerberg,zuh,zumanity,zur,zz,zzzapp,ãªtre,ãºna
0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Lisa Simpson,Where's Mr. Bergstrom?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Lisa Simpson,That life is worth living.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

#Setting up the data and model
nb = MultinomialNB()
X = docu_feat #selecting the variables to go into my X matrix
y = df['raw_character_text'] #creating the y vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #split the data and store it

nb = nb.fit(X_train, y_train)
nb.score(X_test, y_test)



0.6365676567656766

In [46]:
df['raw_character_text'].value_counts(normalize=True)

Bart Simpson    0.544954
Lisa Simpson    0.455046
Name: raw_character_text, dtype: float64

In [47]:
from sklearn.metrics import confusion_matrix
y_test_pred = nb.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_test_pred) #creates a "confusion matrix"
cm

array([[3261,  820],
       [1933, 1561]], dtype=int64)

In [49]:
#In order to read it easily , let's make a dataframe out of it, and add labels to it.
conf_matrix = pd.DataFrame(cm, index=['Bart', 'Lisa'], columns = ['Bart_p', 'Lisa_p']) 
conf_matrix

Unnamed: 0,Bart_p,Lisa_p
Bart,3261,820
Lisa,1933,1561


In [51]:
# calculating accuracy, recall and precision

from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))

precision    recall  f1-score   support

Bart Simpson       0.63      0.80      0.70      4081
Lisa Simpson       0.66      0.45      0.53      3494

   micro avg       0.64      0.64      0.64      7575
   macro avg       0.64      0.62      0.62      7575
weighted avg       0.64      0.64      0.62      7575



In [65]:
nb.predict_proba(X[:1, :])

array([[0.02580694, 0.97419306]])

In [67]:
for i in range(1, 28):
    print('sentence', i, df.spoken_words.iloc[i])
    print(clf.predict_proba(X[i]))
    
    i+1 

sentence 1 That life is worth living.
[[0.69449341 0.30550659]]
sentence 2 Victory party under the slide!
[[0.76243062 0.23756938]]
sentence 3 Mr. Bergstrom! Mr. Bergstrom!
[[4.27957359e-04 9.99572043e-01]]
sentence 4 Do you know where I could find him?
[[0.55340771 0.44659229]]
sentence 5 The train, how like him... traditional, yet environmentally sound.
[[0.02857325 0.97142675]]
sentence 6 I see he touched you, too.
[[0.52425227 0.47574773]]
sentence 7 Hey, thanks for your vote, man.
[[0.9501968 0.0498032]]
sentence 8 Well, you got that right. Thanks for your vote, girls.
[[0.78929503 0.21070497]]
sentence 9 Well, don't sweat it. Just so long as a couple of people did... right, Milhouse?
[[0.58792341 0.41207659]]
sentence 10 Lewis?
[[0.81508251 0.18491749]]
sentence 11 Somebody must have voted.
[[0.35143517 0.64856483]]
sentence 12 Uh oh.
[[0.71282318 0.28717682]]
sentence 13 nan
[[0.52988561 0.47011439]]
sentence 14 I demand a recount.
[[0.85777683 0.14222317]]
sentence 15 No.
[[0.5