In [2]:
import seaborn as sns #this is the plotting library I'll be using 
import pandas as pd #"as pd" means that we can use the abbreviation in commands

from sklearn.model_selection import train_test_split
# Load the Simpsons dialogue data; the path is relative to the notebook's working directory.
df = pd.read_csv('../simpsons.csv')
df.head()  # preview the first rows (speaker in raw_character_text, line in spoken_words)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
# Keep only the lines spoken by Bart or Lisa; every other character is dropped.
df = df[df.raw_character_text.isin(["Bart Simpson", "Lisa Simpson"])]
print(df)

       raw_character_text                                       spoken_words
1            Lisa Simpson                             Where's Mr. Bergstrom?
3            Lisa Simpson                         That life is worth living.
7            Bart Simpson                     Victory party under the slide!
9            Lisa Simpson                      Mr. Bergstrom! Mr. Bergstrom!
11           Lisa Simpson                Do you know where I could find him?
...                   ...                                                ...
158299       Lisa Simpson                                  Can we have wine?
158301       Lisa Simpson                                   Can I have wine?
158303       Lisa Simpson                        Does Bart have to be there?
158305       Lisa Simpson                            Can we do it this week?
158307       Lisa Simpson  Mr. Bergstrom, we request the pleasure of your...

[25248 rows x 2 columns]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object

# Raw text of every line as a NumPy array of unicode strings (the vectorizer needs str input).
# NOTE(review): astype('U') turns missing values into the literal string 'nan' -- confirm
# that no empty spoken_words rows end up in the vocabulary this way.
text = df['spoken_words'].values.astype('U')

# Bag-of-words model with English stop words removed.
vect = CountVectorizer(stop_words='english')
vect = vect.fit(text)  # learn the vocabulary from the dialogue lines

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when it
# exists and fall back on older versions. list() keeps feature_names a plain list.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

# Build the document-term count matrix: one row per line of dialogue, one column per word.
docu_feat = vect.transform(text)

# Print the stored (non-zero) counts in the first 500 documents x first 500 vocabulary columns.
print(docu_feat[0:500,0:500])

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']
  (24, 424)	1
  (40, 325)	1
  (45, 266)	1
  (63, 269)	1
  (74, 356)	1
  (80, 264)	1
  (82, 304)	1
  (98, 192)	1
  (100, 396)	1
  (151, 328)	1
  (156, 325)	1
  (157, 451)	1
  (163, 325)	1
  (164, 325)	1
  (186, 461)	1
  (207, 325)	1
  (210, 397)	1
  (231, 270)	1
  (237, 404)	1
  (259, 325)	1
  (287, 325)	1
  (294, 493)	1
  (295, 163)	1
  (318, 300)	1
  (321, 281)	1
  (356, 450)	1
  (358, 397)	1
  (362, 449)	1
  (366, 24)	1
  (366, 449)	1
  (386, 129)	1
  (387, 325)	1
  (388, 70)	1
  (394, 38)	1
  (394, 91)	1
  (396, 446)	1
  (398, 126)	1
  (410, 52)	1
  (410, 319)	1
  (410, 343)	1
  (413, 449)	1
  (419, 196)	1
  (428, 360)	1
  (464, 304)	1


In [5]:
y = df['raw_character_text'] # Target labels: the character who speaks each line
X = docu_feat  # Features: the document-term count matrix produced by the CountVectorizer


In [6]:
# Hold out 30% of the rows as a test set; random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [7]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB() # clf = classifier; multinomial Naive Bayes over the word counts
clf.fit(X_train, y_train) # fitted on the training split only

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Sanity check on a single line: the predicted speaker, then the text at the same row position.
# X was built from df['spoken_words'] in row order, so X[2] and .iloc[2] refer to the same line.
print(clf.predict(X[2]))
print(df.spoken_words.iloc[2])

['Bart Simpson']
Victory party under the slide!


In [9]:
from sklearn.metrics import confusion_matrix
y_test_pred = clf.predict(X_test) # predicted speaker for every held-out line
cm = confusion_matrix(y_test, y_test_pred) # rows = true speaker, columns = predicted speaker
cm

array([[3271,  894],
       [1799, 1611]])

In [10]:
# Label order used by the classifier; the hand-written 'Bart'/'Lisa' labels placed on the
# confusion matrix rely on this order. (y_test.value_counts() also lists the labels.)
clf.classes_

array(['Bart Simpson', 'Lisa Simpson'], dtype='<U12')

In [11]:
# Wrap the raw confusion matrix in a labelled DataFrame so it is easy to read:
# rows are the true speakers, columns the predicted ones.
# The labels are derived from clf.classes_ instead of being typed by hand, so they follow
# the classifier's (sorted) class order, which is also the order confusion_matrix uses by default.
first_names = [name.split()[0] for name in clf.classes_]  # 'Bart Simpson' -> 'Bart'
conf_matrix = pd.DataFrame(
    cm,
    index=first_names,
    columns=[f'predicted {name}' for name in first_names],
)
conf_matrix

Unnamed: 0,predicted Bart,predicted Lisa
Bart,3271,894
Lisa,1799,1611


In [12]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

Bart Simpson       0.65      0.79      0.71      4165
Lisa Simpson       0.64      0.47      0.54      3410

    accuracy                           0.64      7575
   macro avg       0.64      0.63      0.63      7575
weighted avg       0.64      0.64      0.63      7575



In [13]:
# Share of lines per character: the accuracy we would get by always guessing the majority class.
# Note this is computed on the whole filtered df, not only on the test split.
df['raw_character_text'].value_counts(normalize=True)

Bart Simpson    0.544954
Lisa Simpson    0.455046
Name: raw_character_text, dtype: float64

So we're at about 64% accuracy versus roughly 54% if we guessed Bart all the time, i.e. about 10 percentage points better. Not great, but to be expected for short lines of dialogue.

In [14]:
# Class probabilities for the line at row position 2 (the same line inspected earlier).
# Columns follow clf.classes_, i.e. [P(Bart Simpson), P(Lisa Simpson)].
# NOTE: despite its name, y_test_prob is computed from the full matrix X, not from X_test.
y_test_prob = clf.predict_proba(X[2])
print(y_test_prob)

[[0.79366658 0.20633342]]


In [15]:
# Print lines of dialogue next to the class probabilities the model assigns to them.
# X covers the full data set, so about 70% of these rows were also used for training
# (test_size=0.3 in the split); treat the numbers as an illustration, not an evaluation.
for i in range(1, 28):
    y_test_prob = clf.predict_proba(X[i])  # shape (1, 2); columns follow clf.classes_
    print('sentence', i, df.spoken_words.iloc[i])
    print('Bart:', y_test_prob[0, 0], 'Lisa', y_test_prob[0, 1])

sentence 1 That life is worth living.
Bart: 0.676345532629102 Lisa 0.3236544673708974
sentence 2 Victory party under the slide!
Bart: 0.7936665779408832 Lisa 0.20633342205911578
sentence 3 Mr. Bergstrom! Mr. Bergstrom!
Bart: 0.0016738185124174146 Lisa 0.9983261814875828
sentence 4 Do you know where I could find him?
Bart: 0.5424412124555592 Lisa 0.4575587875444406
sentence 5 The train, how like him... traditional, yet environmentally sound.
Bart: 0.06930172690218564 Lisa 0.930698273097816
sentence 6 I see he touched you, too.
Bart: 0.42381872382568675 Lisa 0.5761812761743126
sentence 7 Hey, thanks for your vote, man.
Bart: 0.9536359347446102 Lisa 0.04636406525539032
sentence 8 Well, you got that right. Thanks for your vote, girls.
Bart: 0.8431657684099719 Lisa 0.15683423159002693
sentence 9 Well, don't sweat it. Just so long as a couple of people did... right, Milhouse?
Bart: 0.8170694268683306 Lisa 0.18293057313167005
sentence 10 Lewis?
Bart: 0.7339270932407512 Lisa 0.2660729067592495