In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn 
print(sklearn.__version__)


0.24.1


In [2]:
df = pd.read_csv('The-Office-Lines-V4.csv')
df= df.drop('Unnamed: 6', 1)

In [3]:
def gender(name):
    if name == 'Michael' or name == 'Jim' or name == 'Dwight' or name == 'Ryan' or name == 'Stanley' or name == 'Kevin' or name == 'Oscar' or name == 'Toby' or name == 'Creed' or name == 'Andy' or name == 'Roy' or name == 'Darryl' or name == 'Gabe' or name == 'Robert California' or name == 'Robert' or name == 'Clark' or name == 'Pete':
        return 0
    elif name == 'Pam' or name == 'Phyllis' or name == 'Angela' or name == 'Kelly' or name == 'Meredith' or name == 'Jan' or name == 'Erin' or name == 'Nellie':
        return 1

In [4]:
df["gender"] = df["speaker"].map(gender)

In [5]:
df = df.dropna()

In [6]:
df['gender'] = df['gender'].astype(int)

In [7]:
df.sample(n=10)

Unnamed: 0,season,episode,title,scene,speaker,line,gender
35375,7,7,Christening,5368,Pam,"Oh, Michael, this is just for family.",1
3128,2,6,The Fight,499,Michael,"I know, but I've been carrying the load on my ...",0
17222,4,15,Night Out,2888,Ryan,"Ah, he basically is man. He's a regular bankin...",0
33648,7,1,Nepotism,5118,Andy,Am I angry that Gabe stole my girlfriend over ...,0
9958,3,10,A Benihana Christmas (Parts 1&2),1677,Dwight,"Oh, OK. Cindy! Yo, Cindy! Cindy! Hold its neck...",0
39808,7,22,"Goodbye, Michael",5953,Darryl,Hey.,0
21192,5,10,The Surplus,3428,Pam,"Let me just say, you've been promising me this...",1
8687,3,6,Diwali,1486,Pam,I kind of thought something would happen tonig...,1
36865,7,11,Classy Christmas (Parts 1&2),5562,Jim,I can't reconnect with you right now. Hold on ...,0
42185,8,5,Spooked,6326,Andy,Ok. Erin I think you know I've been dating som...,0


In [8]:
df['number_of_words'] = df['line'].str.split().str.len()

In [9]:
df['speaker_id'] = pd.factorize(df.speaker)[0]

In [10]:
df.sample()

Unnamed: 0,season,episode,title,scene,speaker,line,gender,number_of_words,speaker_id
17523,4,16,Did I Stutter?,2922,Michael,"Ok... Is there anybody up here, anybody at all...",0,14,0


In [11]:
# we want to predict gender/speaker from text. this is text classification
# apparently multinomial naive bayes is a good algorithm for this, so we'll use that and see if its any good

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [12]:
X = df['line']
y = df['gender']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42, test_size=.33)

In [14]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((31934,), (31934,), (15730,), (15730,))

In [15]:
X_train.head()

28463                                   Okay.  You owe me.
10883                                               Sally.
51277                        No. She needs her rest again.
4371      What do you do with a drunken sailor? What do...
27832                                       And oh, Jim...
Name: line, dtype: object

In [16]:
vect = CountVectorizer()
tfidf = TfidfTransformer()
multinb = MultinomialNB()

In [17]:
pipe = make_pipeline(vect, tfidf, multinb)

In [18]:
X.shape

(47664,)

In [19]:
y.shape

(47664,)

In [20]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('multinomialnb', MultinomialNB())])

In [21]:
from sklearn.metrics import classification_report

In [22]:
y_pred = pipe.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.7542275905912269


In [25]:
my_tags = ['male', 'female']

In [26]:
print(classification_report(y_test, y_pred,target_names=my_tags))

              precision    recall  f1-score   support

        male       0.76      1.00      0.86     11870
      female       0.42      0.00      0.01      3860

    accuracy                           0.75     15730
   macro avg       0.59      0.50      0.43     15730
weighted avg       0.67      0.75      0.65     15730



In [27]:
# evaluate
from sklearn.metrics import r2_score, mean_squared_error

In [28]:
y_test_pred = pipe.predict(X_test)

In [29]:
r2_score(y_test, y_test_pred)
# very bad

-0.3272494336311771

In [30]:
mean_squared_error(y_test, y_test_pred)

0.24577240940877304

In [31]:
# for 76% of male lines, the model will label them as male. this is good! however, only 42% of female lines will be correctly labeled as female. Maybe this is because there are more male speakers?
# recall confirms what i thought; model will overfit on male class. 
# bad model then, but 75% accuracy, maybe because 75% of lines are male. 

# well, maybe it would be better at predicting speaker? 
# will be using linear support vector machine for this; as it is apparently one of the best text classification algorithms


In [32]:
from sklearn.metrics import plot_confusion_matrix

In [33]:
pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('multinomialnb', MultinomialNB())])>

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
pipe.named_steps

{'countvectorizer': CountVectorizer(),
 'tfidftransformer': TfidfTransformer(),
 'multinomialnb': MultinomialNB()}

In [36]:
vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [37]:
tfidf.get_params()

{'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': True}

In [38]:
multinb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [39]:
params = {'countvectorizer__ngram_range': [(1,1),(1,2),(2,2)], 
          'tfidftransformer__use_idf': (True, False),
          'tfidftransformer__norm': ('l1', 'l2'),
          'multinomialnb__alpha': [1, 1e-1, 1e-2]
         }

In [40]:
gs = GridSearchCV(pipe, params, cv=5, scoring='accuracy',  n_jobs=-1, refit=True, verbose=1)

In [41]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('countvectorizer', CountVectorizer()),
                                       ('tfidftransformer', TfidfTransformer()),
                                       ('multinomialnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'countvectorizer__ngram_range': [(1, 1), (1, 2),
                                                          (2, 2)],
                         'multinomialnb__alpha': [1, 0.1, 0.01],
                         'tfidftransformer__norm': ('l1', 'l2'),
                         'tfidftransformer__use_idf': (True, False)},
             scoring='accuracy', verbose=1)

In [42]:
gs.best_estimator_

Pipeline(steps=[('countvectorizer', CountVectorizer(ngram_range=(2, 2))),
                ('tfidftransformer', TfidfTransformer()),
                ('multinomialnb', MultinomialNB(alpha=1))])

In [43]:
gs.best_params_

{'countvectorizer__ngram_range': (2, 2),
 'multinomialnb__alpha': 1,
 'tfidftransformer__norm': 'l2',
 'tfidftransformer__use_idf': True}

In [44]:
gs.best_score_

0.758251417068151

In [45]:
print(classification_report(y_test, y_pred,target_names=my_tags))

              precision    recall  f1-score   support

        male       0.76      1.00      0.86     11870
      female       0.42      0.00      0.01      3860

    accuracy                           0.75     15730
   macro avg       0.59      0.50      0.43     15730
weighted avg       0.67      0.75      0.65     15730



In [46]:
gs_y_test_pred = gs.predict(X_test) 

In [47]:
print(classification_report(y_test, gs_y_test_pred, target_names=my_tags))

              precision    recall  f1-score   support

        male       0.76      1.00      0.86     11870
      female       0.71      0.00      0.01      3860

    accuracy                           0.75     15730
   macro avg       0.73      0.50      0.43     15730
weighted avg       0.75      0.75      0.65     15730



In [49]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, gs_y_test_pred))

[[11866     4]
 [ 3850    10]]
