In [4]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split

### Split Data for Training and Application

In [5]:
# Split the raw quotes 90/10 and persist both parts as CSV files.
df = pd.read_csv('../data/talking_head.csv')
# Only the frame is passed: no stratification is requested, so the selected
# rows are the same as when df.character is passed alongside it. This also
# avoids assigning to `_` / `__`, which IPython uses for its output history.
train, test = train_test_split(df, test_size=0.1, random_state=42)
train.to_csv("../data/training.csv", index=False)
test.to_csv("../data/test.csv", index=False)

In [33]:
# Show the quote text of the first row in the held-out split.
test.iloc[0]["quote"]

'Two whistleblowers… two! I always thought Darryl and Pam might get me fired for something I said.'

### Load Data

In [6]:
# Rebind `df` to the training split that was written to disk above.
df = pd.read_csv(filepath_or_buffer="../data/training.csv")

### Preprocessing

In [7]:
# Download the NLTK 'stopwords' and 'punkt' packages, then load the English
# stop-word list used by normalize_document.
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' for
# word_tokenize - confirm against the installed version.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/morrislagrand/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/morrislagrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def normalize_document(doc):
    """Normalize a single document for bag-of-words vectorization.

    Removes every character that is not an ASCII letter, digit, or ASCII
    whitespace, lower-cases and trims the text, tokenizes it with
    ``nltk.word_tokenize``, drops tokens present in the module-level
    ``stop_words`` collection, and joins the remaining tokens with spaces.

    Args:
        doc: Raw text of one document (a ``str``).

    Returns:
        The cleaned document as a single space-separated string.
    """
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I | re.A positionally limited the call
    # to 258 substitutions and never applied the flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [9]:
# Clean every quote in place and store the speaker label as a categorical.
df["quote"] = df["quote"].apply(normalize_document)
df["character"] = pd.Categorical(df["character"])

In [10]:
# 70/30 split of the cleaned quotes and their integer category codes.
X_train, X_test, y_train, y_test = train_test_split(
    df["quote"].values,
    df["character"].cat.codes,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(1101,) (473,)


### SVC

In [11]:
from sklearn.svm import LinearSVC

#### Count Vectors

In [35]:
# Number of training documents, shown as a shape tuple.
np.shape(X_train)

(1101,)

In [12]:
## Count Vectors
from sklearn.feature_extraction.text import CountVectorizer

# Unigram counts only.
cv = CountVectorizer(ngram_range=(1, 1))

# Train matrix: fit the vectorizer on the training quotes, then densify.
cv_matrix_train = cv.fit_transform(X_train).toarray()

# Test matrix: transform with the already-fitted vectorizer, then densify.
cv_matrix_test = cv.transform(X_test).toarray()

#### TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# L2-normalised TF-IDF weighting with smoothed IDF.
tt = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)

# Train: fit the transformer on the training counts, then densify.
tt_matrix_train = tt.fit_transform(cv_matrix_train).toarray()

# Test: apply the fitted transformer to the test counts, then densify.
tt_matrix_test = tt.transform(cv_matrix_test).toarray()

#### Train

In [14]:
# Fit a linear SVM on the TF-IDF training matrix.
svc = LinearSVC(
    penalty='l2',
    C=1,
    random_state=42,
).fit(tt_matrix_train, y_train)

In [15]:
# Score the fitted model on the 30% split held out above.
svc.score(X=tt_matrix_test, y=y_test)

0.5010570824524313

#### Save Model results

In [16]:
import pickle

In [17]:
# Save Count Vectors
# A context manager closes (and flushes) the file handle as soon as the dump
# finishes instead of leaving it open until garbage collection.
with open("../models/cv.sav", 'wb') as model_file:
    pickle.dump(cv, model_file)

In [18]:
# Save TF-IDF
# A context manager closes (and flushes) the file handle as soon as the dump
# finishes instead of leaving it open until garbage collection.
with open("../models/tt.sav", 'wb') as model_file:
    pickle.dump(tt, model_file)

In [19]:
# Save SVC
# A context manager closes (and flushes) the file handle as soon as the dump
# finishes instead of leaving it open until garbage collection.
with open("../models/svc.sav", 'wb') as model_file:
    pickle.dump(svc, model_file)

In [36]:
# Rebind `df` to the held-out split; the training frame is no longer
# reachable under this name after this cell runs.
df = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [37]:
# Preview the first five rows of the held-out split.
df.head(n=5)

Unnamed: 0,quote_id,quote,character
0,660,Two whistleblowers… two! I always thought Darr...,Michael
1,1353,Co-managing is a give and take. You have to pi...,Jim
2,861,"Yes, I have decided to shun Andy Bernard for t...",Dwight
3,111,"See, that’s what Christmas is all about to me,...",Michael
4,332,"Here’s the thing. When a company screws up, be...",Michael
