In [None]:
import pandas as pd
import itertools
import numpy as np
TRAIN_FILE = 'data/test1/SouthPark.train.tsv'
TEST_FILE = 'data/test1/SouthPark.test.tsv'
epochs = 20
np.random.seed(13)

In [70]:
train_df = pd.read_csv(TRAIN_FILE,  sep='\t', header=None)
train_df.columns = ['season', 'episode', 'character', 'text']

test_df = pd.read_csv(TEST_FILE,  sep='\t', header=None)
test_df.columns = ['season', 'episode', 'character', 'text']

test_df.head(5)


Unnamed: 0,season,episode,character,text
0,16,1,Stan,"Throw it here! Throw it here! Yeah! Nice one,..."
1,16,1,Cartman,"Dude, that sucks, Clyde. A mom shouldn't be ab..."
2,16,1,Kyle,Is your mom always like that dude?
3,16,1,Cartman,"Of course, man. It's cool."
4,16,1,Cartman,"Clyde, Clyde! What have I told you about pissi..."


In [3]:
labels_array = set(train_df['character'])
labels_dict = {l: i for i, l in enumerate(labels_array)}
inv_labels_dict = {v: k for k, v in labels_dict.items()}

NUM_OF_LABELS = len(labels_dict)
labels_dict

{'Announcer': 7,
 'Butters': 18,
 'Cartman': 3,
 'Chef': 19,
 'Gerald': 0,
 'Jimbo': 5,
 'Jimmy': 12,
 'Kenny': 11,
 'Kyle': 16,
 'Liane': 10,
 'Mr. Garrison': 13,
 'Mr. Mackey': 8,
 'Mrs. Garrison': 4,
 'Randy': 2,
 'Sharon': 9,
 'Sheila': 14,
 'Stan': 15,
 'Stephen': 6,
 'Token': 17,
 'Wendy': 1}

In [171]:
import nltk, string

translator = str.maketrans("","", string.punctuation)
stopwords = set(nltk.corpus.stopwords.words('english'))

def my_tokenizer(s):
#     return [ x for x in nltk.tokenize.word_tokenize(s.lower().translate(translator)) if not x in stopwords ]
    return [ x for x in nltk.tokenize.word_tokenize(s.lower().translate(translator))]

In [146]:
def test_model(model, name=None):
    if not name is None:
        print(name)
    model.fit(X_train, y_train)
    print("Train set: %.5f\tTest set: %.5f" % (model.score(X_train, y_train), model.score(X_test, y_test)) )

In [140]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
embbedings = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)



In [147]:
def get_vector(tokens):
    X = [embbedings[tok] for tok in tokens if tok in embbedings]
    ans = np.sum(X , axis=0)
    if ans.shape != (300, ):
        return [0] * 300
    return ans

# Using Google word embeddings

In [None]:
X_train = np.array([get_vector(list(my_tokenizer(text))) for text in train_df['text']])
y_train = np.zeros((len(train_df.index), NUM_OF_LABELS ))

for i , name in enumerate(train_df['character']):
    y_train[i][labels_dict[name]] = 1

X_test = np.array([get_vector(my_tokenizer(text)) for text in test_df['text']])
y_test = np.zeros((len(test_df.index), NUM_OF_LABELS ))
                  
for i , name in enumerate(test_df['character']):
    y_test[i][labels_dict[name]] = 1


### Trying some Dense Layers

In [37]:
import keras
from keras.models import Sequential
from keras.layers import Dense


model = Sequential()

model.add(Dense(128, activation='relu', input_shape=(300,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(NUM_OF_LABELS, activation='sigmoid'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

In [38]:
model.fit(X_train, y_train,
#           batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_test, y_test))

score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 19329 samples, validate on 1301 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 2.37178349568
Test accuracy: 0.285933897014


### Desicion Tree Clasifier

In [39]:
y_train = np.array([labels_dict[name] for name in train_df['character']])
y_test = np.array([labels_dict[name] for name in test_df['character']])

from sklearn import tree
from sklearn.metrics import accuracy_score
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [40]:
y_test_pred = clf.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.19139123750960799

### Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=17, random_state=0)
clf = clf.fit(X_train, y_train)

In [42]:
y_test_pred = clf.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.25365103766333591

# CountVectorizer / TfidfVectorizer

In [178]:
def get_tokens(data_frame):
    return [ tok for line in data_frame['text'] for tok in my_tokenizer(line)]

In [179]:
vocab = set(get_tokens(train_df)).union(set(get_tokens(test_df)))
vocab = {tok : i for i , tok in enumerate(vocab)}

In [180]:
cv = CountVectorizer(input='content', encoding='latin1', min_df=2, vocabulary=vocab, tokenizer =my_tokenizer )

X_train = cv.fit_transform(train_df['text'])
X_test = cv.fit_transform(test_df['text'])
y_train = np.array([labels_dict[name] for name in train_df['character']])
y_test = np.array([labels_dict[name] for name in test_df['character']])

In [185]:
test_model(MultinomialNB(), 'MultinomialNB')
test_model(BernoulliNB(), 'BernoulliNB')
test_model(LogisticRegression(), 'LogisticRegression')

MultinomialNB
Train set: 0.48021	Test set: 0.30515
BernoulliNB
Train set: 0.42134	Test set: 0.30208
LogisticRegression
Train set: 0.64514	Test set: 0.36357


# Best Result (Logistic Regresion over TfidfVectorizer -->  0.37325)

In [176]:
tfv = TfidfVectorizer(input='content', encoding='latin1', min_df=2, max_df=200, ngram_range=(1, 2), tokenizer=my_tokenizer, vocabulary=vocab,)
X_train = tfv.fit_transform(train_df['text'])
X_test = tfv.fit_transform(test_df['text'])

In [177]:
test_model(MultinomialNB(), 'MultinomialNB')
test_model(BernoulliNB(), 'BernoulliNB')
test_model(LogisticRegression(), 'LogisticRegression')

MultinomialNB
Train set: 0.41047	Test set: 0.29362
BernoulliNB
Train set: 0.42134	Test set: 0.30208
LogisticRegression
Train set: 0.49909	Test set: 0.37325


# LSTM and Word Embedding

In [188]:
top_words = 400

def get_vector(sentence):
    ans = np.zeros(top_words)
    ids = [vocab[tok] for tok in my_tokenizer(sentence)]
    for i in range(min(top_words, len(ids))):
        ans[i] = ids[i]
    return ans

In [189]:

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
from collections import Counter

X_train = np.array([get_vector(line) for line in train_df['text']])
X_test = np.array([get_vector(line) for line in test_df['text']])

y_train = np.zeros((len(train_df.index), NUM_OF_LABELS ))
for i , name in enumerate(train_df['character']):
    y_train[i][labels_dict[name]] = 1

y_test = np.zeros((len(test_df.index), NUM_OF_LABELS ))
for i , name in enumerate(test_df['character']):
    y_test[i][labels_dict[name]] = 1
    
print(Counter(train_df['character']))
print(Counter(test_df['character']))

Counter({'Cartman': 4811, 'Stan': 3898, 'Kyle': 3524, 'Butters': 1336, 'Randy': 1233, 'Mr. Garrison': 512, 'Sharon': 442, 'Chef': 433, 'Kenny': 375, 'Liane': 340, 'Jimmy': 332, 'Sheila': 298, 'Gerald': 287, 'Jimbo': 283, 'Wendy': 276, 'Mr. Mackey': 245, 'Announcer': 183, 'Stephen': 177, 'Mrs. Garrison': 174, 'Token': 170})
Counter({'Cartman': 308, 'Stan': 272, 'Kyle': 232, 'Butters': 140, 'Randy': 140, 'Stephen': 38, 'Sharon': 35, 'Kenny': 30, 'Mr. Mackey': 21, 'Gerald': 19, 'Token': 17, 'Jimmy': 13, 'Liane': 11, 'Sheila': 9, 'Jimbo': 6, 'Announcer': 5, 'Mr. Garrison': 4, 'Wendy': 1})


In [191]:
embedding_vecor_length = 64

model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length))
model.add(LSTM(32, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(LSTM(16, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(128))
model.add(Dense(NUM_OF_LABELS, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, None, 64)          25600     
_________________________________________________________________
lstm_45 (LSTM)               (None, None, 32)          12416     
_________________________________________________________________
lstm_46 (LSTM)               (None, 16)                3136      
_________________________________________________________________
dense_30 (Dense)             (None, 128)               2176      
_________________________________________________________________
dense_31 (Dense)             (None, 20)                2580      
Total params: 45,908
Trainable params: 45,908
Non-trainable params: 0
_________________________________________________________________
None
Train on 19329 samples, validate on 1301 samples
Epoch 1/1


<keras.callbacks.History at 0x20e7057e908>