In [34]:
%matplotlib inline

In [510]:
import re
import numpy as np
import pandas as pd
from keras.layers import Input, Embedding, Dropout, LSTM, Dense, Flatten
from keras.models import Model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

In [15]:
def remove_punc(sentence):
    PATTERN = r'[^a-zA-Z0-9 ]'
    filtered_sentence = re.sub(PATTERN, r' ', sentence)
    return filtered_sentence.strip()

In [2]:
sunrise = pd.read_csv('sunrise_preprocess.csv')
sunset = pd.read_csv('sunset_preprocess.csv')
dat = pd.concat([sunrise, sunset])

In [278]:
dat['line']

Unnamed: 0,speaker,line,scene
0,Jesse,Do you have any idea what they were arguing ...,1
1,Jesse,Do you do you speak English?,1
2,Céline,"Yeah. No, I'm sorry, my German is not very goo...",1
3,Jesse,No.,1
4,Céline,"Well, supposedly, men lose the ability to hear...",1
5,Jesse,I guess. Nature's way of allowing couples to g...,1
6,Céline,How bout you?,1
7,Jesse,Umm.,1
8,Jesse,"Look, I was thinking about going to the lounge...",1
9,Céline,Yeah.,1


In [161]:
dat['speaker'].map(lambda x: 1 if x == 'Jesse' else 0)

0      1
1      1
2      0
3      1
4      0
5      1
6      0
7      1
8      1
9      0
10     1
11     1
12     0
13     1
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     0
23     1
24     0
25     1
26     0
27     1
28     0
29     1
      ..
745    1
746    0
747    1
748    0
749    0
750    0
751    0
752    0
753    0
754    0
755    0
756    0
757    1
758    0
759    0
760    1
761    0
762    1
763    0
764    1
765    0
766    1
767    0
768    1
769    0
770    1
771    0
772    1
773    0
774    1
Name: speaker, Length: 1462, dtype: int64

In [162]:
labels = dat['speaker'].map(lambda x: 1 if x == 'Jesse' else 0).values

In [121]:
lines = dat['line'].values

In [280]:
tokenized = [remove_punc(line.strip().lower()) for line in lines]

In [282]:
length = [len(remove_punc(line.strip().lower()).split()) for line in lines]

In [283]:
dat['len'] = length

In [523]:
filtered = dat[dat['len']>3]

In [525]:
filtered['speaker'].value_counts()

Jesse     512
Céline    512
Name: speaker, dtype: int64

In [293]:
labels = filtered['speaker'].map(lambda x: 1 if x == 'Jesse' else 0).values

In [294]:
lines = filtered['line'].values

In [319]:
tokenized = [remove_punc(line.strip().lower()) for line in lines]

In [321]:
cv = CountVectorizer()

In [330]:
tok = cv.fit_transform(tokenized).toarray()

In [493]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [516]:
?Dense

In [526]:
def train_model(xtr, ytr, n1, n2):
    x = Input(shape=(tok.shape[1],))
    y = Dense(n1, activation='relu')(x)
    y = Dropout(0.5)(y)
    y = Dense(n2, activation='relu')(y)
    y = Dense(1, activation='sigmoid')(y)
    model = Model(inputs=x, outputs=y)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.fit(xtr, ytr, epochs=10, batch_size=64, verbose=0)
    return model

In [529]:
for n1 in [8,16,32,64]:
    for n2 in [4,8,16,32]:
        cvscores = []
        for train, test in kfold.split(tok, labels):
          # create model
            model = train_model(tok[train], labels[train], n1, n2)
            scores = model.evaluate(tok[test], labels[test], verbose=0)
    #         print("%s: %s: %.2f%%" % (n, model.metrics_names[1], scores[1]*100))
            cvscores.append(scores[1] * 100)
        print("%s %s: %.2f%% (+/- %.2f%%)" % (n1, n2, np.mean(cvscores), np.std(cvscores)))

8 4: 59.56% (+/- 2.70%)
8 8: 59.85% (+/- 4.45%)
8 16: 61.04% (+/- 3.18%)
8 32: 61.82% (+/- 4.41%)
16 4: 60.44% (+/- 2.30%)
16 8: 60.84% (+/- 3.12%)
16 16: 61.42% (+/- 1.78%)
16 32: 61.62% (+/- 2.95%)
32 4: 60.06% (+/- 3.70%)
32 8: 61.62% (+/- 3.00%)
32 16: 61.43% (+/- 3.71%)
32 32: 62.60% (+/- 2.28%)
64 4: 60.44% (+/- 2.18%)
64 8: 62.29% (+/- 2.71%)
64 16: 61.43% (+/- 3.17%)
64 32: 61.42% (+/- 2.79%)


In [522]:
filtered['speaker'].value_counts()

Jesse     512
Céline    512
Name: speaker, dtype: int64

In [506]:
train_model(tok, labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.engine.training.Model at 0x125329b70>