In [79]:
import pandas as pd

# Text-preprocessing limits: MAX_NB_WORDS is handed to the Tokenizer as its
# vocabulary cap, MAX_SEQUENCE_LENGTH is the padded length of every sequence
# and the Embedding layers' input length.
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 1000

# Input files, relative to the notebook's working directory.
SAMPLE = 'data/sample_submission.csv'
TEST = 'data/test.csv'
TRAIN = 'data/train.csv'

# Load the labelled sentences and preview the first rows.
train = pd.read_csv(TRAIN)
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [80]:
from sklearn.model_selection import train_test_split

# Features are the raw sentences; the target is the author code of each one.
X = train.text
y = train.author

# Hold out 20% of the rows for validation.
# `random_state` pins the shuffle so the split -- and every number derived
# from it further down -- is identical on each Restart & Run All.
# `stratify=y` keeps the author proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: features and labels must have matching lengths per partition.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15663,), (15663,), (3916,), (3916,))

In [81]:
from keras.preprocessing.text import Tokenizer

# Learn the vocabulary from the training sentences only, then encode both
# partitions with that same fitted tokenizer.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)

train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# word -> integer index mapping built during fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 23891 unique tokens.


In [82]:
# Peek at the first ten raw author labels, shown as a single column.
y_train.values[:10].reshape(-1, 1)

array([['EAP'],
       ['EAP'],
       ['HPL'],
       ['MWS'],
       ['MWS'],
       ['HPL'],
       ['EAP'],
       ['MWS'],
       ['EAP'],
       ['HPL']], dtype=object)

In [83]:
from sklearn.preprocessing import LabelEncoder

# Map author codes to integer ids. The mapping is learned from the training
# labels and then applied unchanged to the held-out labels.
le = LabelEncoder()
le.fit(y_train)

train_labels, test_labels = le.transform(y_train), le.transform(y_test)

train_labels[:10]

array([0, 0, 1, 2, 2, 1, 0, 2, 0, 1])

In [84]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the integer author ids into an (n_samples, n_classes) matrix.
#
# The ids are re-derived from `le` and the original label series instead of
# being read back from `train_labels` / `test_labels`. This cell overwrites
# those two names, so using them as input would push the already one-hot
# matrix through the binarizer again if the cell were run a second time,
# silently producing arrays of the wrong shape.
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(le.transform(y_train).reshape(-1, 1))
test_labels = mlb.transform(le.transform(y_test).reshape(-1, 1))

train_labels[:10]

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [85]:
from keras.preprocessing.sequence import pad_sequences

# Bring every token sequence to the same fixed length so the two partitions
# become rectangular arrays that can be fed to the network.
train_data, test_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences)
)

train_data.shape, test_data.shape

((15663, 1000), (3916, 1000))

In [86]:
# Shapes of the encoded label arrays for the training and held-out partitions.
tuple(labels.shape for labels in (train_labels, test_labels))

((15663, 3), (3916, 3))

In [92]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.optimizers import Adam

# Baseline classifier: a 50-dimensional embedding per token, flattened over
# the padded sequence and fed through one hidden layer.
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=50, input_length=MAX_SEQUENCE_LENGTH),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # Every sentence has exactly one author, so the three outputs must form a
    # single probability distribution: softmax, not three independent sigmoids.
    Dense(3, activation='softmax')
])

# categorical_crossentropy is the loss that matches one-hot, mutually exclusive
# targets. Under binary_crossentropy each of the three outputs is scored as
# its own yes/no problem, and the reported 'accuracy' is then a per-output
# figure that overstates how often the predicted author is actually right.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 1000, 50)      1194600     embedding_input_8[0][0]          
____________________________________________________________________________________________________
flatten_8 (Flatten)              (None, 50000)         0           embedding_8[0][0]                
____________________________________________________________________________________________________
dense_15 (Dense)                 (None, 100)           5000100     flatten_8[0][0]                  
____________________________________________________________________________________________________
dropout_10 (Dropout)             (None, 100)           0           dense_15[0][0]                   
___________________________________________________________________________________________

In [93]:
from keras.callbacks import EarlyStopping

# Watch the validation loss and halt training once it stops improving
# (patience of a single epoch).
early_stopping = EarlyStopping(monitor='val_loss', patience=1)

model.fit(
    train_data,
    train_labels,
    validation_data=(test_data, test_labels),
    batch_size=64,
    callbacks=[early_stopping],
)

Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x112a4de80>

In [94]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.optimizers import Adam
from keras.layers.convolutional import Convolution1D
from keras.layers.pooling import MaxPooling1D

# Convolutional variant: the embedded sequence goes through one 1-D
# convolution (64 filters, width 5) and max-pooling before the dense head.
# Dropout is applied after the embedding, after the convolution and before
# the output layer.
model2 = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=50, input_length=MAX_SEQUENCE_LENGTH),
    Dropout(0.2),
    Convolution1D(64, 5, activation='relu', border_mode='same'),
    Dropout(0.2),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    # Every sentence has exactly one author, so the three outputs must form a
    # single probability distribution: softmax, not three independent sigmoids.
    Dense(3, activation='softmax')
])

# categorical_crossentropy is the loss that matches one-hot, mutually exclusive
# targets. Under binary_crossentropy each of the three outputs is scored as
# its own yes/no problem, and the reported 'accuracy' is then a per-output
# figure that overstates how often the predicted author is actually right.
model2.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model2.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_9 (Embedding)          (None, 1000, 50)      1194600     embedding_input_9[0][0]          
____________________________________________________________________________________________________
dropout_11 (Dropout)             (None, 1000, 50)      0           embedding_9[0][0]                
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 1000, 64)      16064       dropout_11[0][0]                 
____________________________________________________________________________________________________
dropout_12 (Dropout)             (None, 1000, 64)      0           convolution1d_2[0][0]            
___________________________________________________________________________________________

In [95]:
from keras.callbacks import EarlyStopping

# Watch the validation loss and halt training once it stops improving
# (patience of a single epoch).
early_stopping = EarlyStopping(monitor='val_loss', patience=1)

model2.fit(
    train_data,
    train_labels,
    validation_data=(test_data, test_labels),
    batch_size=64,
    callbacks=[early_stopping],
)

Train on 15663 samples, validate on 3916 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x1219d7cc0>