## Model 1

In [57]:
import numpy as np

import pandas as pd

from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Conv1D
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

np.random.seed(7)

In [58]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(df.shape)
a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
y = np.array([a2c[a] for a in df.author])
print(y.shape)
y = to_categorical(y)
print(y.shape)

(19579, 3)
(19579,)
(19579, 3)


### Separate punctuation from words

In [59]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [82]:
# Pre-process the text outside of the create_docs function
#df['text'] = df['text'].apply(preprocess)
# convert author labels into numerical variables
df['author_num'] = df.author.map({'EAP':0, 'HPL':1, 'MWS':2})
# Check conversion for first 5 rows
df.head()

Unnamed: 0,id,text,author,author_num
0,id26305,"This process , however , afforded me no mean...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box , from ...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else , not even gold , the S...",HPL,1


### Remove lover frequency words (<=2)
Remvoe the words that don't appear more than twice. Something I could try is to keep significant words even if they have a low frequency. These words could be unique to authors, like the word "Cthulhu" could be unique to H.P. Lovecraft.

In [83]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

### Cut down the length of the sentences in the train dataframe to a 500 character limit.

In [84]:
#accuracy is very close even when sentences are cut short.
train_df = df
train_df = train_df.rename(columns={'text':'original_text'})
train_df['text'] = train_df['original_text'].str[:700]
train_df['text_length'] = train_df['text'].str.len()
#train_df.head()

min_count = 2
docs = train_df
docs = create_docs(df)
tokenizer = Tokenizer(lower=False, filters='')
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 500
#pad the documents to a max length of 500
docs = pad_sequences(sequences=docs, maxlen=maxlen)

#### Create the model with an embedding input layer, 

In [85]:
input_dim = np.max(docs)+1
embedding_dims = 20
def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    #model.add(Conv1D(128, 5, activation='relu')) 
    #MaxPooling2D(5)
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

#### Use Early Stopping to prevent overfitting

In [86]:
x_train, x_test, y_train, y_test = train_test_split(docs, y, test_size=0.2)
def fit(num_epochs):
    epochs = num_epochs
    
    print("x_train: ", x_test.shape)
    print("y_train: ", y_train.shape)
    model = create_model()
    hist = model.fit(x_train, y_train,
                     batch_size=16,
                     validation_data=(x_test, y_test),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    return model

In [87]:
model = fit(30)

x_train:  (3916, 500)
y_train:  (15663, 3)
Train on 15663 samples, validate on 3916 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30


### The accuracy here is about 87%

In [499]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_74 (Embedding)     (None, None, 20)          5141740   
_________________________________________________________________
global_average_pooling1d_59  (None, 20)                0         
_________________________________________________________________
dense_58 (Dense)             (None, 3)                 63        
Total params: 5,141,803
Trainable params: 5,141,803
Non-trainable params: 0
_________________________________________________________________
None


### Post Kaggle Submission #1:
###### Model 1 scored a 0.36295 which is still the best score out of my first three submissions.

###### This was a simple model, the embedding layer

#### I'd like to see how adding a convolutional layer to the network would affect its performance. 

In [97]:
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model.predict_proba(docs)

result = pd.read_csv('sample_submission.csv')
for a, i in a2c.items():
    result[a] = y[:, i]
result.to_csv('my_submission.csv', index=False)



## Model 2
This is based off of the first model, but here I will use sigmoid over softmax.
Note: using optimizer 'sgd' over 'adam' will get a max score of 40%. It plateus pretty quickly..

Using sigmoid gives an accuracy of about 62%. The tests are also much shorter and stop early. It seems like adding more layers makes it less accurate. 

I've noticed that the more layers I add on to this model, the less accurate it becomes. The only layer that has added anything was the Conv1D layer. Adding the layer speed up the test time since it doesn't run through all the epochs, it stops much earlier than the previous model.

In [90]:
from keras.layers import Conv1D, MaxPooling1D, Dropout, GlobalAveragePooling1D
from keras.optimizers import SGD, RMSprop

def one_conv_model(embedding_dims=20, optimizer='rmsprop'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(128, 5, activation='relu')) #added a Conv1D layer
    #model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    #model.add(Dense(3, activation='sigmoid'))
    #model.add(Dropout(1))
    model.add(Dense(3, activation='sigmoid'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


### If the batch size is changed from 16 to 128, the number of epochs is reduced by 3/4.
The accuracy of this model is about 85% When batch size = 128, the number of epochs is cut by two-thirds, and the test time is 1 second. The accuary is slightly worse than the model with a batch size of 16. I haven't achieved a higher score than 86% with these models.

In [91]:
def fit_128(num_epochs):
    epochs = num_epochs  
    x_train2, x_test2, y_train2, y_test2 = train_test_split(docs, y, test_size=0.2)
    model = one_conv_model()
    hist = model.fit(x_train2, y_train2,
                     batch_size=128,
                     validation_data=(x_test2, y_test2),
                     epochs=epochs,
                     callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    return model

In [93]:
model2 = fit_128(25)

Train on 15663 samples, validate on 3916 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25


In [94]:
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, None, 20)          5141200   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         12928     
_________________________________________________________________
global_average_pooling1d_4 ( (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total params: 5,154,515
Trainable params: 5,154,515
Non-trainable params: 0
_________________________________________________________________


In [96]:
test_df = pd.read_csv('test.csv')
docs = create_docs(test_df)
docs = tokenizer.texts_to_sequences(docs)
docs = pad_sequences(sequences=docs, maxlen=maxlen)
y = model2.predict_proba(docs)

result = pd.read_csv('sample_submission.csv')
for a, i in a2c.items():
    result[a] = y[:, i]
result.to_csv('model_2_sub.csv', index=False)



### Post Kaggle Submission #3:
#### Model 2 scored lower than Model 1, which didn't surprise me. I've been seeing that adding more to these models is actually making it worse. 

##### Model 1: 0.36295 
##### Model 2: 0.42768