In [1]:
import pandas as pd
import nltk
import sklearn
import numpy as np
import keras
from sklearn.feature_extraction import text
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, GRU, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Input, Lambda
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.layers import Activation
import tensorflow as tf

Using TensorFlow backend.


In [2]:
df = pd.read_csv('spooky_train.csv', encoding = 'utf-8')

In [3]:
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# dropping it for now to keep it simple

df.drop('id', axis=1, inplace=True)

In [5]:
# lowercase text and remove punctuation

punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]


def clean_text(msg):    
    msg = [msg.lower() for msg in msg if msg not in punctuation]
    msg =''.join(msg)
    return [msg for msg in msg.split()]

In [6]:
# apply the changes to text

df['text'] = df['text'].apply(clean_text)

In [7]:
# start assigning values for x and y

X = df['text']
y = df['author']

In [8]:
num_classes = 3

In [9]:

encoder = LabelBinarizer()
y = encoder.fit_transform(y)
print(y)


[[1 0 0]
 [0 1 0]
 [1 0 0]
 ...
 [1 0 0]
 [1 0 0]
 [0 1 0]]


In [10]:
# split data 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [11]:
# use keras to tokenize and pad the text

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = sequence.pad_sequences(X_train,maxlen=50)
X_test = sequence.pad_sequences(X_test,maxlen=50)

In [12]:
epochs = 5
embedding_dim = 100
max_len = 50 # max message length
batch_size = 35
vocab_size = 23366
glove_embeddings_index = None
word_index = tokenizer.word_index

In [13]:
# load the whole embedding into memory
import numpy as np
from numpy import asarray
from numpy import array
from numpy import zeros

embedding_dir = ('C://Users//Tallowtree//py-master//glove.6B.100d.txt')

embeddings_index = dict()
f  = open(embedding_dir, encoding = "utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
t = Tokenizer()
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


In [14]:
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_dim, weights = [embedding_matrix], input_length = X_train.shape[1], trainable = True))
model.add(SpatialDropout1D(0.25))
model.add(Bidirectional(GRU(32,return_sequences=True)))
model.add(Bidirectional(GRU(32,return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
# %%time
history=model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
score = model.evaluate(X_test, y_test, verbose = 1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.6885608656596358
Test accuracy: 0.7793667006541322
