In [226]:
import pandas as pd
import numpy as np

In [227]:
# Load the messages, shuffle the rows, and separate the target column.
# The shuffle is seeded so the train/test split (and therefore the reported
# accuracy) is reproducible after Restart Kernel -> Run All.
SEED = 42

df = pd.read_csv("data/kixi_messages.csv")
df = df.sample(frac=1, random_state=SEED)

# pop() removes 'channel' from df, leaving only the input columns behind
y = df.pop('channel')

# one-hot encoded numpy array for output classes; the dtype is pinned because
# get_dummies returns uint8 or bool columns depending on the pandas version
labels = np.asarray(pd.get_dummies(y), dtype='float32')

In [228]:
# Perform some basic normalisation of the message text before tokenising
# (this might be more easily doable with a library too).
#
# regex=True is passed explicitly wherever a pattern is used: since pandas 2.0
# Series.str.replace defaults to literal matching, which would silently turn
# the character-class patterns below into no-ops. Raw strings are used so that
# sequences such as \s are not treated as (invalid) string escapes.

df['text'] = (
    df['text']
    .str.lower()
    .str.replace('\n', ' ', regex=False)           # newlines -> spaces
    .str.replace('£', 'pounds', regex=False)       # keep the currency as a word
    .str.replace(r'[_.,\-!?]', ' ', regex=True)    # punctuation -> spaces
    .str.replace(r'[^a-z\s]', '', regex=True)      # drop everything but letters/whitespace
    .str.replace(r'\s+', ' ', regex=True)          # collapse runs of whitespace
)

In [229]:
# --- settings for preprocessing and the train/test split ---

# number of tokens each message is cut off / padded to
maxlen = 100

# vocabulary cap handed to the tokenizer (10,000 most common words)
max_words = 10000

# fraction of the rows used for training; the remainder is held out
validation_split = 0.8

# row counts on each side of the split
n_rows = len(df)
training_samples = round(n_rows * validation_split)
validation_samples = n_rows - training_samples

In [230]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# The Keras tokenizer turns each word into an integer id; it is capped at
# max_words and fitted on our dataset.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df.text)

# one list of integer ids per input row
sequences = tokenizer.texts_to_sequences(df.text)

vocabulary = tokenizer.word_index
print("Found %s unique tokens." % len(vocabulary))

Found 17297 unique tokens.


In [231]:
# Bring every sequence to exactly maxlen ids. The Keras defaults are spelled
# out: shorter sequences get zeros added at the front ('pre'), longer ones
# lose their earliest tokens ('pre').
data = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)

Shape of data: (9588, 100)
Shape of labels: (9588, 5)


In [232]:
# Split the (already shuffled) dataset into training/test parts:
# the first training_samples rows train the model, the following
# validation_samples rows are held out.
split_start = training_samples
split_stop = training_samples + validation_samples

X_train, X_test = data[:split_start], data[split_start:split_stop]
y_train, y_test = labels[:split_start], labels[split_start:split_stop]

In [167]:
# Load the GloVe word representation vectors into a {word: vector} dict
# (download and info here: https://nlp.stanford.edu/projects/glove/).
# Each line holds a word followed by its vector components.
#
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own copy of the GloVe file before running.

embeddings_index = {}

# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (the GloVe files are distributed as UTF-8 text).
with open('/Users/mike/GloVe/common_crawl/glove.42B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 1917494 word vectors.


In [233]:
# dimensionality of the GloVe vectors used
embedding_dim = 300

# one row per token id; a row stays all zeros when no GloVe vector is copied in
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue  # id falls outside the max_words vocabulary cap
    vector = embeddings_index.get(word)
    if vector is None:
        continue  # word unknown to GloVe: leave its row as zeros
    embedding_matrix[i] = vector

In [285]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPool1D

# Sequential architecture, declared as an ordered list of layers.
model = Sequential([
    # receives a 2D tensor of batchsize (inferred) x maxlen and
    # outputs a 3D tensor of batchsize x maxlen x embedding_dim
    Embedding(max_words, embedding_dim, input_length=maxlen),

    # bidirectional layer with 128 LSTM units per direction, with dropout to regularise
    Bidirectional(
        LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
    ),

    # pooling layer: takes the max over the sequence axis, 3D tensor -> 2D output
    GlobalMaxPool1D(),

    # standard hidden layer with 128 relu units
    Dense(128, activation='relu'),

    # dropout layer for regularisation
    Dropout(0.2),

    # output layer with one softmax unit per class, giving class probabilities
    Dense(labels.shape[1], activation='softmax'),
])

In [286]:
# The first layer of the network is the Embedding layer.
embedding_layer = model.layers[0]

# load the GloVe embeddings into it ...
embedding_layer.set_weights([embedding_matrix])

# ... and freeze them so the model can't update them during training
embedding_layer.trainable = False

In [287]:
# print each layer with its output shape and parameter count, plus the
# trainable / non-trainable totals
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 100, 300)          3000000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 100, 256)          439296    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 256)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 5)                 645       
Total params: 3,472,837
Trainable params: 472,837
Non-trainable params: 3,000,000
____________________________________________________________

In [288]:
from keras.callbacks import EarlyStopping

# stop fitting once validation accuracy has failed to improve for 3 epochs
early_stopping = EarlyStopping(monitor='val_acc', patience=3)
callbacks = [early_stopping]

In [289]:
# compile the model using fairly standard settings for multi-class classification
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['acc'],
}
model.compile(**compile_options)

In [290]:
# Fit the model for up to 50 epochs, passing the held-out split as validation
# data and the callbacks list defined earlier.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=50,
    callbacks=callbacks,
    validation_data=(X_test, y_test),
)

Train on 7670 samples, validate on 1918 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


<keras.callbacks.History at 0x1a8fd29c50>

In [None]:
# So the model reaches a best test accuracy (measured on the validation data) of over 65% after 7 epochs (vs. about 20% accuracy from chance).
# Pretty good for a relatively simple network, considering the likely crossover in topics between kixi channels!