In [1]:
from itertools import count

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# fix random seed for reproducibility
# NOTE(review): this call seeds only NumPy's global random generator; no other
# library's generator is seeded in this cell -- confirm that is sufficient for
# reproducible training runs.
np.random.seed(1)

Using TensorFlow backend.


Load in the questions and associated tags for each question

In [2]:
# Both CSV files are read with the same text encoding, so share the options.
csv_options = dict(encoding='latin1')

# q: questions table, t: tags table.
q = pd.read_csv("../data/Questions.csv", **csv_options)
t = pd.read_csv("../data/Tags.csv", **csv_options)

In [3]:
# Tokenise every title on whitespace (one list of words per question).
titles = q.Title.str.split()

# Convert the words to a naive embedding: one integer index per distinct
# lower-cased word that appears in any title.
vocabulary = {word.lower() for title in titles for word in title}

# Sort before numbering: the iteration order of a set of strings can change
# between interpreter runs (string hash randomisation), which would hand every
# word a different index each time the kernel restarts.
# Numbering starts at 1 so that index 0 is never a real word and stays free for
# the zero padding that is prepended to short titles.
q_embeddings = dict(zip(sorted(vocabulary), count(1)))

# Replace each title by the list of indices of its (lower-cased) words.
q.Title = titles.map(lambda words: [q_embeddings[word.lower()] for word in words])

In [4]:
# Pad the titles to the same length by prepending zeros to the shorter ones.
max_title_len = q.Title.apply(len).max()

# The loop variable is deliberately not called `t`, so the tags DataFrame `t`
# is never shadowed. An explicit integer dtype keeps the word indices integral;
# np.zeros alone defaults to float64 and would turn every sequence into floats.
q.Title = [
    np.concatenate([
        np.zeros(max_title_len - len(encoded_title), dtype=np.int32),
        np.asarray(encoded_title, dtype=np.int32),
    ])
    for encoded_title in q.Title
]

Now we need to assign tags to each question.

In [5]:
# Restrict the tag table to the most frequent tags, then keep one row per
# question Id.
# value_counts() lists tags from most to least frequent, so the leading index
# entries are the most common tags.
# NOTE(review): the cutoff is 501 entries rather than 500 -- confirm intended.
top_tags = set(t.Tag.value_counts().index[:501])

# groupby('Id').first() collapses the remaining rows to a single row per Id,
# which also makes Id the index of `t`.
t = t.loc[t.Tag.isin(top_tags)].groupby('Id').first()

In [6]:
# Look up each question's tag by its Id, then keep only the questions that
# actually received a tag from the join.
q = (
    q.join(t, on='Id', rsuffix='t_')
     .loc[lambda joined: joined.Tag.notna()]
)

In [7]:
# Split into training (70%) and testing (remaining 30%) data.
# An explicit random_state pins the shuffle, so re-running this cell on its own
# reproduces the same partition instead of depending on whatever state the
# global NumPy generator happens to be in at that moment.
train, test = train_test_split(q, train_size=0.7, random_state=1)



In [8]:
# Inputs are the padded title sequences; targets are the tag labels.
X_train, X_test = train.Title, test.Title
y_train, y_test = train.Tag, test.Tag

## LSTM Time

In [9]:
# Fit the label binarizer on every distinct tag present in q, so that the
# training and test targets are later encoded with the same set of classes.
tag_labels = q.Tag.unique()
lb = LabelBinarizer()
lb.fit(tag_labels)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [10]:
# One-hot encoding runs out of memory if used on the entire training set
# so instead train by batches
# Could instead limit to top 1000 tags or so
# X_train_batches = np.split(X_train, 25)
# y_train_batches = np.split(y_train, 25)

In [11]:
# np_utils.to_categorical(y_train_batches[0])

def baseline_model(input_length=max_title_len, n_words=max(q_embeddings.values()) + 1,
                   embedding_vector_length=32, n_tags=q.Tag.nunique()):
    """Build and compile a small embedding + dense classifier over title words.

    The default argument values are computed from the notebook globals
    (``max_title_len``, ``q_embeddings``, ``q``) at the moment this cell is
    executed, so the cell must be re-run if those globals change.

    Parameters
    ----------
    input_length : int
        Number of word indices in each (padded) input sequence.
    n_words : int
        Size of the embedding vocabulary. It must be strictly greater than the
        largest word index fed to the model, hence the ``+ 1`` on the default:
        using the largest index itself would leave the highest-numbered word
        outside the embedding table.
    embedding_vector_length : int
        Dimensionality of each word vector produced by the embedding layer.
    n_tags : int
        Number of output classes (one softmax unit per tag).

    Returns
    -------
    keras.models.Sequential
        The model, compiled with categorical cross-entropy loss, the Adam
        optimizer and accuracy as the reported metric.
    """
    model = Sequential()
    # Word index -> dense vector lookup for every position in the sequence.
    model.add(Embedding(n_words, embedding_vector_length, input_length=input_length))
    model.add(Dense(10, input_dim=embedding_vector_length, activation='relu'))
    # Collapse the per-position features into one vector per title before the
    # final classification layer.
    model.add(Flatten())
    model.add(Dense(n_tags, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [15]:
def _to_matrix(padded_titles):
    """Stack a Series of equal-length index arrays into one 2-D array."""
    return np.array(padded_titles.tolist())


model = baseline_model()

# Train on the training split and report validation metrics on the test split
# after every epoch. Tags are one-hot encoded with the fitted binarizer.
# NOTE(review): batch_size=10 results in a very large number of steps per
# epoch on a data set of this size -- confirm this is intended.
model.fit(
    x=_to_matrix(X_train),
    y=lb.transform(y_train),
    validation_data=(_to_matrix(X_test), lb.transform(y_test)),
    epochs=10,
    batch_size=10,
)

Train on 829222 samples, validate on 355381 samples
Epoch 1/10
  8580/829222 [..............................] - ETA: 2:10:20 - loss: 4.1402 - acc: 0.1070

KeyboardInterrupt: 

In [None]:
# Save the fitted model to the path './model1' (relative to the working directory).
model.save('./model1')