In [216]:
import keras
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.optimizers import SGD, Adam
import warnings
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')

In [217]:
# Load in the data
data = pd.read_csv('Train.csv', nrows=20000)
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,1,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...,php image-processing file-upload upload mime-t...
1,2,How can I prevent firefox from closing when I ...,"<p>In my favorite editor (vim), I regularly us...",firefox
2,3,R Error Invalid type (list) for variable,<p>I am import matlab file and construct a dat...,r matlab machine-learning
3,4,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ...",c# url encoding
4,5,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...,php api file-get-contents


In [218]:
# Parameters
max_length = 350  # length of input sequences to the model
n_top_tags = 8  # n most prevelant tags to try to predict
vocab_size = 2000  # How many distinct tokens to take
char_model = False  # type of model to train (character or word)
batch_size = 128
num_epochs = 20

In [219]:
# Convert the tags and texts to lists for the keras tokenizer
tag_list = data['Tags'].tolist()
text_list = data['Body'].tolist()

print(tag_list[:25])
print("="*115)
print(text_list[:2])

['php image-processing file-upload upload mime-types', 'firefox', 'r matlab machine-learning', 'c# url encoding', 'php api file-get-contents', 'proxy active-directory jmeter', 'core-plot', 'c# asp.net windows-phone-7', '.net javascript code-generation', 'sql variables parameters procedure calls', '.net obfuscation reflector', 'algorithm language-agnostic random', 'postfix migration mdaemon', 'documentation latex3 expl3', 'windows-7', 'php url-routing conventions', 'r temporary-files', 'wpf binding', 'javascript code-generation playframework minify', 'php xml hash multidimensional-array simplexml-load-string', 'medical-science cancer healthcare', 'c# .net linq', 'actionscript-3 flex flex3', 'iis', 'c# linq string enumeration']
["<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter w

In [220]:
tag_tokenizer = Tokenizer(num_words=n_top_tags + 1)
tag_tokenizer.fit_on_texts(tag_list)
tag_matrix = tag_tokenizer.texts_to_matrix(tag_list)[:, 1:]

In [221]:
print(tag_matrix.shape)
print(tag_matrix)
print(list(tag_tokenizer.word_index.keys())[:11])

(20000, 8)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['c', 'net', 'java', 'android', 'php', 'javascript', 'jquery', 'asp', 'sql', 'windows', 'ruby']


# Vectorized Model

In [222]:
text_tokenizer = Tokenizer(num_words=vocab_size, char_level=char_model)
text_tokenizer.fit_on_texts(text_list)
text_matrix = text_tokenizer.texts_to_matrix(text_list, mode='tfidf')

# We have a numeric representation of the words in the questions
print(vocab_size)
print(text_matrix[:2])
print(list(text_tokenizer.word_index.keys())[:25])
print(list(text_tokenizer.word_index.keys())[vocab_size-25:vocab_size])

2000
[[0.         1.65506803 2.11881721 ... 0.         0.         0.        ]
 [0.         1.65506803 1.28502094 ... 0.         0.         0.        ]]
['p', 'the', 'i', 'to', 'code', 'a', 'gt', 'lt', 'is', 'and', 'pre', 'in', 'of', 'this', 'it', 'that', '0', '1', 'for', 'have', 'my', 'if', 'on', 'but', 'with']
['listbox', 'repo', 'python2', 'criteria', 'rvm', '42', 'logo', 'traffic', 'her', 'exceptions', 'radius', 'thumbnail', 'inputstream', 'efficient', 'agent', 'x81', 'webpage', 'friend', 'movie', '404', 'databases', 'actions', 'settext', 'suggestion', 'removing']


In [223]:
# Padd all sequences to the same size
y = tag_matrix

x_train, x_val, y_train, y_val = train_test_split(text_matrix, y, test_size=0.20, random_state=42)

In [224]:
x_train.shape

(16000, 2000)

In [225]:
# Make a Multi Logistic Regression Model
log_reg_model = Sequential()
log_reg_model.add(Dense(n_top_tags, activation='sigmoid', input_shape=(vocab_size, )))
log_reg_model.compile(optimizer=SGD(), loss = 'binary_crossentropy', metrics=['accuracy'])

In [226]:
log_reg_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 8)                 16008     
Total params: 16,008
Trainable params: 16,008
Non-trainable params: 0
_________________________________________________________________


In [227]:
log_reg_model.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x139629f60>

In [228]:
log_reg_score = f1_score(y_val, log_reg_model.predict(x_val) > 0.5, average=None)
print(log_reg_score)
print(np.mean(log_reg_score))

[0.32375979 0.29441624 0.29765013 0.54166667 0.37823834 0.27210884
 0.4516129  0.19138756]
0.34385506006665667


# Sequence Model

In [229]:
text_tokenizer = Tokenizer(num_words=vocab_size, char_level=char_model)
text_tokenizer.fit_on_texts(text_list)
text_matrix = text_tokenizer.texts_to_sequences(text_list)

# We have a numeric representation of the words in the questions
print(vocab_size)
print(text_matrix[:2])
print(list(text_tokenizer.word_index.keys())[:25])
print(list(text_tokenizer.word_index.keys())[vocab_size-25:vocab_size])

2000
[[1, 383, 50, 4, 376, 22, 34, 54, 9, 34, 141, 54, 92, 304, 344, 472, 1435, 42, 253, 54, 2, 98, 9, 16, 51, 46, 4, 783, 2, 170, 63, 609, 2, 79, 10, 782, 6, 42, 157, 28, 2, 79, 100, 1325, 63, 54, 79, 82, 783, 1, 1, 9, 55, 6, 85, 4, 376, 22, 2, 54, 9, 34, 141, 30, 1192, 2, 54, 1058, 46, 108, 1], [1, 12, 21, 1170, 3, 69, 510, 4, 581, 6, 841, 306, 134, 15, 979, 1607, 807, 4, 83, 16, 1009, 9, 2, 812, 343, 23, 181, 235, 3, 298, 478, 38, 991, 9, 2, 812, 343, 10, 1381, 510, 63, 1009, 14, 9, 26, 58, 3, 64, 9, 55, 6, 85, 4, 708, 510, 30, 1009, 1, 1, 1]]
['p', 'the', 'i', 'to', 'code', 'a', 'gt', 'lt', 'is', 'and', 'pre', 'in', 'of', 'this', 'it', 'that', '0', '1', 'for', 'have', 'my', 'if', 'on', 'but', 'with']
['listbox', 'repo', 'python2', 'criteria', 'rvm', '42', 'logo', 'traffic', 'her', 'exceptions', 'radius', 'thumbnail', 'inputstream', 'efficient', 'agent', 'x81', 'webpage', 'friend', 'movie', '404', 'databases', 'actions', 'settext', 'suggestion', 'removing']


In [230]:
# Padd all sequences to the same size
X = sequence.pad_sequences(text_matrix, maxlen=max_length, padding='pre', truncating='post')

y = tag_matrix

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [231]:
# Try a sequence model instead
seq_model = Sequential()
seq_model.add(Embedding(vocab_size, 100, input_shape=(max_length, )))
seq_model.add(Dropout(.2))
seq_model.add(LSTM(64))
seq_model.add(Dropout(.2))
seq_model.add(Dense(n_top_tags, activation='sigmoid'))
seq_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [232]:
seq_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 350, 100)          200000    
_________________________________________________________________
dropout_19 (Dropout)         (None, 350, 100)          0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 64)                42240     
_________________________________________________________________
dropout_20 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 8)                 520       
Total params: 242,760
Trainable params: 242,760
Non-trainable params: 0
_________________________________________________________________


In [233]:
seq_model.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x17072c1d0>

In [234]:
seq_score = f1_score(y_val, seq_model.predict(x_val) > 0.5, average=None)
print(seq_score)
print(np.mean(seq_score))

[0.33423181 0.53053435 0.49115044 0.78100264 0.58823529 0.51327434
 0.57788945 0.58381503]
0.5500166680767511
