In [3]:
import pandas as pd
import keras
from keras.preprocessing import text, sequence
from sklearn.metrics import accuracy_score

import util
from model import get_lstm, get_cnn, get_mlp

In [4]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')

total = train.append(test)
total['tokens'] = total['doc'].apply(util.tokenize)

# LSTM

In [5]:
max_features = util.CONFIG['LSTM']['MAX_FEATURES']
max_len = util.CONFIG['LSTM']['MAX_LEN']
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(total['tokens'])
tokenized_sentence = tokenizer.texts_to_sequences(total['tokens'])
sequence_array = sequence.pad_sequences(tokenized_sentence, maxlen=max_len)

train_num = train.shape[0]
batch_size = util.CONFIG['LSTM']['BATCH_SIZE']
epochs = util.CONFIG['LSTM']['EPOCHS']
num_classes = util.CONFIG['CLASS_NUM']
X = sequence_array
Y = total['label'].values - 1

train_x = X[:train_num]
test_x = X[train_num:]
train_y = Y[:train_num]
test_y = Y[train_num:]

In [6]:
model = get_lstm(max_len, max_features)

model.fit(train_x, 
          keras.utils.to_categorical(train_y, num_classes), 
          batch_size=batch_size, 
          epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe1f6ec5710>

In [7]:
pred = model.predict(test_x).argmax(axis=1)
accuracy_score(test_y, pred)

0.9028532608695652

# CNN

In [8]:
model = get_cnn(max_len, max_features, embed_size=128)
model.fit(train_x, 
          keras.utils.to_categorical(train_y, num_classes), 
          batch_size=batch_size, 
          epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe1d86a4160>

In [9]:
pred = model.predict(test_x).argmax(axis=1)
accuracy_score(test_y, pred)

0.8525815217391305

# MLP + tfidf

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
vectorizer = TfidfVectorizer(tokenizer=util.tokenize)
vectorizer.fit(total['doc'])

train_vector = vectorizer.transform(total.iloc[:train_num, 0])
test_vector = vectorizer.transform(total.iloc[train_num:, 0])

input_size = train_vector.shape[1]

In [12]:
model = get_mlp(input_size=input_size)
model.fit(train_vector, 
          keras.utils.to_categorical(train_y, num_classes), 
          batch_size=batch_size, 
          epochs=epochs)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe1af33a1d0>

In [13]:
pred = model.predict(test_vector).argmax(axis=1)
accuracy_score(test_y, pred)

0.9449728260869565