In [1]:
import os
import sys
# Put the parent directory on the import path (once) so the project packages
# imported below (feature_extraction, database, evaluation, models) can be
# resolved -- presumably they live one level above this notebook.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D, Convolution1D, MaxPooling1D
from keras.models import Model, Sequential
from keras.callbacks import TensorBoard

from feature_extraction.features import get_glove_w2v
from database.utils import get_train_test_data
from evaluation.metrics import *
from models.nn_models import *

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [5]:
# Per-dataset splits: iterated in the last cell as
# (train texts, train labels, test texts, test labels, indicator) tuples.
train_test_data = get_train_test_data()
# Merged split, unpacked as one (X_train, y_train, X_test, y_test) 4-tuple.
# NOTE(review): the training cell calls get_train_test_data(merge=True) again
# before using these names, so this load looks redundant -- confirm before removing.
X_train, y_train, X_test, y_test = get_train_test_data(merge=True)

# Pre-trained word vectors (presumably GloVe, going by the helper's name);
# passed to get_embedding_matrix() in the training cell.
w2v = get_glove_w2v()

Found 1193514 word vectors.


In [45]:
# --- Hyperparameters for the text pipeline ---------------------------------
embed_size = 200      # dimensionality of each word vector
max_features = 20000  # dictionary size kept by the tokenizer
maxlen = 75           # max number of words of a tweet fed to the model

# Reload the raw merged split: X_train / X_test are overwritten with padded
# index sequences below, so re-running this cell needs the original texts back.
X_train, y_train, X_test, y_test = get_train_test_data(merge=True)

# The vocabulary is built from the training texts only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))

# Texts -> integer sequences -> fixed-length arrays of width `maxlen`.
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

# Build the embedding matrix for this tokenizer and the model on top of it.
embedding_matrix = get_embedding_matrix(max_features, tokenizer, w2v, embed_size)
model = get_lstm_model(embedding_matrix)
#model = get_cnn_model(embedding_matrix)  # alternative architecture
model.summary()

# NOTE(review): the test split doubles as validation data here -- confirm this
# is intended, since reported test metrics are then not from held-out data.
val_data = (X_test, y_test)
fit_callbacks = [
    RocAucEvaluation(validation_data=val_data),
    TensorBoard(log_dir='./logs',
                histogram_freq=0,
                write_graph=True,
                write_images=False),
]

# Kept as the last expression so the History object is still the cell output.
model.fit(X_train, y_train,
          validation_data=val_data,
          epochs=5,
          batch_size=256,
          callbacks=fit_callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 75, 200)           3069000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 100)           100400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 153       
Total para

<keras.callbacks.History at 0x7f25c0fbfe80>

In [46]:
# Model outputs for the padded test sequences; consumed by
# class_report_multilabel() in the report cells.
y_scores = model.predict(X_test)

In [49]:
# Score the current model's test-set outputs. Only the first element of the
# returned 3-tuple (the summary measures) is kept.
measures, _, _ = class_report_multilabel(y_test, y_scores)

# `results` is otherwise only created in the CNN report cell, which sits
# further down the notebook. Create it on demand so this cell does not raise
# NameError on a fresh kernel run from top to bottom; an existing dict
# (and its rows) is left untouched.
if 'results' not in globals():
    results = {}
results['lstm'] = measures

# One row per model, one column per measure.
pd.DataFrame(results).T

ROC AUC for class 0: 0.97
Precision for class 0: 0.75
ROC AUC for class 1: 0.88
Precision for class 1: 0.65
ROC AUC for class 2: 0.98
Precision for class 2: 0.66
Test score: 0.93

Average precision score, micro-averaged over all classes: 0.69


Unnamed: 0,accuracy,avg_precision,f1score,recall,test_roc_auc
cnn,0.756891,0.703939,0.670724,0.684472,0.948295
lstm,0.747402,0.691062,0.623656,0.576398,0.92542


In [43]:
# Score the current `y_scores` and file them under the 'cnn' label.
# NOTE(review): this label is only correct if `model` was built with
# get_cnn_model() before `y_scores` was computed -- confirm before re-running.
#
# class_report_multilabel() is unpacked as a 3-tuple, matching the LSTM report
# cell, so that only the summary measures (not the whole tuple) are stored.
measures, _, _ = class_report_multilabel(y_test, y_scores)

# Create the results dict only if it does not exist yet, so rows recorded by
# other report cells are not wiped when this cell runs.
if 'results' not in globals():
    results = {}
results['cnn'] = measures

# One row per model, one column per measure.
pd.DataFrame(results).T

ROC AUC for class 0: 0.97
Precision for class 0: 0.79
ROC AUC for class 1: 0.88
Precision for class 1: 0.65
ROC AUC for class 2: 0.97
Precision for class 2: 0.73
Test score: 0.92

Average precision score, micro-averaged over all classes: 0.70


Unnamed: 0,accuracy,avg_precision,f1score,recall,test_roc_auc
cnn,0.756891,0.703939,0.670724,0.684472,0.923612


In [None]:
# Run the already-trained `model` on the test split of every individual dataset.
#
# The texts are encoded with the SAME `tokenizer` the model was trained with.
# Fitting a fresh Tokenizer per dataset (as this cell used to do) assigns
# different integer ids to the words, so the ids would no longer line up with
# the rows of the model's embedding matrix and the predictions would be
# meaningless.
#
# Loop-local names are used so the globals `tokenizer`, `X_train`, `X_test`,
# `y_train` and `y_test` from the training cell are not overwritten.
#
# To train one model per dataset instead, build a new tokenizer, embedding
# matrix and model inside the loop and fit it there, rather than calling
# `model.predict` on the shared model.
predictions = []  # (indicator, y_pred) pairs, in iteration order
for _texts_train, _labels_train, texts_test, _labels_test, indicator in train_test_data:
    print(indicator)

    sequences_test = tokenizer.texts_to_sequences(texts_test)
    padded_test = pad_sequences(sequences_test, maxlen=maxlen)

    # Keep every dataset's predictions instead of discarding all but the last.
    y_pred = model.predict(padded_test)
    predictions.append((indicator, y_pred))