In [1]:
cd ..

/home/vmadmin/pass


In [3]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from feature_extraction.features import *
from feature_extraction.features import get_glove_w2v
from database.utils import get_train_test_data

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the model.
maxlen = 75           # max number of words in a comment to use
max_features = 20000  # how many unique words to use (i.e num rows in embedding vector)
embed_size = 200      # how big is each word vector

# Default call: a collection of per-split tuples (iterated further down as
# 5-tuples ending in an indicator). merge=True: one combined train/test split.
train_test_data = get_train_test_data()
X_train, y_train, X_test, y_test = get_train_test_data(merge=True)

# Pretrained word -> vector lookup (used via .values() and .get() below).
w2v = get_glove_w2v()

Found 1193514 word vectors.


In [43]:
def get_embedding_matrix(max_features, tokenizer, w2v, embed_size):
    """Build the initial weight matrix for an Embedding layer.

    Every row is first drawn from a normal distribution whose mean and
    standard deviation match the pretrained vectors, so words without a
    pretrained vector start on the same scale. Rows of words that do have a
    pretrained vector are then overwritten with it.

    Parameters
    ----------
    max_features : int
        Vocabulary cap; words whose tokenizer index is >= this are skipped.
    tokenizer : object
        Fitted tokenizer exposing ``word_index`` (word -> integer index).
    w2v : dict
        Mapping word -> pretrained vector of length ``embed_size``.
    embed_size : int
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index)) + 1, embed_size)``.
    """
    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(w2v.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))

    # One extra row so that index ``nb_words`` itself is addressable.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words + 1, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = w2v.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [93]:
def get_model(embedding_matrix, seq_len=None, n_outputs=3):
    """Build and compile the bidirectional-LSTM classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    Bidirectional LSTM(50) -> global max pooling -> Dense(50, relu) ->
    Dropout(0.1) -> Dense(n_outputs, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like, shape (vocab_rows, embedding_dim)
        Initial embedding weights; both Embedding dimensions are read from it.
    seq_len : int, optional
        Length of the padded input sequences. Defaults to the notebook-level
        ``maxlen`` so existing ``get_model(embedding_matrix)`` calls are
        unchanged.
    n_outputs : int, optional
        Number of units in the output layer (default 3, as before).

    Returns
    -------
    keras.models.Model
        The compiled model; its summary is printed as a side effect.
    """
    if seq_len is None:
        seq_len = maxlen

    # Take both dimensions from the matrix so the layer can never disagree
    # with the weights it is initialised from.
    vocab_rows, embedding_dim = np.shape(embedding_matrix)

    inp = Input(shape=(seq_len,))
    x = Embedding(input_dim=vocab_rows, output_dim=embedding_dim, weights=[embedding_matrix])(inp)
    x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    # NOTE(review): sigmoid + binary_crossentropy scores each output
    # independently. If the labels are mutually exclusive classes,
    # softmax + categorical_crossentropy is the usual pairing - confirm.
    x = Dense(n_outputs, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()
    return model

In [92]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class roc_callback(Callback):
    """Keras callback that prints ROC-AUC on train and validation data per epoch.

    Only ``on_epoch_end`` is overridden; every other hook is inherited from
    ``Callback``, whose implementations already do nothing.

    Parameters
    ----------
    training_data : sequence
        ``(x, y)`` used for the training ROC-AUC.
    validation_data : sequence
        ``(x_val, y_val)`` used for the validation ROC-AUC.
    """

    def __init__(self, training_data, validation_data):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on both datasets and print the rounded ROC-AUC scores."""
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        # Leading '\r' plus trailing padding overwrites the current console line.
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')

In [95]:
# Fit the vocabulary on the training text only, then encode both splits with it.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))

list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)

# Store the padded id sequences under new names: X_train / X_test keep the raw
# text, so this cell gives the same result when it is re-run.
X_train_seq = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test_seq = pad_sequences(list_tokenized_test, maxlen=maxlen)

embedding_matrix = get_embedding_matrix(max_features, tokenizer, w2v, embed_size)
model = get_model(embedding_matrix)

# The callback prints ROC-AUC for both splits at the end of every epoch.
model.fit(
    X_train_seq, y_train,
    validation_data=(X_test_seq, y_test),
    epochs=5,
    callbacks=[roc_callback(training_data=(X_train_seq, y_train),
                            validation_data=(X_test_seq, y_test))],
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        (None, 75)                0         
_________________________________________________________________
embedding_24 (Embedding)     (None, 75, 200)           3069000   
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 75, 100)           100400    
_________________________________________________________________
global_max_pooling1d_19 (Glo (None, 100)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_18 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_36 (Dense)             (None, 3)                 153       
Total para

<keras.callbacks.History at 0x7fb12f7fbcf8>

In [117]:
len(train_test_data)

3

In [122]:
y_pred[0]

array([  2.89925515e-06,   4.33312869e-03,   8.76744390e-01], dtype=float32)

In [125]:
np.argmax(y_pred, axis=1)

array([2, 2, 2, ..., 1, 1, 1])

In [59]:
# Score each per-indicator test split with the already-trained `model`.
#
# The test text must be encoded with the SAME `tokenizer` that produced the
# model's training sequences: a tokenizer fitted per split assigns different
# integer ids to words, so the embedding rows would no longer line up.
# (To train one model per split instead, fit a fresh tokenizer, embedding
# matrix and model inside the loop and predict with that model.)
split_predictions = []  # (indicator, predictions) pairs, one per split
for _Xr_train, _yr_train, Xr_test, _yr_test, indicator in train_test_data:
    print(indicator)

    # Loop-local names are distinct from X_train / y_train / X_test / y_test
    # so the merged data defined earlier is not overwritten.
    X_split_test = pad_sequences(tokenizer.texts_to_sequences(Xr_test), maxlen=maxlen)

    # `y_pred` still ends up holding the last split's predictions, as before.
    y_pred = model.predict(X_split_test)
    split_predictions.append((indicator, y_pred))

sleep
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 75)                0         
_________________________________________________________________
embedding_21 (Embedding)     (None, 75, 200)           569000    
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 75, 100)           100400    
_________________________________________________________________
global_max_pooling1d_16 (Glo (None, 100)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_15 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 51        
Tota