In [None]:
# IMPLEMENTATION taken from https://github.com/udsclub/ucu_sentiment/blob/master/projects/p01/notebooks/english_cnn_emb_on_eng_lang.ipynb

In [22]:
import os
import numpy as np
from keras import backend as K
import csv
import gzip
from sklearn.model_selection import train_test_split
import random
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
import string
from gensim.models.doc2vec import TaggedDocument
import gensim.models as g
import codecs
from sklearn.metrics import f1_score

In [41]:
def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend ops.

    Precision and recall are computed on the current batch only (after
    clipping to [0, 1] and rounding), then combined as their harmonic mean.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; rounded to 0/1 before counting.

    Returns:
        2 * precision * recall / (precision + recall + epsilon).
    """
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    # Distinct names: the original rebound `precision`/`recall` over the helpers.
    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    # K.epsilon() keeps the ratio finite when precision and recall are both 0
    # (a batch with no true positives); without it the result is 0/0.
    return 2 * (precision_value * recall_value) / (precision_value + recall_value + K.epsilon())

In [7]:
def load_initial_data(path):
    """Read a gzip-compressed file and evaluate every non-blank line.

    Args:
        path: location of the gzip file; opened in binary mode.

    Returns:
        A list with one evaluated object per non-blank line, in file order.
    """
    records = []
    # Context manager closes the gzip handle even if a line fails to evaluate.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            if not line.strip():
                # Blank lines are not valid expressions; skip them instead of crashing.
                continue
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on trusted data files.
            records.append(eval(line))
    return records

In [11]:
# Concatenate the records of every file found in the "amazon" directory.
data = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# order of `data` (and anything sliced from it) differ between machines/runs.
for file in sorted(os.listdir("amazon")):
    data += load_initial_data(os.path.join("amazon", file))

In [16]:
# Work on the first 10% of the loaded records.
# NOTE(review): the slice is taken BEFORE shuffling, so the sample is the first
# 10% in load order, not a uniform random sample - confirm this is intended.
SAMPLE_FRACTION = 0.1
SHUFFLE_SEED = 42

# Slice into a new list instead of rebinding `data`: re-running this cell no
# longer shrinks the dataset by another factor of ten each time.
data_sample = data[:int(len(data) * SAMPLE_FRACTION)]
# Private, seeded generator makes the row order reproducible across runs.
random.Random(SHUFFLE_SEED).shuffle(data_sample)
dataframe = pd.DataFrame(data_sample)

In [17]:
# Features are the question texts; the target is 1 for "yes/no" questions, else 0.
dataX, dataY = dataframe["question"], dataframe["questionType"]
# Vectorised comparison instead of a per-row lambda; `.values` replaces
# `Series.as_matrix()`, which no longer exists in current pandas releases.
dataY = (dataY == "yes/no").astype(int).values

  


In [20]:
questions =list()
for el in range(len(dataframe)):
    questions.append(TaggedDocument(dataX[el], [dataY[el]]))

In [23]:
question_vecs = list()
model = g.Doc2Vec(size=300, window=20, min_count=2, workers=8, alpha=0.025, min_alpha=0.01, dm=0)
model.build_vocab(questions)
model.train(questions, total_examples=len(questions), epochs=10)
#print(model.infer_vector(dataX[0]))
for el in range(len(dataX)):
    question_vecs.append((model.infer_vector(dataX[el])))



In [25]:
# Row position at 80% of the dataframe (truncated to an integer).
index = int(0.8 * len(dataframe))

In [26]:
# The model below is a character-level CNN: text2sequence() looks up every
# element of a sample in a character vocabulary. Feeding it the Doc2Vec float
# vectors (`question_vecs`) matches no vocabulary key, so every sequence came
# out empty and the network trained on all-zero input. Use the question text.
# Lower-cased because the vocabulary only contains lower-case letters.
# NOTE(review): assumes the char-CNN is the intended model - confirm.
data, labels = dataX.fillna("").astype(str).str.lower().tolist(), dataY
labels = np.asarray(labels, dtype='int8')

In [28]:
# Hold-out configuration
RANDOM_SEED = 42        # seed handed to the train/validation splitter
VALIDATION_SPLIT = 0.1  # fraction of samples reserved for validation

In [29]:
# Split the original data into stratified train and validation sets.
labels_int8 = np.asarray(labels, dtype='int8')
data_train, data_val, labels_train, labels_val = train_test_split(
    data,
    labels_int8,
    stratify=labels,
    random_state=RANDOM_SEED,
    test_size=VALIDATION_SPLIT,
)

In [30]:
# Dictionary size, maximum sequence length and the lower-case Latin alphabet.
MAX_SEQUENCE_LENGTH = 400
MAX_NB_WORDS = 74
eng_alphabet = list(string.ascii_lowercase)

In [31]:
def create_vocab_set():
    """Build the character-to-index lookup.

    The symbol list is `eng_alphabet` followed by digits, punctuation and
    whitespace characters. Indices start at 1, so 0 is never assigned here.

    Returns:
        (vocab, vocab_size): the mapping and the number of symbols listed.
    """
    symbols = [*eng_alphabet, *string.digits, *string.punctuation, *string.whitespace]
    char_to_index = {symbol: position for position, symbol in enumerate(symbols, start=1)}
    return char_to_index, len(symbols)

In [32]:
def text2sequence(text, vocab):
    """Encode every item of `text` as a list of vocabulary indices.

    Each element of an item is looked up in `vocab`; elements that are
    missing (or map to 0) are dropped from the output.

    Returns:
        A list with one index list per item of `text`, in the same order.
    """
    encoded = []
    for review in text:
        codes = (vocab.get(symbol, 0) for symbol in review)
        encoded.append([code for code in codes if code != 0])
    return encoded

In [33]:
# Build the character vocabulary, then encode both splits and pad them
# with zeros to MAX_SEQUENCE_LENGTH.
vocab, vocab_size = create_vocab_set()

X_train, X_val = (
    pad_sequences(text2sequence(split, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
    for split in (data_train, data_val)
)

In [34]:
from keras.models import Sequential
from keras.layers import GlobalMaxPooling1D, Conv1D, Dropout, Embedding, Dense
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

In [42]:
NAME = "char_cnn_emb"
EMBEDDING_DIM = 100

# initialize model: embedding -> two 1D convolutions -> global max pooling
# -> two dense layers with dropout in between -> sigmoid output
model = Sequential()
# vocab_size + 1 rows: vocabulary indices start at 1, index 0 is the padding value.
model.add(Embedding(vocab_size+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, trainable=True))
model.add(Conv1D(activation="relu", filters=200, kernel_size=4, padding="valid"))
model.add(Conv1D(activation="relu", filters=200, kernel_size=4, padding="valid"))
model.add(GlobalMaxPooling1D())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Stop once validation F1 has not improved for 5 consecutive epochs.
callback_2 = EarlyStopping(monitor='val_f1', min_delta=0, patience=5, verbose=0, mode='max')

# ModelCheckpoint does not create the target directory; without this the
# first checkpoint write fails on a fresh checkout.
os.makedirs("models", exist_ok=True)
# Keep only the weights of the epoch with the best validation F1.
callback_3 = ModelCheckpoint("models/model_{}.hdf5".format(NAME), monitor='val_f1',
                                 save_best_only=True, verbose=0, mode='max')

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[f1])

model.summary()
# validation_data is documented as a tuple (x_val, y_val); a list is not
# accepted by every Keras version.
# NOTE: after fit() returns, `model` holds the weights of the last epoch run;
# the best-scoring weights are the ones saved by the checkpoint callback.
model.fit(X_train, labels_train, validation_data=(X_val, labels_val),
          batch_size=1024, epochs=1000, callbacks=[callback_2, callback_3])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 100)          7500      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 397, 200)          80200     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 394, 200)          160200    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               10100     
__________

<keras.callbacks.History at 0x7f15f9845cc0>

In [37]:
# Sequential.predict_classes() no longer exists in current Keras releases.
# For a single sigmoid output it was a 0.5 threshold on predict(), so this
# produces the same int32 array of 0/1 labels on old and new versions.
arr = (model.predict(X_val) > 0.5).astype("int32")

In [38]:
# Flatten the predictions into a 1-D vector with one entry per sample.
labels_predicted = arr.reshape(arr.shape[0])

In [2]:
# F1 over the whole validation set, computed with scikit-learn.
f1_score(y_true=labels_val, y_pred=labels_predicted)