## Reading text files: train, dev and test:

In [2]:
DATA_PATH = "/Users/MeryemMhamdi/GoogleDriveEPFL/Gdrive Thesis/3 Algorithms Implementation/ECGA_Data/french/"

In [7]:
x_train = []
y_train = []
x_dev = []
y_dev = []
x_test = []
y_test = []
with open(DATA_PATH+"train.txt") as file:
    lines = file.readlines()
    for line in lines:
        parts = line.split("\t")
        x_train.append(parts[0])
        y_train.append(parts[1])
        
with open(DATA_PATH+"dev.txt") as file:
    lines = file.readlines()
    for line in lines:
        parts = line.split("\t")
        x_dev.append(parts[0])
        y_dev.append(parts[1])
        
with open(DATA_PATH+"test.txt") as file:
    lines = file.readlines()
    for line in lines:
        parts = line.split("\t")
        x_test.append(parts[0])
        y_test.append(parts[1])

In [11]:
print('len(x_train):',len(x_train))
print('len(y_train):',len(y_train))

print('len(x_dev):',len(x_dev))
print('len(y_dev):',len(y_dev))

print('len(x_test):',len(x_test))
print('len(y_test):',len(y_test))

len(x_train): 43101
len(y_train): 43101
len(x_dev): 10780
len(y_dev): 10780
len(x_test): 13471
len(y_test): 13471


# Preprocessing (tokenization, stop word removal, digit removal):

In [39]:
from string import punctuation

import nltk
import numpy as np
from nltk.corpus import stopwords
from tqdm import tqdm

def num_there(s):
    return not any(i.isdigit() for i in s)

def all_punct(s):
    return not all(i in punctuation for i in s)

def nltk_tokenizer(x_raw):
    """
    :param mode: specifies whether it is train, validation or test part of the data to be tokenized
    :return: tokens_list: the list of tokens (no lemmatization) per each sentence
    """
    tokens_list = []  # List of list of tokens
    for i in tqdm(range(0, len(x_raw))):
        ## Tokenizing each sentence in doc
        tokens = nltk.word_tokenize(x_raw[i])
        tokens_doc = [word.lower() for word in tokens if all_punct(word) and
                       word not in stopwords.words("french") and num_there(word)]
        
        tokens_list.append(tokens_doc)

    return tokens_list

In [94]:
x_train_pro = nltk_tokenizer(x_train)

  1%|▏         | 557/43101 [00:22<21:21, 33.20it/s]  

KeyboardInterrupt: 

In [41]:
x_dev_pro = nltk_tokenizer(x_train)

100%|██████████| 200/200 [00:07<00:00, 25.26it/s]


In [42]:
x_test_pro = nltk_tokenizer(x_train)

100%|██████████| 200/200 [00:08<00:00, 24.09it/s]


In [47]:
max_sequences = np.max([len(x) for x in x_train_pro]+[len(x) for x in x_dev_pro]+[len(x) for x in x_test_pro])
print("max_sequences=",max_sequences)

max_sequences= 569


# Vocabulary Creation and Converting to IDs:

In [62]:
x_all = x_train_pro+x_dev_pro+x_test_pro

vocab_dict = {}
for doc in x_all:
    for token in doc:
        if token in vocab_dict:
            vocab_dict[token] += 1
        else:
            vocab_dict[token] = 1
vocab_list = sorted(vocab_dict)
vocab = dict([x, y] for (y, x) in enumerate(vocab_list))

In [63]:
vocab

{"'allez": 0,
 "'aux": 1,
 "'d'accord": 2,
 "'d'usage": 3,
 "'faites": 4,
 "'fermée": 5,
 "'fratrie": 6,
 "'grenelle": 7,
 "'honore": 8,
 "'il": 9,
 "'insupportables": 10,
 "'je": 11,
 "'m": 12,
 "'maintenant": 13,
 "'ni-ni": 14,
 "'niche": 15,
 "'non": 16,
 "'nous": 17,
 "'ou": 18,
 "'oublie": 19,
 "'oui": 20,
 "'pas": 21,
 "'rations": 22,
 "'rendez-vous": 23,
 "'s": 24,
 "'tout": 25,
 "'têtes": 26,
 "'vous": 27,
 "'à": 28,
 '*groupe': 29,
 '*prix': 30,
 '*qualité': 31,
 '*résultats': 32,
 '*simples': 33,
 '-deux': 34,
 '-départ': 35,
 '-que': 36,
 './jyc': 37,
 './md': 38,
 './ml': 39,
 '.bfx': 40,
 '.jir/nim': 41,
 '/abo': 42,
 '/abo/lm': 43,
 '/ap': 44,
 '/ap/cr/hpa': 45,
 '/ba': 46,
 '/ba/gb': 47,
 '/ban': 48,
 '/ban/md': 49,
 '/be/fan': 50,
 '/bg/ap': 51,
 '/bg/cc': 52,
 '/bg/ej': 53,
 '/cbe': 54,
 '/cc': 55,
 '/cc/ej': 56,
 '/cc/hf': 57,
 '/cca': 58,
 '/cca/gb': 59,
 '/cca/jpc': 60,
 '/cr': 61,
 '/db/fan': 62,
 '/dh/msk/cr': 63,
 '/dr': 64,
 '/dr/cr': 65,
 '/dr/jlf': 66,
 '/dr/j

In [64]:
sequences_train = []
for doc in x_train_pro:
    list_ids_sub = []
    for token in doc:
        list_ids_sub.append(vocab[token])
    sequences_train.append(list_ids_sub)
    
sequences_dev = []
for doc in x_dev_pro:
    list_ids_sub = []
    for token in doc:
        list_ids_sub.append(vocab[token])
    sequences_dev.append(list_ids_sub)
    
sequences_test = []
for doc in x_test_pro:
    list_ids_sub = []
    for token in doc:
        list_ids_sub.append(vocab[token])
    sequences_test.append(list_ids_sub)

In [75]:
from keras.preprocessing.sequence import pad_sequences
data_train = pad_sequences(sequences_train, padding='post', maxlen=max_sequences, value=len(vocab))
data_dev = pad_sequences(sequences_dev, padding='post', maxlen=max_sequences, value=len(vocab))
data_test = pad_sequences(sequences_test, padding='post', maxlen=max_sequences, value=len(vocab))

# Building Embedding Matrix:

# One Hot Encoding:

In [92]:
from keras.utils import to_categorical
one_hot_train = to_categorical(list(y_train[0:1000]), num_classes=4)
one_hot_dev = to_categorical(list(y_dev[0:200]), num_classes=4)
one_hot_test = to_categorical(list(y_test[0:200]), num_classes=4)

# Training and Testing:

In [95]:
from keras.layers import Embedding, Input, Dense, AveragePooling1D, Lambda, Dropout, average, Flatten
from keras.models import Model
import keras.backend as K
from keras.engine.topology import Layer
from keras.optimizers import Adam, SGD

class ZeroMaskedEntries(Layer):
    """
    This layer is called after an Embedding layer.
    It zeros out all of the masked-out embeddings.
    It also swallows the mask without passing it on.
    You can change this to default pass-on behavior as follows:

    def compute_mask(self, x, mask=None):
        if not self.mask_zero:
            return None
        else:
            return K.not_equal(x, 0)
    """

    def __init__(self, **kwargs):
        self.support_mask = True
        super(ZeroMaskedEntries, self).__init__(**kwargs)

    def build(self, input_shape):
        self.output_dim = input_shape[1]
        self.repeat_dim = input_shape[2]

    def call(self, x, mask=None):
        mask = K.cast(mask, 'float32')
        mask = K.repeat(mask, self.repeat_dim)
        mask = K.permute_dimensions(mask, (0, 2, 1))
        return x * mask

    def compute_mask(self, input_shape, input_mask=None):
        return None


def mask_aware_mean(x):
    # recreate the masks - all zero rows have been masked
    mask = K.not_equal(K.sum(K.abs(x), axis=2, keepdims=True), 0)

    # number of that rows are not all zeros
    n = K.sum(K.cast(mask, 'float32'), axis=1, keepdims=False)

    # compute mask-aware mean of x
    x_mean = K.sum(x, axis=1, keepdims=False) / n

    return x_mean


def mask_aware_mean_output_shape(input_shape):
    shape = list(input_shape)
    assert len(shape) == 3
    return (shape[0], shape[2])

class MLPFineTuneModel(object):

    def __init__(self, n_classes):
        sequence_input = Input(shape=(max_sequences,), dtype='int32')
        embedding_layer = Embedding(len(vocab)+1, 300, input_length=max_sequences, trainable=True)(sequence_input)

        print(embedding_layer.shape)
        #embed_zeroed = ZeroMaskedEntries()(embedding_layer)


        #lambda_mean = Lambda(mask_aware_mean, mask_aware_mean_output_shape)(embed_zeroed)

        # average_emb = average(embedding_layer)
        average_emb = AveragePooling1D(pool_size=(max_sequences,))(embedding_layer)


        dense_layer = Dense(512, input_shape=(300,), activation='relu')(average_emb)

        dropout_layer = Dropout(0.3)(dense_layer)

        flat = Flatten()(dropout_layer)

        softmax_layer = Dense(4, activation='softmax')(flat)

        model = Model(input=sequence_input, output=softmax_layer)

        adam = Adam(lr=0.001, beta_1=0.7, beta_2=0.999, epsilon=1e-08)
        #sgd = SGD(lr=args.learning_rate, decay=0, momentum=0.9, nesterov=True)

        model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=["accuracy"])

        self.model = model

In [96]:
cldc_model = MLPFineTuneModel(4)

(?, 569, 300)




In [97]:
history = cldc_model.model.fit(data_train, one_hot_train,
                               batch_size=32,
                               epochs=100,
                               shuffle=True,
                               validation_data=(data_dev, one_hot_dev))

Train on 1000 samples, validate on 200 samples
Epoch 1/100
  32/1000 [..............................] - ETA: 29s - loss: 1.3906 - acc: 0.0625

            1%|▏         | 557/43101 [00:40<50:56, 13.92it/s]

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100

KeyboardInterrupt: 