# Use pre-trained GloVe words for embeddings

https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa

https://github.com/msahamed/yelp_comments_classification_nlp

https://github.com/msahamed/yelp_comments_classification_nlp/blob/master/word_embeddings.ipynb

In [44]:
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
#from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPooling1D, Bidirectional, Conv1D, concatenate
from keras.layers import Dense, Input, LSTM, Dropout, Activation, GlobalMaxPooling1D, Bidirectional, Conv1D, concatenate

# Apparently, defining a "pre-trained" embedding requires importing
# Embedding from a different module (??).
# This import path was not found in the Keras docs - https://keras.io/layers/embeddings/
from keras.layers.embeddings import Embedding
from keras.models import Model

from tools import *

In [45]:
# Load the raw string data through the project helper and unpack its four
# results into the variables used by the rest of the notebook.
(data_train, y_train_all,
 data_test, id_test) = load_data()

## Nettoyage des données (optionnel)

In [46]:
# Cleaning options, forwarded as keyword arguments to `clean_comment`.
params = dict(lower=False, lemma=False, stop_words=False)

# Sanity check: show one raw comment, a separator, then its cleaned version.
comment = data_train[2]
for shown in (comment, '-------'):
    print(shown)
print(clean_comment(comment, **params))

Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
-------
Hey man I m really not trying to edit war It s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page He seems to care more about the formatting than the actual info 


In [47]:
# Apply the same cleaning function and options to the train and test texts.
cleaning_call = dict(func=clean_comment, kwargs=params)
clean_data_train = transform_dataset(data_train, **cleaning_call)
clean_data_test = transform_dataset(data_test, **cleaning_call)

Transformation: 100%       
Transformation: 100%       


## Définition des jeux de données

In [48]:
# NOTE(review): SENTENCE_LENGTH and VOCAB_SIZE are assigned in a later cell of
# this notebook. Unless `tools` also exports them, this cell fails when the
# notebook is run top to bottom on a fresh kernel -- confirm.
tokens_vectorizer = TokenVectorizer(
    max_features=VOCAB_SIZE,
    max_len=SENTENCE_LENGTH,
)

# Encode the cleaned texts (pass data_train / data_test here instead to
# encode the raw texts).
encoded_splits = encode(clean_data_train, clean_data_test,
                        vectorizer=tokens_vectorizer)
X_train_all, X_test = encoded_splits

ENCODING: Fitting vectorizer to data
ENCODING: transforming data to numerical


In [43]:
# Fraction of the training data held out for validation, and the seed that
# makes the split repeatable.
SPLIT_VALID_RATIO = 0.10
SPLIT_RANDOM_SEED = 0  # TODO : check split because of unbalanced classes

# Split the vectorizer-encoded training data. A later cell assigns the same
# four names again from the Keras-tokenized data.
split_options = dict(test_size=SPLIT_VALID_RATIO,
                     random_state=SPLIT_RANDOM_SEED)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all, y_train_all, **split_options
)

## Tokenization et découpage des données textuelles 

Conforme au github https://github.com/msahamed/yelp_comments_classification_nlp/blob/master/word_embeddings.ipynb

L'objectif est de pouvoir ensuite encoder avec GloVe (ne pas encoder directement, contrairement à ce que fait Nicolas dans models_testing.ipynb).

In [10]:
# Tokenization settings used when converting strings to int indexes:
# - VOCAB_SIZE: only the VOCAB_SIZE most common words are considered;
# - SENTENCE_LENGTH: sentences are padded to SENTENCE_LENGTH words.
# NOTE(review): an earlier cell (the TokenVectorizer one) already reads both
# constants; on a fresh top-to-bottom run they may need to be defined sooner.
VOCAB_SIZE = 30000
SENTENCE_LENGTH = 200  # 200 if stop_words deleted, 120 otherwise

In [31]:
# Fit a Keras tokenizer on the raw training comments; `num_words` limits the
# generated sequences to the VOCAB_SIZE most frequent words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(data_train)

# Integer-encode the training comments and pad them to SENTENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(data_train)
data = pad_sequences(sequences, maxlen=SENTENCE_LENGTH)

# Encode the test comments with the SAME tokenizer. The embedding matrix and
# the model are indexed by `tokenizer.word_index`, so the test set must share
# that word -> index mapping. Without this, X_test keeps the encoding produced
# earlier by `encode(...)` with a different vectorizer fitted on cleaned text,
# and the final predictions would look up the wrong embedding rows.
X_test = pad_sequences(tokenizer.texts_to_sequences(data_test),
                       maxlen=SENTENCE_LENGTH)

data.shape

(159571, 200)

In [33]:
# Fraction of the training data held out for validation, and the seed that
# makes the split repeatable.
SPLIT_VALID_RATIO = 0.10
SPLIT_RANDOM_SEED = 0  # TODO : check split because of unbalanced classes

# Split the Keras-tokenized, padded training sequences (`data`) and labels.
split_options = dict(test_size=SPLIT_VALID_RATIO,
                     random_state=SPLIT_RANDOM_SEED)
X_train, X_valid, y_train, y_valid = train_test_split(
    data, y_train_all, **split_options
)

## Récupération des embeddings de Glove

GloVe is distributed with several pre-training corpora and embedding vector sizes; see:

https://nlp.stanford.edu/projects/glove/

In [18]:
# Parse the GloVe text file into a {word: vector} dictionary. Each line holds
# a word followed by its coefficients, separated by whitespace.
embeddings_index = dict()
# `with` closes the file even if parsing raises. The explicit UTF-8 encoding
# keeps the read independent of the platform's default encoding.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [28]:
# Number of columns of the embedding matrix, i.e. the size of one word vector.
# GloVe is available in several embedding sizes; this value has to match the
# vectors loaded from 'glove.6B/glove.6B.100d.txt' (presumably 100 values per
# word, judging by the file name -- confirm if another file is used).
EMBEDDING_DIM = 100

In [29]:
# Create the weight matrix for the words of the training documents: row
# `index` receives the GloVe vector of the word the tokenizer mapped to
# `index`. Rows that are never assigned (words absent from GloVe) stay zero.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, index in tokenizer.word_index.items():
    # Only indexes below VOCAB_SIZE fit in the matrix. The index comes from
    # the tokenizer, not from GloVe. Skip out-of-range words with `continue`
    # rather than `break`, so that no in-range word is missed when
    # `word_index` is not iterated in ascending index order.
    if index > VOCAB_SIZE - 1:
        continue
    # Look the word up in the GloVe dictionary (None when it is unknown).
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Définition du réseau de Yoon Kim

In [30]:
N_FILTERS = 100
MODEL_NAME = "embed_conv_fc_GLOVE"


def _ngram_branch(embedded, kernel_size):
    """Build one convolution branch on top of `embedded`.

    Applies a Conv1D layer with N_FILTERS relu filters of width `kernel_size`
    (i.e. n-grams of that size), followed by a GlobalMaxPooling1D layer.
    Returns the pair (convolution output, pooled output).
    """
    conv = Conv1D(filters=N_FILTERS,
                  kernel_size=kernel_size,
                  activation='relu')(embedded)
    pooled = GlobalMaxPooling1D()(conv)
    return conv, pooled


# Input: one row of SENTENCE_LENGTH word indexes per sample.
inp = Input(shape=(SENTENCE_LENGTH, ))

# Embedding initialised with the pre-built `embedding_matrix` and frozen
# (trainable=False), so its weights are not updated during training.
emb = Embedding(VOCAB_SIZE,
                EMBEDDING_DIM,
                input_length=SENTENCE_LENGTH,
                weights=[embedding_matrix],
                trainable=False)(inp)

# One branch per kernel size (3-, 4- and 5-grams), all reading the embedding.
conv_3, pool_3 = _ngram_branch(emb, 3)
conv_4, pool_4 = _ngram_branch(emb, 4)
conv_5, pool_5 = _ngram_branch(emb, 5)

# Merge the pooled branches, then a small dense head with dropout and a
# 6-unit sigmoid output layer.
x = concatenate([pool_3, pool_4, pool_5], axis=1)
x = Dropout(0.1)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
outp = Dense(6, activation='sigmoid')(x)

# To reuse a pre-trained model from disk instead of building a new one:
# model = load_nnet(MODEL_NAME)

model = Model(inputs=inp, outputs=outp)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 32
N_EPOCHS = 2

# The validation pair is given both to the project callback and to fit().
validation_pair = (X_valid, y_valid)
RocAuc = RocAucEvaluation(validation_data=validation_pair)

fit_options = dict(batch_size=BATCH_SIZE,
                   epochs=N_EPOCHS,
                   validation_data=validation_pair,
                   callbacks=[RocAuc])
hist = model.fit(X_train, y_train, **fit_options)

# Save the trained network to disk for later use.
save_nnet(model, MODEL_NAME)

In [None]:
# Final evaluation: predict on each split, score the predictions with the
# project's `evaluate` helper, and print the score (reported as ROC-AUC).
y_train_pred = model.predict(X_train, batch_size=512)
train_score = evaluate(y_train, y_train_pred)
train_report = "ROC-AUC score on train set : {:.4f}".format(train_score)
print(train_report)

y_valid_pred = model.predict(X_valid, batch_size=512)
valid_score = evaluate(y_valid, y_valid_pred)
valid_report = "ROC-AUC score on validation set : {:.4f}".format(valid_score)
print(valid_report)

In [None]:
# Predict on the encoded test set.
# NOTE(review): X_test must have been encoded with the same word -> index
# mapping as the data the model was trained on; verify which cell last
# assigned it.
y_test_pred = model.predict(
    X_test,
    batch_size=512,
    verbose=2,
)

In [None]:
# Write the submission file for the test predictions, named after the model.
submission(
    y_test_pred,
    id_test,
    name=MODEL_NAME,
)

# Use pre-trained word2vec words for embeddings
