# Pré-traitement des données

## Chargement des données

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPooling1D, Bidirectional, Conv1D, concatenate
from keras.models import Model

from tools import *
from embeddings import *
from models import *

Using TensorFlow backend.


In [2]:
# load raw string data
data_train, y_train_all, data_test, id_test = load_data()

## Nettoyage des données textuelles (optionnel)

In [3]:
params = {'lower': True, 
          'lemma': False, 
          'stop_words': False}

comment = data_train[2]
print(comment)
print('-------')
print(CommentCleaner(**params).transform(comment))

Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
-------
hey man i m really not trying to edit war it s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info 


In [4]:
clean_data_train = transform_dataset(data_train, transformer=CommentCleaner, kwargs=params)
clean_data_test = transform_dataset(data_test, transformer=CommentCleaner, kwargs=params)

Transformation: 100%       
Transformation: 100%       


## Conversion numérique des données textuelles

In [5]:
# Convert strings to int indexes, 
# considering only the VOCAB_SIZE most common words, 
# and pad the sentences to SENTENCE_LENGTH words
VOCAB_SIZE = 30000
SENTENCE_LENGTH = 200  # 200 if stop_words deleted, 120 otherwise

tokenizer = TokenVectorizer(max_len=SENTENCE_LENGTH, max_features=VOCAB_SIZE)

# X_train_all, X_test = encode(data_train, data_test, vectorizer=tokens_vectorizer)
X_train_all, X_test = encode(clean_data_train, clean_data_test, vectorizer=tokenizer)

ENCODING: Fitting vectorizer to data
ENCODING: transforming data to numerical


## Extraction des features auxiliaires

In [7]:
print("Computing comments length")
comments_lengths_train = np.array(transform_dataset(data_train, transformer=CommentLength, n_prints=5))
comments_lengths_test = np.array(transform_dataset(data_test, transformer=CommentLength, n_prints=5))

print("Computing number of punctuation marks in comments")
params = {'divide_by_len': True, 'chars_set': {'!'}}
comments_punctuation_train = np.array(transform_dataset(data_train, transformer=CharCounter, kwargs=params))
comments_punctuation_test = np.array(transform_dataset(data_test, transformer=CharCounter, kwargs=params))

print("Computing number of upper cased words in comments")
params = {'divide_by_len': True}
comments_upperwords_train = np.array(transform_dataset(data_train, transformer=UppercaseWordsCounter, kwargs=params))
comments_upperwords_test = np.array(transform_dataset(data_test, transformer=UppercaseWordsCounter, kwargs=params))

# concatenation of auxiliary features
X_aux_train_all = np.vstack((comments_lengths_train, comments_punctuation_train, comments_upperwords_train)).T
X_aux_test = np.vstack((comments_lengths_test, comments_punctuation_test, comments_upperwords_test)).T

Computing comments length
Transformation: 100%       
Transformation: 100%       
Computing number of punctuation marks in comments
Transformation: 100%       
Transformation: 100%       
Computing number of upper cased words in comments
Transformation: 100%       
Transformation: 100%       


## Séparation du jeu d'entraînement et de validation

In [8]:
SPLIT_VALID_RATIO = 0.10
SPLIT_RANDOM_SEED = 0  # TODO : check split because of imbalanced classes

# numerical comments
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, 
                                                      test_size=SPLIT_VALID_RATIO,
                                                      random_state=SPLIT_RANDOM_SEED)

# auxiliary input
X_aux_train, X_aux_valid, _, _ = train_test_split(X_aux_train_all, y_train_all, 
                                                      test_size=SPLIT_VALID_RATIO,
                                                      random_state=SPLIT_RANDOM_SEED)

# Test des différents modèles

## Embeddings + LSTM + 2 fc

In [25]:
EMBEDDING_DIM = 150
TRAIN_EMBEDDINGS = True
MODEL_NAME = "draft_embed_bidirlstm_2fc"

model = bidirectional_lstm(sentence_length=SENTENCE_LENGTH, vocab_size=VOCAB_SIZE,
                    embedding_dim=EMBEDDING_DIM, embedding_matrix=None, train_embeddings=TRAIN_EMBEDDINGS)

In [10]:
# train
BATCH_SIZE = 32
N_EPOCHS = 2

RocAuc = RocAucEvaluation(validation_data=(X_valid, y_valid))

hist = model.fit(X_train, y_train, 
                 batch_size=BATCH_SIZE, 
                 epochs=N_EPOCHS, 
                 validation_data=(X_valid, y_valid),
                 callbacks=[RocAuc])

# save trained nnet to disk for later use
save_nnet(model, MODEL_NAME)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
epoch: 1 - val_roc_auc: 0.9765
Epoch 2/2
epoch: 2 - val_roc_auc: 0.9815


In [11]:
# final model evaluation
y_train_pred = model.predict(X_train, batch_size=512)
train_score = evaluate(y_train, y_train_pred)
print("ROC-AUC score on train set : {:.4f}".format(train_score)) 

y_valid_pred = model.predict(X_valid, batch_size=512)
valid_score = evaluate(y_valid, y_valid_pred)
print("ROC-AUC score on validation set : {:.4f}".format(valid_score))

ROC-AUC score on train set : 0.9898
ROC-AUC score on validation set : 0.9815


In [12]:
# predict
y_test_pred = model.predict(X_test, batch_size=512, verbose=2)

In [13]:
# write submission file
submission(y_test_pred, id_test, name=MODEL_NAME)

## Embeddings + conv1D parallèles + fc (Yoon Kim)

In [10]:
EMBEDDING_DIM = 150
TRAIN_EMBEDDINGS = True
N_FILTERS = 100
FILTERS_SIZES = (3, 5, 7)
MODEL_NAME = "embed_conv_fc"

model = yoon_kim(sentence_length=SENTENCE_LENGTH, vocab_size=VOCAB_SIZE,
                 n_filters=N_FILTERS, filters_sizes=FILTERS_SIZES,
                 embedding_dim=EMBEDDING_DIM, embedding_matrix=None, train_embeddings=TRAIN_EMBEDDINGS)

In [11]:
# train
BATCH_SIZE = 32
N_EPOCHS = 2

RocAuc = RocAucEvaluation(validation_data=(X_valid, y_valid))

hist = model.fit(X_train, y_train, 
                 batch_size=BATCH_SIZE, 
                 epochs=N_EPOCHS, 
                 validation_data=(X_valid, y_valid),
                 callbacks=[RocAuc])

# save trained nnet to disk for later use
save_nnet(model, MODEL_NAME)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
epoch: 1 - val_roc_auc: 0.9800
Epoch 2/2
epoch: 2 - val_roc_auc: 0.9846


FileNotFoundError: [Errno 2] No such file or directory: 'models/embed_conv_fc.json'

In [None]:
# final model evaluation
y_train_pred = model.predict(X_train, batch_size=512)
train_score = evaluate(y_train, y_train_pred)
print("ROC-AUC score on train set : {:.4f}".format(train_score)) 

y_valid_pred = model.predict(X_valid, batch_size=512)
valid_score = evaluate(y_valid, y_valid_pred)
print("ROC-AUC score on validation set : {:.4f}".format(valid_score))

## GloVe twitter 200 + Bidir LSTM + auxiliary input

In [9]:
# Load GloVe pre-trained embeddings
EMBEDDING_DIM = 200  # several embeddings sizes depending on source : 25, 50, 100, 200, 300 
EMBEDDING_SOURCE = 'glove_twitter'  # {'glove_twitter', 'glove_wikipedia', 'word2vec_googlenews'}

embeddings_matrix = load_pretrained_embeddings(tokenizer.word_index, VOCAB_SIZE, EMBEDDING_DIM, EMBEDDING_SOURCE)

Number of pre-trained word vectors in database       : 1193514
Number of our words with a pre-trained embedding     : 26798
Percentage of our words with a pre-trained embedding : 89.327%


In [10]:
USE_AUX_FEATURES = True
TRAIN_EMBEDDINGS = True
MODEL_NAME = "bidirlstm_bipool_aux_1fc_glove_twitter_200t"

model = bidirectional_lstm(sentence_length=SENTENCE_LENGTH, vocab_size=VOCAB_SIZE,
                           embedding_dim=EMBEDDING_DIM, embedding_matrix=embeddings_matrix, train_embeddings=TRAIN_EMBEDDINGS,
                           aux_input_dim=X_aux_train.shape[1] if USE_AUX_FEATURES else None)

In [None]:
# train
BATCH_SIZE = 32
N_EPOCHS = 2

RocAuc = RocAucEvaluation(validation_data=([X_valid, X_aux_valid] if USE_AUX_FEATURES else X_valid, y_valid))

hist = model.fit([X_train, X_aux_train] if USE_AUX_FEATURES else X_train, y_train, 
                 batch_size=BATCH_SIZE, 
                 epochs=N_EPOCHS, 
                 validation_data=([X_valid, X_aux_valid] if USE_AUX_FEATURES else X_valid, y_valid),
                 callbacks=[RocAuc])

# save trained nnet to disk for later use
save_nnet(model, MODEL_NAME)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
epoch: 1 - val_roc_auc: 0.9848
Epoch 2/2

In [None]:
# final model evaluation
y_train_pred = model.predict([X_train, X_aux_train] if USE_AUX_FEATURES else X_train, batch_size=512)
train_score = evaluate(y_train, y_train_pred)
print("ROC-AUC score on train set : {:.4f}".format(train_score))

y_valid_pred = model.predict([X_valid, X_aux_valid] if USE_AUX_FEATURES else X_valid, batch_size=512)
valid_score = evaluate(y_valid, y_valid_pred)
print("ROC-AUC score on validation set : {:.4f}".format(valid_score))

# Predictions et soumission

In [None]:
# predict
y_test_pred = model.predict([X_test, X_aux_test] if USE_AUX_FEATURES else X_test, batch_size=512, verbose=1)

In [None]:
# write submission file
submission(y_test_pred, id_test, name=MODEL_NAME)