# I - Données, pré-traitement  

In [221]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import sparse
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPooling1D, Bidirectional, Conv1D, concatenate
from keras.models import Model

from tools import *
from embeddings import *
from models import *

In [2]:
# load raw string data
data_train, y_train_all, data_test, id_test = load_data()

## Nettoyage des données (optionnel)

In [141]:
params = {'lower': True, 
          'lemma': False, 
          'stop_words': False}

comment = data_train[2]
print(comment)
print('-------')
print(CommentCleaner(**params).transform(comment))

Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
-------
hey man i m really not trying to edit war it s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info 


In [142]:
clean_data_train = transform_dataset(data_train, transformer=CommentCleaner, kwargs=params)
clean_data_test = transform_dataset(data_test, transformer=CommentCleaner, kwargs=params)

Transformation: 100%       
Transformation: 100%       


## Conversion numérique des données textuelles

In [211]:
# Convert strings to int indexes, 
# considering only the VOCAB_SIZE most common words, 
# and pad the sentences to SENTENCE_LENGTH words
VOCAB_SIZE = 30000
## TODO: set parameters in a better way


tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                                   min_df=10, max_features=VOCAB_SIZE, use_idf=True, smooth_idf=True,
                                   sublinear_tf=False)

#count_vectorizer = CountVectorizer(analyzer='word', stop_words='english',
#                                  strip_accents='unicode', max_features=VOCAB_SIZE)


# X_train_all, X_test = encode(data_train, data_test, vectorizer=tokens_vectorizer)
X_train_all, X_test = encode(clean_data_train, clean_data_test, vectorizer=tfidf_vectorizer)

ENCODING: Fitting vectorizer to data
ENCODING: transforming data to numerical


## Extraction des features auxiliaires

In [198]:
print("Computing comments length")
comments_lengths_train = np.array(transform_dataset(data_train, transformer=CommentLength, n_prints=5))
comments_lengths_test = np.array(transform_dataset(data_test, transformer=CommentLength, n_prints=5))

print("Computing number of punctuation marks in comments")
params = {'divide_by_len': True, 'chars_set': {'!'}}
comments_punctuation_train = np.array(transform_dataset(data_train, transformer=CharCounter, kwargs=params))
comments_punctuation_test = np.array(transform_dataset(data_test, transformer=CharCounter, kwargs=params))

print("Computing number of upper cased words in comments")
params = {'divide_by_len': True}
comments_upperwords_train = np.array(transform_dataset(data_train, transformer=UppercaseWordsCounter, kwargs=params))
comments_upperwords_test = np.array(transform_dataset(data_test, transformer=UppercaseWordsCounter, kwargs=params))

# concatenation of auxiliary features
X_aux_train_all = np.vstack((comments_lengths_train, comments_punctuation_train, comments_upperwords_train)).T
X_aux_test = np.vstack((comments_lengths_test, comments_punctuation_test, comments_upperwords_test)).T

Computing comments length
Transformation: 100%       
Transformation: 100%       
Computing number of punctuation marks in comments
Transformation: 100%       
Transformation: 100%       
Computing number of upper cased words in comments
Transformation: 100%       
Transformation: 100%       


In [212]:
SPLIT_VALID_RATIO = 0.10
SPLIT_RANDOM_SEED = 0  # TODO : check split because of imbalanced classes

# numerical comments
X_train, X_valid, y_train, y_valid = train_test_split(X_train_all, y_train_all, 
                                                      test_size=SPLIT_VALID_RATIO,
                                                      random_state=SPLIT_RANDOM_SEED)

# auxiliary input
X_aux_train, X_aux_valid, _, _ = train_test_split(X_aux_train_all, y_train_all, 
                                                  test_size=SPLIT_VALID_RATIO,
                                                  random_state=SPLIT_RANDOM_SEED)

## NBSVM on TFIDF/CBOW/...

In [213]:
params = {'C':6*[0.5],
          'dual': 6*[False],
          'solver': 6*['lbfgs']}
USE_AUX_FEATURES = False

model = OneVAllClassifier(n_classes=6, clf=NbSvmClassifier, params=params)

In [214]:
model.fit(hstack((X_train, X_aux_train)).astype(int).tocsr() if USE_AUX_FEATURES else X_train, y_train)

OneVAllClassifier(clf=None, n_classes=6, params=None)

In [215]:
y_train_pred = model.predict_proba(hstack((X_train, X_aux_train)).astype(int).tocsr() if USE_AUX_FEATURES else X_train)
y_valid_pred = model.predict_proba(hstack((X_valid, X_aux_valid)).astype(int).tocsr() if USE_AUX_FEATURES else X_valid)

In [216]:
train_score = evaluate(y_train, y_train_pred)
print("ROC-AUC score on train set : {:.4f}".format(train_score))

valid_score = evaluate(y_valid, y_valid_pred)
print("ROC-AUC score on validation set : {:.4f}".format(valid_score))

ROC-AUC score on train set : 0.9916
ROC-AUC score on validation set : 0.9845


In [210]:
# predict
y_test_pred = model.predict_proba(hstack((X_test, X_aux_test)).astype(int).tocsr() if USE_AUX_FEATURES else X_test)
# write submission file
submission(y_test_pred, id_test, name='NBSVM_Tfidf_max_df_10')    

In [217]:
np.sum(y_test_pred, axis=0)
np.sum(y_train_pred, axis=0)
np.sum(y_train, axis=0)

array([13763,  1434,  7591,   435,  7064,  1258])

## Auxiliar features

In [222]:
params = {'C':6*[0.5],
          'dual': 6*[False],
          'solver': 6*['lbfgs']}

model = OneVAllClassifier(n_classes=6, clf=NbSvmClassifier, params=params)
model.fit(sparse.csr_matrix(X_aux_train), y_train)

OneVAllClassifier(clf=None, n_classes=6, params=None)

In [223]:
y_train_pred = model.predict_proba(sparse.csr_matrix(X_aux_train))
y_valid_pred = model.predict_proba(sparse.csr_matrix(X_aux_valid))

In [224]:
train_score = evaluate(y_train, y_train_pred)
print("ROC-AUC score on train set : {:.4f}".format(train_score))

valid_score = evaluate(y_valid, y_valid_pred)
print("ROC-AUC score on validation set : {:.4f}".format(valid_score))

ROC-AUC score on train set : 0.6269
ROC-AUC score on validation set : 0.6353
