# I - Données, pré-traitement  

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import sparse
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

from tools import *
from embeddings import *
from models import *

Using TensorFlow backend.


In [2]:
# Load the raw data through the project helper: training comments, their
# label matrix, the test comments and the test ids.
data_train, y_train_all, data_test, id_test = load_data()

print('Nb comments: {} (y_shape: {})'.format(len(data_train), y_train_all.shape))
#print(y_train_all.sum(0)/y_train_all.shape[0])


DATA_AUGMENT = False
# Optional data augmentation: for each extra language, load the "translated"
# training comments and append ONLY the comments that carry at least one
# positive label (rows whose label sum is non-zero), together with their labels.
# Reference to the labels as loaded, kept so the pre-augmentation labels stay available.
y_train_all_no_aug = y_train_all
if DATA_AUGMENT:
    # Row indices with at least one positive label, computed on the un-augmented labels.
    y_train_all_toxic_idx = np.where(np.sum(y_train_all_no_aug, axis=1) != 0)[0]
    # Labels of those rows, read once from the un-augmented matrix so the result
    # does not depend on how far y_train_all has already grown inside the loop.
    y_train_toxic_labels = y_train_all_no_aug[y_train_all_toxic_idx]

    for language_extension in ['_fr', '_es', '_de']:
        print(language_extension)
        data_train_lg, _, _, _ = load_data(language=language_extension)
        # NOTE(review): assumes the per-language data is row-aligned with the
        # original training set, so the same indices select the same comments — confirm.
        data_train_lg_toxic = [data_train_lg[idx] for idx in y_train_all_toxic_idx]
        data_train += data_train_lg_toxic
        y_train_all = np.vstack((y_train_all, y_train_toxic_labels))

    print('Nb comments after data augment: {} (y_shape: {})'.format(len(data_train), y_train_all.shape))
    #print(y_train_all.sum(0)/y_train_all.shape[0])


Nb comments: 159571 (y_shape: (159571, 6))


In [3]:
# Bar chart comparing per-class counts with and without data augmentation.
# Only drawn when augmentation is enabled.
if DATA_AUGMENT:
    plt.figure()

    # Column vector with one entry per comment: 1 if the comment has at least
    # one positive label, 0 otherwise (before augmentation).
    y_train_all_no_aug_binary = np.atleast_2d(np.sum(y_train_all_no_aug, axis=1)).T
    y_train_all_no_aug_binary[y_train_all_no_aug_binary>0] = 1

    # Same indicator for the augmented label matrix.
    y_train_all_binary = np.atleast_2d(np.sum(y_train_all, axis=1)).T
    y_train_all_binary[y_train_all_binary>0] = 1

    plt.title('Class imbalance with data augmentation:\n {} vs. {} examples'.format(np.shape(y_train_all_no_aug)[0], np.shape(y_train_all)[0]))
    # First bar of each series is the number of comments without any positive
    # label; the remaining bars are the per-class positive counts.
    # The augmented counts are drawn first (red); the original counts are drawn over them.
    plt.bar(['clean']+CLASSES, [y_train_all_binary.shape[0]- np.sum(y_train_all_binary)] + list(np.sum(y_train_all, axis=0)), color='r', label='with augmentation')
    plt.bar(['clean']+CLASSES, [y_train_all_no_aug_binary.shape[0]- np.sum(y_train_all_no_aug_binary)] + list(np.sum(y_train_all_no_aug, axis=0)), label='without augmentation')
    plt.xticks(rotation=80)
    # The legend tells the two overlaid series apart.
    plt.legend()
    # Adjust the layout once everything is drawn, so the rotated tick labels
    # and the two-line title are taken into account.
    plt.tight_layout()

    plt.show()

## Nettoyage des données (optionnel)

In [4]:
# Options forwarded to CommentCleaner (project helper).
# NOTE(review): the name `params` is also assigned by later cells for other
# purposes; re-run this cell before re-running the cleaning step.
params = dict(lower=False, lemma=True, stop_words=False)

# Show one raw training comment next to its cleaned version as a sanity check.
comment = data_train[2]
cleaner = CommentCleaner(**params)
print(comment)
print('-------')
print(cleaner.transform(comment))

Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
-------
Hey man I m really not try to edit war It s just that this guy be constantly remove relevant information and talk to me through edit instead of my talk page He seem to care more about the format than the actual info


In [5]:
# Apply CommentCleaner with the options in `params` to the training comments,
# then to the test comments (same order as before: train first, test second).
clean_data_train, clean_data_test = (
    transform_dataset(raw_comments, transformer=CommentCleaner, kwargs=params)
    for raw_comments in (data_train, data_test)
)

Transformation: 100%       
Transformation: 100%       


## Conversion numérique des données textuelles

In [6]:
# Encode the cleaned comments numerically with a TF-IDF vectorizer over single
# words, keeping at most VOCAB_SIZE features.
VOCAB_SIZE = 30000
## TODO: set parameters in a better way

# NOTE(review): the option comments below assume TfidfVectorizer is
# scikit-learn's class (it reaches this notebook through a star import) — confirm.
tfidf_options = dict(
    analyzer='word',
    ngram_range=(1, 1),        # unigrams only
    min_df=10,                 # minimum document frequency for a term to be kept
    max_features=VOCAB_SIZE,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,         # sublinear term-frequency scaling
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Alternatives kept for reference:
# count_vectorizer = CountVectorizer(analyzer='word', stop_words='english',
#                                  strip_accents='unicode', max_features=VOCAB_SIZE)
# X_train_all, X_test = encode(data_train, data_test, vectorizer=tokens_vectorizer)

# `encode` is a project helper; it receives the cleaned train/test comments
# and the vectorizer, and returns the encoded train and test sets.
X_train_all, X_test = encode(clean_data_train,
                             clean_data_test,
                             vectorizer=tfidf_vectorizer)

ENCODING: Fitting vectorizer to data
ENCODING: transforming data to numerical


## Extraction des features auxiliaires

In [7]:
# Auxiliary hand-crafted features, computed on the RAW (uncleaned) comments.
# Each feature uses its own options dict so that the global `params`
# (used by the comment-cleaning step) is not overwritten by this cell.

print("Computing comments length")
comments_lengths_train = np.array(transform_dataset(data_train, transformer=CommentLength, n_prints=5))
comments_lengths_test = np.array(transform_dataset(data_test, transformer=CommentLength, n_prints=5))

print("Computing number of punctuation marks in comments")
# Only the exclamation mark is in the counted character set.
# `divide_by_len` presumably normalizes the count by comment length — verify in CharCounter.
punctuation_params = {'divide_by_len': True, 'chars_set': {'!'}}
comments_punctuation_train = np.array(transform_dataset(data_train, transformer=CharCounter, kwargs=punctuation_params))
comments_punctuation_test = np.array(transform_dataset(data_test, transformer=CharCounter, kwargs=punctuation_params))

print("Computing number of upper cased words in comments")
upperwords_params = {'divide_by_len': True}
comments_upperwords_train = np.array(transform_dataset(data_train, transformer=UppercaseWordsCounter, kwargs=upperwords_params))
comments_upperwords_test = np.array(transform_dataset(data_test, transformer=UppercaseWordsCounter, kwargs=upperwords_params))

# Concatenate the auxiliary features: stack the three feature vectors as rows,
# then transpose so each feature becomes a column
# (order: length, punctuation, upper-cased words).
X_aux_train_all = np.vstack((comments_lengths_train, comments_punctuation_train, comments_upperwords_train)).T
X_aux_test = np.vstack((comments_lengths_test, comments_punctuation_test, comments_upperwords_test)).T

Computing comments length
Transformation: 100%       
Transformation: 100%       
Computing number of punctuation marks in comments
Transformation: 100%       
Transformation: 100%       
Computing number of upper cased words in comments
Transformation: 100%       
Transformation: 100%       


In [8]:
SPLIT_VALID_RATIO = 0.10
SPLIT_RANDOM_SEED = 0  # TODO : check split because of imbalanced classes

# Split the encoded comments, the auxiliary features and the labels in ONE call:
# train_test_split applies the same row selection to every array it is given,
# so the three train/validation pairs are aligned by construction instead of
# relying on two separate calls that happen to share the same seed and ratio.
# Outputs come back as a (train, valid) pair per input array, in input order.
(X_train, X_valid,
 X_aux_train, X_aux_valid,
 y_train, y_valid) = train_test_split(X_train_all, X_aux_train_all, y_train_all,
                                      test_size=SPLIT_VALID_RATIO,
                                      random_state=SPLIT_RANDOM_SEED)

## NBSVM on TFIDF/CBOW/...

In [16]:
# Run configuration for the NBSVM experiment.
USE_AUX_FEATURES = False
# NOTE(review): the name ends in 'C_4' while C is set to 0.5 below — confirm the tag is intended.
MODEL_NAME = 'only_lemma_no_aux_TFIDF_NBSVM_C_4'

# Every hyper-parameter is given as a list of 6 identical values, matching
# n_classes=6 (presumably one entry per per-class classifier — verify against
# OneVAllClassifier).
params = {
    option: 6 * [value]
    for option, value in (('C', 0.5), ('dual', False), ('solver', 'lbfgs'))
}

model = OneVAllClassifier(n_classes=6, clf=NbSvmClassifier, params=params)

In [17]:
# Fit the one-vs-all model on the training split.
# With USE_AUX_FEATURES, the auxiliary columns are appended to the text features.
# The stacked matrix is deliberately NOT cast to int: TF-IDF weights and the
# length-normalized auxiliary ratios are fractional, so an integer cast would
# truncate them to 0. The same feature construction must be used at predict time.
model.fit(hstack((X_train, X_aux_train)).tocsr() if USE_AUX_FEATURES else X_train, y_train)

Fitting model 0:
Fitting model 1:
Fitting model 2:
Fitting model 3:
Fitting model 4:
Fitting model 5:


OneVAllClassifier(clf=None, n_classes=6, params=None)

In [18]:
# Predicted probabilities on the training and validation splits.
# With USE_AUX_FEATURES, the auxiliary columns are appended to the text features.
# No integer cast is applied: the features are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_train_pred = model.predict_proba(hstack((X_train, X_aux_train)).tocsr() if USE_AUX_FEATURES else X_train)
y_valid_pred = model.predict_proba(hstack((X_valid, X_aux_valid)).tocsr() if USE_AUX_FEATURES else X_valid)

In [19]:
# Score the predictions on each split with the project's `evaluate` helper
# (reported as ROC-AUC by the messages below) and print them with 4 decimals.
train_score = evaluate(y_train, y_train_pred)
train_report = "ROC-AUC score on train set : {:.4f}".format(train_score)
print(train_report)

valid_score = evaluate(y_valid, y_valid_pred)
valid_report = "ROC-AUC score on validation set : {:.4f}".format(valid_score)
print(valid_report)

ROC-AUC score on train set : 0.9958
ROC-AUC score on validation set : 0.9808


In [20]:
# Predicted probabilities on the whole training set (train + validation rows),
# saved under MODEL_NAME through the project's save_pred helper.
# No integer cast on the stacked features: they are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_train_all_pred = model.predict_proba(hstack((X_train_all, X_aux_train_all)).tocsr() if USE_AUX_FEATURES else X_train_all)
save_pred(y_train_all_pred, MODEL_NAME)

In [21]:
# Predict probabilities on the test set.
# No integer cast on the stacked features: they are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_test_pred = model.predict_proba(hstack((X_test, X_aux_test)).tocsr() if USE_AUX_FEATURES else X_test)
# Write the submission file, then store the raw test predictions under
# MODEL_NAME + '_test' (both through project helpers).
submission(y_test_pred, id_test, name=MODEL_NAME)
save_pred(y_test_pred, MODEL_NAME+'_test')

## Apprentissage sur features auxiliaires seules

In [222]:
# Same NBSVM configuration as a list of 6 identical values per hyper-parameter,
# matching n_classes=6.
params = {
    option: 6 * [value]
    for option, value in (('C', 0.5), ('dual', False), ('solver', 'lbfgs'))
}

# Train on the auxiliary features alone; the dense auxiliary matrix is
# converted to CSR before being handed to the model.
model = OneVAllClassifier(n_classes=6, clf=NbSvmClassifier, params=params)
X_aux_train_sparse = sparse.csr_matrix(X_aux_train)
model.fit(X_aux_train_sparse, y_train)

OneVAllClassifier(clf=None, n_classes=6, params=None)

In [223]:
# Predicted probabilities from the auxiliary-features-only model, for the
# training split first and the validation split second; each auxiliary matrix
# is converted to CSR before prediction.
y_train_pred, y_valid_pred = (
    model.predict_proba(sparse.csr_matrix(aux_block))
    for aux_block in (X_aux_train, X_aux_valid)
)

In [224]:
# Score the auxiliary-only predictions on each split with the project's
# `evaluate` helper and print the results with 4 decimals.
train_score = evaluate(y_train, y_train_pred)
train_report = "ROC-AUC score on train set : {:.4f}".format(train_score)
print(train_report)

valid_score = evaluate(y_valid, y_valid_pred)
valid_report = "ROC-AUC score on validation set : {:.4f}".format(valid_score)
print(valid_report)

ROC-AUC score on train set : 0.6269
ROC-AUC score on validation set : 0.6353


## Autres modèles que NBSVM

In [54]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
# GradientBoostingC/R doesnt have predict_proba, GaussianNB need dense data (impossible here, due to RAM limitation) 

  from numpy.core.umath_tests import inner1d


In [55]:
# Run configuration for the XGBoost experiment.
USE_AUX_FEATURES = False
MODEL_NAME = 'XGB'

# Per-classifier option lists (6 entries, matching n_classes=6).
# params_RFC is defined but not used by the model built below.
params_RFC = {}
params_XGB = {'n_jobs': [4] * 6}

model = OneVAllClassifier(n_classes=6, clf=XGBClassifier, params=params_XGB)


In [56]:
# Fit the one-vs-all model on the training split.
# With USE_AUX_FEATURES, the auxiliary columns are appended to the text features.
# The stacked matrix is deliberately NOT cast to int: TF-IDF weights and the
# length-normalized auxiliary ratios are fractional, so an integer cast would
# truncate them to 0. The same feature construction must be used at predict time.
model.fit(hstack((X_train, X_aux_train)).tocsr() if USE_AUX_FEATURES else X_train, y_train)

Fitting model 0:
Fitting model 1:
Fitting model 2:
Fitting model 3:
Fitting model 4:
Fitting model 5:


OneVAllClassifier(clf=None, n_classes=6, params=None)

In [57]:
# Predicted probabilities on the training and validation splits.
# With USE_AUX_FEATURES, the auxiliary columns are appended to the text features.
# No integer cast is applied: the features are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_train_pred = model.predict_proba(hstack((X_train, X_aux_train)).tocsr() if USE_AUX_FEATURES else X_train)
y_valid_pred = model.predict_proba(hstack((X_valid, X_aux_valid)).tocsr() if USE_AUX_FEATURES else X_valid)

In [58]:
# Predicted probabilities on the whole training set (train + validation rows),
# saved under MODEL_NAME through the project's save_pred helper.
# No integer cast on the stacked features: they are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_train_all_pred = model.predict_proba(hstack((X_train_all, X_aux_train_all)).tocsr() if USE_AUX_FEATURES else X_train_all)
save_pred(y_train_all_pred, MODEL_NAME)

In [59]:
# Score the predictions on each split with the project's `evaluate` helper
# (reported as ROC-AUC by the messages below) and print them with 4 decimals.
train_score = evaluate(y_train, y_train_pred)
train_report = "ROC-AUC score on train set : {:.4f}".format(train_score)
print(train_report)

valid_score = evaluate(y_valid, y_valid_pred)
valid_report = "ROC-AUC score on validation set : {:.4f}".format(valid_score)
print(valid_report)


ROC-AUC score on train set : 0.9523
ROC-AUC score on validation set : 0.9465


In [25]:
# Predict probabilities on the test set.
# No integer cast on the stacked features: they are fractional (TF-IDF weights,
# normalized ratios) and would be truncated to 0 by a cast to int.
y_test_pred = model.predict_proba(hstack((X_test, X_aux_test)).tocsr() if USE_AUX_FEATURES else X_test)
# Writing the submission file is disabled for this model; only the raw test
# predictions are stored, under MODEL_NAME + '_test'.
#submission(y_test_pred, id_test, name=MODEL_NAME) 
save_pred(y_test_pred, MODEL_NAME+'_test')