https://www.kaggle.com/gaussmake1994/word-character-n-grams-tfidf-regressions-lb-051

1504 views
45 voters


In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from functools import lru_cache
from tqdm import tqdm as tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy import sparse

train = pd.read_csv('../input/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
train['comment_text'] = train['comment_text'].fillna('nan')
test = pd.read_csv('../input/test.csv')
test.head()


Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [3]:
test['comment_text'] = test['comment_text'].fillna('nan')
submission = pd.read_csv('../input/sample_submission.csv')
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.5,0.5,0.5,0.5,0.5,0.5
1,0000247867823ef7,0.5,0.5,0.5,0.5,0.5,0.5
2,00013b17ad220c46,0.5,0.5,0.5,0.5,0.5,0.5
3,00017563c3f7919a,0.5,0.5,0.5,0.5,0.5,0.5
4,00017695ad8997eb,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
for label in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print(label, (train[label] == 1.0).sum() / len(train))

toxic 0.09584448302009764
severe_toxic 0.009995550569965721
obscene 0.052948217407925
threat 0.002995531769557125
insult 0.04936360616904074
identity_hate 0.00880485802558109


In [5]:
train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].corr()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
toxic,1.0,0.308619,0.676515,0.157058,0.647518,0.266009
severe_toxic,0.308619,1.0,0.403014,0.123601,0.375807,0.2016
obscene,0.676515,0.403014,1.0,0.141179,0.741272,0.286867
threat,0.157058,0.123601,0.141179,1.0,0.150022,0.115128
insult,0.647518,0.375807,0.741272,0.150022,1.0,0.337736
identity_hate,0.266009,0.2016,0.286867,0.115128,0.337736,1.0


## Text postprocessing
I'll try models with:
- text as is
- stemmed text
- lemmatized text

In [6]:
stemmer = EnglishStemmer()

@lru_cache(30000)
def stem_word(text):
    return stemmer.stem(text)


lemmatizer = WordNetLemmatizer()

@lru_cache(30000)
def lemmatize_word(text):
    return lemmatizer.lemmatize(text)


def reduce_text(conversion, text):
    return " ".join(map(conversion, wordpunct_tokenize(text.lower())))


def reduce_texts(conversion, texts):
    return [reduce_text(conversion, str(text))
            for text in tqdm(texts)]

In [7]:
train['comment_text_stemmed'] = reduce_texts(stem_word, train['comment_text'])
test['comment_text_stemmed'] = reduce_texts(stem_word, test['comment_text'])
train['comment_text_lemmatized'] = reduce_texts(lemmatize_word, train['comment_text'])
test['comment_text_lemmatized'] = reduce_texts(lemmatize_word, test['comment_text'])

100%|██████████| 159571/159571 [00:20<00:00, 7874.21it/s]
100%|██████████| 153164/153164 [00:40<00:00, 3782.32it/s]
100%|██████████| 159571/159571 [00:36<00:00, 4427.86it/s]
100%|██████████| 153164/153164 [00:28<00:00, 5363.59it/s]


In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_stemmed,comment_text_lemmatized
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explan whi the edit made under my usernam hard...,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d ' aww ! he match this background colour i ' ...,d ' aww ! he match this background colour i ' ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man , i ' m realli not tri to edit war . i...","hey man , i ' m really not trying to edit war ..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can ' t make ani real suggest on impr...",""" more i can ' t make any real suggestion on i..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you , sir , are my hero . ani chanc you rememb...","you , sir , are my hero . any chance you remem..."


In [9]:
test.head()

Unnamed: 0,id,comment_text,comment_text_stemmed,comment_text_lemmatized
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule is more succes then you ' ll ...,yo bitch ja rule is more succesful then you ' ...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,"== from rfc == the titl is fine as it is , imo .","== from rfc == the title is fine a it is , imo ."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",""" == sourc == * zaw ashton on lapland — / """,""" == source == * zawe ashton on lapland — / """
3,00017563c3f7919a,":If you have a look back at the source, the in...",": if you have a look back at the sourc , the i...",": if you have a look back at the source , the ..."
4,00017695ad8997eb,I don't anonymously edit articles at all.,i don ' t anonym edit articl at all .,i don ' t anonymously edit article at all .


## Validation
Our metric is collumn-average of collumn log_loss values. So let's define custom metric based on binary log loss and define cross-validation function:

In [10]:
def metric(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    columns = y_true.shape[1]
    column_losses = []
    for i in range(0, columns):
        column_losses.append(log_loss(y_true[:, i], y_pred[:, i]))
    return np.array(column_losses).mean()

## Cross-validation
I don't found quickly a way to stratified split for multilabel case.

So I used next way for stratified splitting:

define ordered list of all possible label combinations. E.g.

- 0 = ["toxic"=0, "severe_toxic"=0, "obscene"=0, "threat"=0, "insult"=0, "identity_hate"=0]
- 1 = ["toxic"=0, "severe_toxic"=0, "obscene"=0, "threat"=0, "insult"=1, "identity_hate"=0]
- 2 = ["toxic"=0, "severe_toxic"=0, "obscene"=0, "threat"=0, "insult"=1, "identity_hate"=1]
for each row replace label combination with combination index

use StratifiedKFold on this
train and test model by train/test indices from StratifiedKFold
Basic idea is next:

- we can present label combination as class for multiclass classification - at least for some cases
- we can stratified split by combination indices
--so in each split distribution of combination indices will be similar to full set
--so source label distribution also will be similar
But I don't sure that all my assumpions are fully correct - at least, for common case.

In [11]:
def cv(model, X, y, label2binary, n_splits=3):
    def split(X, y):
        return StratifiedKFold(n_splits=n_splits).split(X, y)
    
    def convert_y(y):
        new_y = np.zeros([len(y)])
        for i, val in enumerate(label2binary):
            idx = (y == val).max(axis=1)
            new_y[idx] = i
        return new_y
    
    X = np.array(X)
    y = np.array(y)
    scores = []
    for train, test in tqdm(split(X, convert_y(y)), total=n_splits):
        fitted_model = model(X[train], y[train])
        scores.append(metric(y[test], fitted_model(X[test])))
    return np.array(scores)

Let's define possible label combinations:

In [12]:
label2binary = np.array([
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 1, 0, 1],
    [0, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 1, 1],
    [0, 0, 1, 0, 0, 0],
    [0, 0, 1, 0, 0, 1],
    [0, 0, 1, 0, 1, 0],
    [0, 0, 1, 0, 1, 1],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 1, 1, 0, 1],
    [0, 0, 1, 1, 1, 0],
    [0, 0, 1, 1, 1, 1],
    [0, 1, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 1],
    [0, 1, 0, 1, 0, 0],
    [0, 1, 0, 1, 0, 1],
    [0, 1, 0, 1, 1, 0],
    [0, 1, 0, 1, 1, 1],
    [0, 1, 1, 0, 0, 0],
    [0, 1, 1, 0, 0, 1],
    [0, 1, 1, 0, 1, 0],
    [0, 1, 1, 0, 1, 1],
    [0, 1, 1, 1, 0, 0],
    [0, 1, 1, 1, 0, 1],
    [0, 1, 1, 1, 1, 0],
    [0, 1, 1, 1, 1, 1],
    [1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 1],
    [1, 0, 0, 0, 1, 0],
    [1, 0, 0, 0, 1, 1],
    [1, 0, 0, 1, 0, 0],
    [1, 0, 0, 1, 0, 1],
    [1, 0, 0, 1, 1, 0],
    [1, 0, 0, 1, 1, 1],
    [1, 0, 1, 0, 0, 0],
    [1, 0, 1, 0, 0, 1],
    [1, 0, 1, 0, 1, 0],
    [1, 0, 1, 0, 1, 1],
    [1, 0, 1, 1, 0, 0],
    [1, 0, 1, 1, 0, 1],
    [1, 0, 1, 1, 1, 0],
    [1, 0, 1, 1, 1, 1],
    [1, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 1],
    [1, 1, 0, 0, 1, 0],
    [1, 1, 0, 0, 1, 1],
    [1, 1, 0, 1, 0, 0],
    [1, 1, 0, 1, 0, 1],
    [1, 1, 0, 1, 1, 0],
    [1, 1, 0, 1, 1, 1],
    [1, 1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0, 1],
    [1, 1, 1, 0, 1, 0],
    [1, 1, 1, 0, 1, 1],
    [1, 1, 1, 1, 0, 0],
    [1, 1, 1, 1, 0, 1],
    [1, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 1, 1],
])

## Dummy model
Let's build dummy model that always return 0.5 and compare score on cross-validation with test-set public leatherboard "All 0.5s Benchmark" (score - 0.693)

In [13]:
def dummy_model(X, y):
    def _predict(X):
        return np.ones([X.shape[0], 6]) * 0.5
    
    return _predict

cv(dummy_model,
   train['comment_text'],
   train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)

100%|██████████| 3/3 [00:00<00:00,  5.32it/s]


array([0.69314718, 0.69314718, 0.69314718])

seems like we built metric correctly, so let's go to baseline building

## Baseline (binary logistic regression over word-based tf-idf)
Let's build model that:

compute tf-idf for given train texts
train 6 logistic regressions (one for each label)
compute tf-idf on test texts
compute probability of "1" class for all 6 regressions

In [14]:
def regression_baseline(X, y):
    tfidf = TfidfVectorizer()
    X_tfidf = tfidf.fit_transform(X)
    columns = y.shape[1]
    regressions = [
        LogisticRegression().fit(X_tfidf, y[:, i])
        for i in range(columns)
    ]
    
    def _predict(X):
        X_tfidf = tfidf.transform(X)
        predictions = np.zeros([len(X), columns])
        for i, regression in enumerate(regressions):
            regression_prediction = regression.predict_proba(X_tfidf)
            predictions[:, i] = regression_prediction[:, regression.classes_ == 1][:, 0]
        return predictions
    
    return _predict

Now let's check model on source texts/stemmed texts/lemmatized texts

In [15]:
cv(regression_baseline,
   train['comment_text'],
   train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)

100%|██████████| 3/3 [02:16<00:00, 45.60s/it]


array([0.0559303 , 0.05627819, 0.05591385])

In [16]:
cv(regression_baseline,
   train['comment_text_stemmed'],
   train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)

100%|██████████| 3/3 [02:12<00:00, 44.21s/it]


array([0.05361348, 0.05397901, 0.05395586])

In [17]:
cv(regression_baseline,
   train['comment_text_lemmatized'],
   train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)

100%|██████████| 3/3 [02:20<00:00, 46.82s/it]


array([0.0545853 , 0.05495679, 0.05475021])

As you can see - this baseline gives best score on stemmed texts. Anyway - let's try to add character-level features:

## Regressions over tfidf over words and character n-grams
Let's build model that:

compute tfidf of words of stemmed texts
compute tfidf of character n-grams from source text
train/predict regressions on computed tfidf-s.

In [18]:
def regression_wordchars(X, y):
    tfidf_word = TfidfVectorizer()
    X_tfidf_word = tfidf_word.fit_transform(X[:, 1])
    tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
    X_tfidf_char = tfidf_char.fit_transform(X[:, 0])
    X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
    
    columns = y.shape[1]
    regressions = [
        LogisticRegression().fit(X_tfidf, y[:, i])
        for i in range(columns)
    ]
    
    def _predict(X):
        X_tfidf_word = tfidf_word.transform(X[:, 1])
        X_tfidf_char = tfidf_char.transform(X[:, 0])
        X_tfidf = sparse.hstack([X_tfidf_word, X_tfidf_char])
        predictions = np.zeros([len(X), columns])
        for i, regression in enumerate(regressions):
            regression_prediction = regression.predict_proba(X_tfidf)
            predictions[:, i] = regression_prediction[:, regression.classes_ == 1][:, 0]
        return predictions
    
    return _predict

In [19]:
cv(regression_wordchars,
   train[['comment_text', 'comment_text_stemmed']],
   train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']],
   label2binary)

100%|██████████| 3/3 [33:39<00:00, 673.17s/it]


array([0.04880963, 0.04934726, 0.04933602])

## Prediction
Let's use our best model - regression over word&chars tfidf to build submission:

In [20]:
%%time
model = regression_wordchars(np.array(train[['comment_text', 'comment_text_stemmed']]),
                             np.array(train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]))

CPU times: user 9min 15s, sys: 30.3 s, total: 9min 46s
Wall time: 17min 34s


In [26]:
%%time
prediction = model(np.array(test[['comment_text', 'comment_text_stemmed']]))

CPU times: user 3min 57s, sys: 13.5 s, total: 4min 11s
Wall time: 7min 41s


In [27]:
for i, label in enumerate(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']):
    submission[label] = prediction[:, i]
submission.head()


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.99985,0.16663,0.999582,0.065705,0.991182,0.478256
1,0000247867823ef7,0.008019,0.004266,0.006445,0.000499,0.004568,0.002605
2,00013b17ad220c46,0.02523,0.008034,0.016309,0.001218,0.006552,0.002247
3,00017563c3f7919a,0.002481,0.001001,0.001842,0.000697,0.002761,0.000383
4,00017695ad8997eb,0.018513,0.001046,0.0046,0.000557,0.008488,0.000902


In [28]:
submission.to_csv('submission_word_charactern-gramsTFIDF.csv', index=None)