In [1]:
import numpy as np
import pandas as pd

## read the dataset

1. first, see the data sample. 
```
the task is multi-label text classification over six labels (toxic, severe_toxic, obscene, threat, insult, identity_hate)
from the sample data, we know that one text can carry several labels, and some texts have no labels at all (all 0)
```

In [39]:
# load the raw training set (comment text plus the six label columns)
train = pd.read_csv('dataset/train.csv')

In [3]:
# look at ten randomly drawn rows
train.sample(n=10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
9144,185da65a1b870e3c,The last two episode of Jon & Kate Plus 8 have...,0,0,0,0,0,0
157162,d9bfa04a9c3be8dd,"Clarified. I think i was able to work it, desp...",0,0,0,0,0,0
91755,f5478b2aab845bb4,And a source claiming that Evanescence is a go...,0,0,0,0,0,0
15128,27f4c1a114603d00,"""\n\nWilla gets the rights to the album """"Sexy...",0,0,0,0,0,0
76710,cd6fa5923f7c21b7,"""#REDIRECT Talk:""""Polish Operation"""" of the NK...",0,0,0,0,0,0
119620,7f8ef79020a98b40,"""\nThe F-14 was designed as a dogfighter first...",0,0,0,0,0,0
2129,05c1f66548965da9,"""\n\n Unapproved bot. \n\nYou appear to be usi...",0,0,0,0,0,0
42523,717af680e51654a6,"Disclosing my affliation to M. Sanjayan, whose...",0,0,0,0,0,0
52804,8d29a15eecb0190f,The fact that your only contribution to the en...,0,0,0,0,0,0
87874,eb13473631370d56,"I like the suggestion, and am impressed with t...",0,0,0,0,0,0


In [40]:
# the six target columns
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# helper column: 1 - (row-wise max of the label columns), i.e. 1 when no label is set for 0/1 labels
train['none'] = 1 - train[labels].max(axis=1)
# lowercase the comments (overwrites the column in place)
train['comment_text'] = train['comment_text'].str.lower()
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## extract the word feature

in the public notebooks, the higher-scoring kernels use tf-idf features; here we use pre-trained word embeddings instead.

we use two pre-trained word embeddings (Google News vectors / fastText)

before using it, we should fix the text vocab to fit the pre-trained word embedding. 

the method for fitting the pre-trained word embedding is:

we check what percentage of the words in the text are found in the pre-trained embedding. then we take the oov (out-of-vocabulary) words and clean / modify them to get a higher coverage percentage

In [2]:
# the vocabulary method
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
def build_vocab(sentences, verbose=True):
    """Count how often each token occurs across all sentences.

    sentences: iterable of token sequences (each inner item is iterated token by token).
    verbose:   when falsy, the tqdm progress bar is disabled.
    Returns a plain dict mapping token -> occurrence count.
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            # first sighting starts from 0, later sightings add to the running total
            counts[token] = counts.get(token, 0) + 1

    return counts

In [41]:
# get the vocab: split every comment on whitespace, then count token occurrences
sentences = train['comment_text'].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:01<00:00, 91687.45it/s] 
100%|██████████| 159571/159571 [00:02<00:00, 69232.52it/s]


load the google_news_vector 

In [7]:
# read the pre-trained embedding (Google News vectors, word2vec binary format)
from gensim.models import KeyedVectors
embedding_index = KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)

In [14]:
# read the pre-trained embedding (text-format .vec file)
# NOTE: this rebinds `embedding_index`; whichever of the two loading cells ran last
# decides which embedding the coverage checks below are measured against
from gensim.models import KeyedVectors
embedding_index =  KeyedVectors.load_word2vec_format('./model/crawl-300d-2M.vec/crawl-300d-2M.vec')

In [15]:
# get the coverage 
# embedding of (vocab, all text)
import operator
def check_coverage(vocab, embeddings_index):
    """Report how much of the vocabulary / text the embedding covers.

    vocab:            dict mapping word -> occurrence count.
    embeddings_index: object supporting ``embeddings_index[word]`` that raises
                      KeyError for unknown words (dict, gensim KeyedVectors, ...).

    Prints the share of distinct words and the share of all word occurrences
    that have an embedding, and returns the out-of-vocabulary words as a list
    of (word, count) pairs ordered from most to least frequent.
    """
    covered_words = 0   # distinct words found in the embedding
    covered_count = 0   # occurrences of those words in the text
    oov = {}
    oov_count = 0
    for word in tqdm(vocab):
        try:
            # only the lookup matters; the vector itself is not kept, so no
            # reference to every embedding row is accumulated
            embeddings_index[word]
        except KeyError:
            # only a missing key means "out of vocabulary"; any other error
            # (or Ctrl-C) propagates instead of being counted as OOV
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    total_count = covered_count + oov_count
    # guard the ratios so an empty vocab reports 0% instead of dividing by zero
    vocab_ratio = covered_words / len(vocab) if vocab else 0.0
    text_ratio = covered_count / total_count if total_count else 0.0
    print('Found embedding for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embedding for {:.2%} of all text'.format(text_ratio))
    # ascending sort followed by a reversal, kept as-is so ties keep their previous order
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [33]:
# baseline coverage on the lowercased, whitespace-split text
oov = check_coverage(vocab, embedding_index)

100%|██████████| 470340/470340 [00:01<00:00, 427171.14it/s]


Found embedding for 23.44% of vocab
Found embedding for 88.99% of all text


In [34]:
# out of vocabulary: the ten most frequent uncovered tokens
oov[:10]

[("i'm", 17305),
 ("i've", 8507),
 ('(utc)', 5780),
 ('article,', 5596),
 ('page,', 5415),
 ("i'll", 5175),
 ('(talk)', 4250),
 ("i'd", 3392),
 ('also,', 3372),
 ('so,', 3214)]

In [3]:
# clean the symbol

# def clean_text(x):
#     x = str(x)
#     for punct in "/-—–":
#         x = x.replace(punct, ' ')
#     for punct in '&':
#         x = x.replace(punct, f' {punct} ')
#     for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
#         x = x.replace(punct, '')
#     return x

def clean_text(x):
    """Return ``str(x)`` with every punctuation character listed below deleted.

    Characters are removed outright (not replaced by a space), so tokens on
    either side of a removed character are joined together.
    """
    removable = '—?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
    # every entry is a single character, so one translate() pass deletes them all
    deletion_table = str.maketrans('', '', removable)
    return str(x).translate(deletion_table)

In [43]:
# strip punctuation from every comment, then rebuild the vocabulary on the cleaned text
train['comment_text'] = train['comment_text'].progress_apply(lambda x: clean_text(x))
sentences = train['comment_text'].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:02<00:00, 64986.43it/s]
100%|██████████| 159571/159571 [00:02<00:00, 75222.13it/s]


In [44]:
# re-measure coverage after punctuation removal
oov = check_coverage(vocab, embedding_index)

100%|██████████| 253453/253453 [00:00<00:00, 412550.52it/s]


Found embedding for 41.45% of vocab
Found embedding for 97.35% of all text


In [45]:
# ten most frequent tokens still missing from the embedding
oov[:10]

[('npov', 1480),
 ('wikipediahi', 713),
 ('3rr', 657),
 ('fucksex', 624),
 ('yourselfgo', 621),
 ('verticalaligntop', 606),
 ('width100', 590),
 ('wikipediaquestions', 563),
 ('stylewidth', 553),
 ('mothjer', 489)]

In [5]:
# clean the number 

import re

def clean_numbers(x):
    """Mask every run of two or more ASCII digits with '#' characters.

    A run of 2, 3 or 4 digits becomes the same number of '#'; a run of five
    or more digits collapses to exactly '#####'.  Isolated single digits are
    left untouched.
    """
    def mask(run):
        # cap the mask length at five characters
        return '#' * min(len(run.group(0)), 5)

    return re.sub('[0-9]{2,}', mask, x)

In [17]:
# mask digit runs, then rebuild the vocabulary
# (exploratory only: the clean_numbers step is commented out in process() further down)
train['comment_text'] = train['comment_text'].progress_apply(lambda x: clean_numbers(x))
sentences = train['comment_text'].progress_apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:05<00:00, 31061.11it/s]
100%|██████████| 159571/159571 [00:01<00:00, 141538.14it/s]
100%|██████████| 159571/159571 [00:01<00:00, 86858.84it/s]


In [18]:
# re-measure coverage after number masking
oov = check_coverage(vocab, embedding_index)

100%|██████████| 217359/217359 [00:00<00:00, 450253.30it/s]


Found embedding for 30.55% of vocab
Found embedding for 87.24% of all text


In [19]:
# thirty most frequent tokens still missing from the embedding
oov[:30]

[('to', 297269),
 ('of', 224448),
 ('and', 223455),
 ('a', 214860),
 ('doesnt', 6698),
 ('didnt', 5772),
 ('isnt', 4604),
 ('wikipedias', 2818),
 ('contribs', 2347),
 ('wasnt', 2256),
 ('enwikipediaorg', 2009),
 ('faggot', 1981),
 ('shouldnt', 1548),
 ('npov', 1539),
 ('afd', 1353),
 ('wikiproject', 1332),
 ('wikipedian', 1267),
 ('barnstar', 1024),
 ('infobox', 946),
 ('wikipedians', 900),
 ('aligntop', 827),
 ('behaviour', 785),
 ('helpme', 758),
 ('rfa', 748),
 ('wikipediahi', 713),
 ('faggots', 682),
 ('colorf5fffa', 679),
 ('3rr', 671),
 ('fucksex', 624),
 ('yourselfgo', 621)]

In [6]:
# fix the special or mispell words

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# run-together or misspelled tokens (keys) and the text that replaces them (values)
mispell_dict = {
    'fucksex': 'fuck sex',
    'yourselfgo': 'yourself go',
    'mothjer': 'mother',
    'philippineslong': 'philippines long',
    'offfuck': 'off fuck',
    'bitchesfuck': 'bitches fuck',
    'youfuck': 'you fuck',
    'bitchfuck': 'bitch fuck',
    'talkcontribs': 'talk contribs',
    'criminalwar': 'criminal war',
    'penissmall': 'penis small',
    'securityfuck': 'security fuck'
}
    
# mispell_dict = {
#     'doesnt': 'does not',
#     'isnt': 'is not',
#     'wasnt': 'was not',
#     'contribs': 'contributions',
#     'npov': 'neutral point of view',
#     'shouldnt': 'should not',
#     'wikiproject': 'wiki project',
#     'afd': 'alternative for germany',
#     'aligntop': 'align top',
#     'infobox': 'info box',
#     'behaviour': 'behavior',
#     'helpme': 'help me',
#     'wikipediahi': 'wikipedia hi',
#     'fucksex': 'fuck sex',
#     'yourselfgo': 'yourself go',
#     'didnt': 'did not',
#     'mothjer': 'mother',
#     'talkpage': 'talk page',
#     'wikipediaquestions': 'wikipedia questions',
#     'hasnt': 'has not',
#     'userpage': 'user page',
#     'arbcom': 'arbitration committee',
#     'rfa': 'radio free asia',
#     'wikipedian': 'wikipedia',
#     'wikipedias': 'wikipedia',
#     'faggot': 'fag got',
#     'bitchesfuck': 'bitches fuck',
#     'offfuck': 'off fuck',
#     'organisation': 'organization',
#     'sexsex': 'sex sex',
#     'youfuck': 'you fuck',
#     'bitchfuck': 'bitch fuck'
# }



# built once here; replace_typical_misspell reads both names as module-level globals
# (note the spelling `misepplings_re` -- it must stay in sync with that function)
mispellings, misepplings_re = _get_mispell(mispell_dict)

In [7]:
def replace_typical_misspell(text):
    """Replace every occurrence of a known misspelling in ``text``.

    Uses the module-level ``misepplings_re`` to find matches and looks each
    matched string up in the module-level ``mispellings`` mapping.
    """
    return misepplings_re.sub(lambda found: mispellings[found.group(0)], text)

In [8]:
# clean the html(css etc.) text

def clean_html(x):
    """Delete leftover markup fragments (CSS attributes, wiki tokens) from ``x``.

    The patterns target tokens that appear once punctuation has been stripped
    from inline HTML/CSS, e.g. 'colorf5fffa', 'width100', 'border1px',
    'cellpadding5', plus any alphanumeric token containing 'wikipedia'.
    Matches are removed outright; surrounding whitespace is left as is.

    NOTE(review): the patterns are unanchored, so e.g. 'color' followed by
    letters a-f inside an ordinary word is also removed -- confirm this is
    acceptable for the corpus.
    """
    patterns = (
        'color[0-9A-Fa-f]+',
        'width[0-9]+',
        'border[0-9]+px',
        'oldid#+',
        'verticalaligntop',
        # digits are 0-9 here: the previous class 1-9 stopped at the first '0',
        # leaving the rest of the token (e.g. '010' from 'wikipedia2010') behind
        '[a-z0-9]*wikipedia[a-z0-9]*',
        'cellpadding[0-9]+',
    )
    # order is preserved from the original sequence of substitutions
    for pattern in patterns:
        x = re.sub(pattern, '', x)
    return x

In [69]:
# fix known run-together words, strip markup leftovers, then rebuild the vocabulary
train['comment_text'] = train['comment_text'].progress_apply(lambda x: replace_typical_misspell(x))
train['comment_text'] = train['comment_text'].progress_apply(lambda x: clean_html(x))
sentences = train['comment_text'].progress_apply(lambda x: x.split())
# markup / wiki-jargon tokens dropped before counting (duplicate entries removed)
to_remove = ['stylewidth', 'stylebackground', 'sockpuppetry', 'deneidaccess', 'pagedelete', 'verticalaligntop', 'editwarring']
# distinct loop names: the earlier version reused `sentences` for both the whole
# collection and the per-comment loop variable, which only worked through
# comprehension scoping
sentences = [[word for word in sentence if word not in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|██████████| 159571/159571 [00:01<00:00, 80684.55it/s]
100%|██████████| 159571/159571 [00:04<00:00, 32381.65it/s]
100%|██████████| 159571/159571 [00:01<00:00, 139735.70it/s]
100%|██████████| 159571/159571 [00:02<00:00, 66610.09it/s]
100%|██████████| 159571/159571 [00:02<00:00, 71941.66it/s]


In [70]:
# re-measure coverage after misspelling fixes and markup removal
oov = check_coverage(vocab, embedding_index)

100%|██████████| 250608/250608 [00:00<00:00, 398853.95it/s]


Found embedding for 41.91% of vocab
Found embedding for 97.52% of all text


In [71]:
# twenty most frequent tokens still missing from the embedding
oov[:20]

[('npov', 1480),
 ('3rr', 657),
 ('fggt', 480),
 ('gfdl', 444),
 ('wprs', 393),
 ('proassadhanibal911youre', 345),
 ('ricehey', 329),
 ('threerevert', 321),
 ('notrhbysouthbanof', 308),
 ('classmainpagebg', 304),
 ('wpnpov', 299),
 ('bunksteve', 278),
 ('marcolfuck', 260),
 ('boymamas', 258),
 ('wpblp', 253),
 ('ytmndin', 238),
 ('tommy2010', 228),
 ('wpor', 227),
 ('youbollocks', 217),
 ('basteredbastered', 217)]

In [9]:
# tokens dropped by process(): four frequent function words plus leftover markup / wiki tokens
# ('stylewidth' is listed twice; harmless for membership tests)
to_remove = ['to', 'of', 'and', 'a', 'stylewidth', 'stylebackground', 'sockpuppetry', 'deneidaccess', 'pagedelete', 'stylewidth']
def delete_stop_words(x, stop_words):
    """Drop every whitespace-separated token of ``x`` that is in ``stop_words``.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed as a side effect.
    """
    kept = []
    for token in x.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [10]:
# all process

def process(text):
    """Run the cleaning pipeline on ``text['comment_text']`` and return ``text``.

    The column is overwritten on the frame that was passed in.  Steps, in
    order: lowercase, clean_text, replace_typical_misspell, clean_html,
    delete_stop_words (using the module-level ``to_remove``).
    clean_numbers is not applied.
    """
    text['comment_text'] = text['comment_text'].str.lower()
    steps = (
        clean_text,
        replace_typical_misspell,
        clean_html,
        lambda x: delete_stop_words(x, to_remove),
    )
    # write the column back after every step, each with its own progress bar
    for step in steps:
        text['comment_text'] = text['comment_text'].progress_apply(step)
    return text

In [12]:
# save the pre-processed test set
# NOTE: to_csv is called with its default index=True, so the row index is written
# as an extra unnamed first column of the output file
test = pd.read_csv('./dataset/test.csv')
test = process(test)
test.to_csv('./fasttext_dataset/process_test.csv')

100%|██████████| 153164/153164 [00:02<00:00, 66684.02it/s]
100%|██████████| 153164/153164 [00:01<00:00, 90443.63it/s]
100%|██████████| 153164/153164 [00:04<00:00, 36391.68it/s]
100%|██████████| 153164/153164 [00:02<00:00, 65601.56it/s]


In [13]:
# save the pre-processed training set
# re-reads the raw csv, so the exploratory in-place edits made to `train` earlier are discarded
# NOTE: to_csv is called with its default index=True, so the row index is written
# as an extra unnamed first column of the output file
train = pd.read_csv('./dataset/train.csv')
train = process(train)
train.to_csv('./fasttext_dataset/process_train.csv')

100%|██████████| 159571/159571 [00:02<00:00, 64024.58it/s]
100%|██████████| 159571/159571 [00:01<00:00, 82303.26it/s]
100%|██████████| 159571/159571 [00:04<00:00, 32199.12it/s]
100%|██████████| 159571/159571 [00:02<00:00, 59678.26it/s]
