In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
from rules import normalized_chars
import random
import re
from unidecode import unidecode

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        word = word.lower()
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [3]:
labels = """
1. severe toxic
2. obscene
3. identity attack
4. insult
5. threat
6. asian
7. atheist
8. bisexual
9. black
10. buddhist
11. christian
12. female
13. heterosexual
14. indian
15. homosexual, gay or lesbian
16. intellectual or learning disability
17. jewish
18. latino
19. male
20. muslim
21. other disability
22. other gender
23. other race or ethnicity
24. other religion
25. other sexual orientation
26. physical disability
27. psychiatric or mental illness
28. transgender
29. white
30. malay
31. chinese
"""
labels = [l.split('. ')[1].strip() for l in labels.split('\n') if len(l)]
labels

['severe toxic',
 'obscene',
 'identity attack',
 'insult',
 'threat',
 'asian',
 'atheist',
 'bisexual',
 'black',
 'buddhist',
 'christian',
 'female',
 'heterosexual',
 'indian',
 'homosexual, gay or lesbian',
 'intellectual or learning disability',
 'jewish',
 'latino',
 'male',
 'muslim',
 'other disability',
 'other gender',
 'other race or ethnicity',
 'other religion',
 'other sexual orientation',
 'physical disability',
 'psychiatric or mental illness',
 'transgender',
 'white',
 'malay',
 'chinese']

In [4]:
import glob

files = glob.glob('../toxicity/translated*')
files

['../toxicity/translated-1750000.json',
 '../toxicity/translated-1450000.json',
 '../toxicity/translated-700000.json',
 '../toxicity/translated-350000.json',
 '../toxicity/translated-600000.json',
 '../toxicity/translated-900000.json',
 '../toxicity/translated-1000000.json',
 '../toxicity/translated-1100000.json',
 '../toxicity/translated-550000.json',
 '../toxicity/translated-150000.json',
 '../toxicity/translated-500000.json',
 '../toxicity/translated-1500000.json',
 '../toxicity/translated-1150000.json',
 '../toxicity/translated-750000.json',
 '../toxicity/translated-850000.json',
 '../toxicity/translated-1650000.json',
 '../toxicity/translated-300000.json',
 '../toxicity/translated-650000.json',
 '../toxicity/translated-950000.json',
 '../toxicity/translated-250000.json',
 '../toxicity/translated-1600000.json',
 '../toxicity/translated-0.json',
 '../toxicity/translated-1550000.json',
 '../toxicity/translated-1800000.json',
 '../toxicity/translated-450000.json',
 '../toxicity/transl

In [5]:
import json

X, Y = [], []

for file in files:
    print(file)
    with open(file) as fopen:
        f = json.load(fopen)
        for row in f:
            if len(row[1]) == 29:
                X.append(row[0])
                Y.append(row[1] + [0, 0])
        
    
len(X)

../toxicity/translated-1750000.json
../toxicity/translated-1450000.json
../toxicity/translated-700000.json
../toxicity/translated-350000.json
../toxicity/translated-600000.json
../toxicity/translated-900000.json
../toxicity/translated-1000000.json
../toxicity/translated-1100000.json
../toxicity/translated-550000.json
../toxicity/translated-150000.json
../toxicity/translated-500000.json
../toxicity/translated-1500000.json
../toxicity/translated-1150000.json
../toxicity/translated-750000.json
../toxicity/translated-850000.json
../toxicity/translated-1650000.json
../toxicity/translated-300000.json
../toxicity/translated-650000.json
../toxicity/translated-950000.json
../toxicity/translated-250000.json
../toxicity/translated-1600000.json
../toxicity/translated-0.json
../toxicity/translated-1550000.json
../toxicity/translated-1800000.json
../toxicity/translated-450000.json
../toxicity/translated-50000.json
../toxicity/translated-1050000.json
../toxicity/translated-1200000.json
../toxicity/tr

1401054

In [6]:
rejected_labels = ['black', 'white', 'jewish', 'latino']
[labels.index(l) for l in rejected_labels]
labels = [l for l in labels if l not in rejected_labels]

In [7]:
ydf = pd.DataFrame(np.array(Y))
ydf = ydf.loc[(ydf[8] == 0) & (ydf[28] == 0) & (ydf[16] == 0) & (ydf[17] == 0)]
ydf = ydf.drop([8, 28, 16, 17], axis = 1)
ix = ydf.index.tolist()
Y = ydf.values.tolist()

In [8]:
X = [X[i] for i in ix]

In [9]:
mapping = {'severe_toxic': 'severe toxic', 'identity_hate': 'identity attack',
          'toxic': 'severe toxic', 'melayu': 'malay', 'cina': 'chinese', 'india': 'indian'}

In [10]:
def generate_onehot(tags, depth = len(labels)):
    onehot = [0] * depth
    for tag in tags:
        onehot[labels.index(tag)] = 1
    return onehot

In [11]:
with open('../toxicity/kaum.json') as fopen:
    kaum = json.load(fopen)
    
for k, v in kaum.items():
    print(k, len(v))

melayu 84851
cina 43956
india 20208


In [12]:
with open('../toxicity/weak-learning-toxicity.json') as fopen:
    scores = json.load(fopen)
    
for k, v in scores.items():
    for no in range(len(v)):
        tags = []
        for l, v_ in v[no].items():
            if round(v_) == 1:
                tags.append(mapping.get(l, l))
        tags.append(mapping[k])
        Y.append(generate_onehot(tags))
        X.append(kaum[k][no])

In [13]:
from tqdm import tqdm

for i in tqdm(range(len(X))):
    X[i] = cleaning(X[i])

100%|██████████| 1510055/1510055 [04:35<00:00, 5488.19it/s] 


In [14]:
actual_t, actual_l = [], []

for i in tqdm(range(len(X))):
    if len(X[i]) > 2:
        actual_t.append(X[i])
        actual_l.append(Y[i])

100%|██████████| 1510055/1510055 [00:01<00:00, 1251440.85it/s]


In [15]:
with open('combined.txt', 'w') as fopen:
    fopen.write('\n'.join(actual_t))

In [16]:
import youtokentome as yttm

In [17]:
%%time

bpe = yttm.BPE.train(data='combined.txt', 
               vocab_size=60000, model='toxic.model')

CPU times: user 26.8 s, sys: 5.77 s, total: 32.5 s
Wall time: 7.5 s


In [18]:
vocab = {v: i for i, v in enumerate(bpe.vocab())}
rev_vocab = {i: v for i, v in enumerate(bpe.vocab())}
len(vocab)

60000

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
import re

r = re.compile(r'[\S]+').findall

In [21]:
subs = [' '.join(s) for s in bpe.encode(actual_t, output_type=yttm.OutputType.SUBWORD)]

In [22]:
tfidf = TfidfVectorizer(vocabulary = vocab, token_pattern = r'[\S]+').fit(subs)

In [23]:
import pickle
with open('tfidf-toxic.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)

In [24]:
vector = tfidf.transform(subs)

In [25]:
Y = np.around(np.array(actual_l))

In [26]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(vector, Y, test_size = 0.2)
train_X.shape, test_X.shape

((1201860, 60000), (300465, 60000))

In [27]:
from sklearn.naive_bayes import ComplementNB

In [28]:
from sklearn.multiclass import OneVsRestClassifier

In [29]:
multinomial = OneVsRestClassifier(ComplementNB()).fit(train_X, train_Y)

  str(classes[c]))
  str(classes[c]))


In [33]:
from sklearn import metrics

In [34]:
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names=labels,digits=5
    )
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                     precision    recall  f1-score   support

                       severe toxic    0.31967   0.99343   0.48370     39401
                            obscene    0.06234   0.68947   0.11434     11287
                    identity attack    0.03492   0.61682   0.06610      5624
                             insult    0.17200   0.73748   0.27894     50732
                             threat    0.00799   0.12224   0.01500      1718
                              asian    0.00180   0.03686   0.00344      1492
                            atheist    0.00176   0.05402   0.00341       722
                           bisexual    0.00007   0.01136   0.00014        88
                           buddhist    0.00035   0.02885   0.00070       208
                          christian    0.14597   0.88664   0.25067     18754
                             female    0.13707   0.80804   0.23438     27850
                       heterosexual    0.00131   0.04887   0.00255       53

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
print(
    metrics.classification_report(
        test_Y,
        multinomial.predict(test_X),
        target_names=labels,digits=5
    )
)

                                     precision    recall  f1-score   support

                       severe toxic    0.32096   0.99468   0.48532      9955
                            obscene    0.06031   0.68096   0.11081      2799
                    identity attack    0.03312   0.60086   0.06277      1393
                             insult    0.15655   0.69002   0.25519     12575
                             threat    0.00661   0.11058   0.01247       416
                              asian    0.00087   0.01799   0.00166       389
                            atheist    0.00137   0.04494   0.00266       178
                           bisexual    0.00052   0.08333   0.00104        24
                           buddhist    0.00000   0.00000   0.00000        45
                          christian    0.13652   0.86153   0.23570      4622
                             female    0.12714   0.78073   0.21867      6891
                       heterosexual    0.00153   0.06299   0.00299       12

In [37]:
with open('multinomial-toxic.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)

In [38]:
import boto3
s3 = boto3.client('s3')
bucketName = 'huseinhouse-storage'

Key = 'multinomial-toxic.pkl'
outPutname = "v34/toxicity/multinomial.pkl"
s3.upload_file(Key,bucketName,outPutname)

In [39]:
Key = 'tfidf-toxic.pkl'
outPutname = "v34/toxicity/tfidf.pkl"
s3.upload_file(Key,bucketName,outPutname)

In [40]:
Key = 'toxic.model'
outPutname = "v34/toxicity/youtokentome.model"
s3.upload_file(Key,bucketName,outPutname)