In [2]:

import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [3]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

Train shape :  (20345, 7)
Test shape :  (5086, 6)


In [4]:

def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [5]:
sentences = train["free_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████| 20345/20345 [00:00<00:00, 139887.14it/s]
100%|██████████| 20345/20345 [00:00<00:00, 179881.56it/s]{'"đồ': 20, 'đẹp': 850, 'mà': 1580, 'không': 1923, 'tậu': 6}



In [6]:
!pip3 install gensim

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple


In [7]:

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
news_path = '/home/redbox/toxic-comment-vietnamese/vi/baomoi.window2.vn.model.bin'
embeddings_index = KeyedVectors.load_word2vec_format(datapath(news_path), binary=True, encoding='utf8',unicode_errors='ignore')

In [8]:
import operator 

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            k += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 35141/35141 [00:00<00:00, 258858.18it/s]Found embeddings for 47.67% of vocab
Found embeddings for  85.21% of all text



In [10]:
oov[:25]


[('vl"', 4248),
 ('nha"', 385),
 ('nhé"', 199),
 ('di"', 196),
 ('phòng"', 194),
 ('"có', 193),
 ('thương"', 145),
 ('nhi"', 144),
 ('rồi"', 128),
 ('thư"', 124),
 ('sđt"', 118),
 ('ngu"', 117),
 ('nguyễn"', 114),
 ('chó"', 114),
 ('"ai', 109),
 ('"vl', 105),
 ('"nhìn', 102),
 ('"anh', 102),
 ('"em', 101),
 ('lê"', 96),
 ('trân"', 96),
 ('"phạm', 95),
 ('người_ta', 89),
 ('"cái', 86),
 ('"đm', 86)]

In [11]:
train['free_text']= train.free_text.str.replace('"', ' ')


In [12]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,free_text,label_id,CLEAN,OFFENSIVE,HATE
0,0,train_aabeeyfurc,đồ đẹp mà không tậu cho mình một bộ là có lỗi...,0,1.0,0.0,0.0
1,1,train_aabetrttaa,ngáo vl,0,1.0,0.0,0.0
2,2,train_aaccolapwa,son nhiu mimi,0,1.0,0.0,0.0
3,3,train_aacoaahwdb,hanh ha lâm hằng hang nguyen,0,1.0,0.0,0.0
4,4,train_aaczjqfhfk,nguyen tam_to the group chợ chung_cư petrolan...,0,1.0,0.0,0.0


In [13]:
sentences = train["free_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|██████████| 20345/20345 [00:00<00:00, 184453.78it/s]


In [14]:
oov = check_coverage(vocab,embeddings_index)


100%|██████████| 27942/27942 [00:00<00:00, 293843.05it/s]Found embeddings for 64.91% of vocab
Found embeddings for  95.79% of all text



In [15]:
oov[:10]


[('hahak', 111),
 ('người_ta', 104),
 ('di_đà', 81),
 ('tiktok', 79),
 ('douyin', 71),
 ('tranchauden', 71),
 ('excited', 43),
 ('nhaaa', 40),
 ('hot_hit', 33),
 ('ahihi', 32)]

In [34]:
def text_remove(t):
    new = []
    for id in range(len(t)-1):
        if t[id]!=t[id+1]:
            new.append(t[id])
    new.append(t[-1])

    return ''.join(new)


In [35]:
train["question_text"] = train["free_text"].progress_apply(lambda x: text_remove(x))
sentences = train["question_text"].progress_apply(lambda x: x.split())

vocab = build_vocab(sentences)

100%|██████████| 20345/20345 [00:00<00:00, 41559.66it/s]
100%|██████████| 20345/20345 [00:00<00:00, 209319.61it/s]
100%|██████████| 20345/20345 [00:00<00:00, 110997.37it/s]


In [36]:
oov = check_coverage(vocab,embeddings_index)


100%|██████████| 27038/27038 [00:00<00:00, 217387.44it/s]Found embeddings for 65.71% of vocab
Found embeddings for  95.87% of all text



In [31]:
oov[:10]

[('hahak', 111),
 ('người_ta', 104),
 ('di_đà', 81),
 ('tiktok', 79),
 ('douyin', 71),
 ('tranchauden', 71),
 ('oficial', 58),
 ('fresize', 51),
 ('folow', 50),
 ('hapy', 44)]

In [47]:
for i in embeddings_index.vocab:
    print(i)

ộc
sharmal_dissnayake
algonquin
debkafile
xavier_samuel
hawtai
mauricio_victorino
iar
phan_huệ
nhị_Độ
simonsen
tiểu_cơ
gamla_stan
snape
mukarurinda
bùi_mạnh_tiến
thẩm_thần_huy
parties
doyley
dothraki
chợ_rịa
dương_văn_chương
in_hybrid
mát_tính
khủ
đầu_tư_thông_minh
phạm_Đình_hiệp
pines
ducdongho
chieng_mai
gordeeva
Đinh_thị_hải
sander
thiên_tử_online
neymar_sr
bùi_văn_tuyên
donaco_international
enna
acetylcystein
tăng_Âm_quyền
injo
*hôm
laguna_seca
phuong_le
goodson
lê_mạnh
tam_tòng_tứ_đức
màn_trời
sinco
đư­ờng
tiền_vận
võ_trứ
cô_Đồng_quyến
trâm_oanh
tÃ­ch
mistake
cnet_asia
việt_pax_thiên
vào_sinh
nguyễn_thị_vượng
nhóm_bố
quang_tâm
nguyễn_minh_chí
toni_morrison
tteokbokki
bản_ba
kiểm_xạ
lược_đồ
franklin_templeton_investments
Ô_lâm
bùi_thị_lan_hương
turritopsis
_lạch_huyện
recoba
glycine
poros
ma_thành
thương_binh_xã_hội
khairy
cồn_sơn
toop
lan_chín
capacity
môi_xinh
isolde
võ_văn_Đức_bảy
faseb
cindy_chang
eabia
toshiba_corp
Đỗ_thị_kim_hoa
smart_manager
giàng_thị_sua
babysan
trần_thị_m


KeyboardInterrupt: 