# Experiment number word out-of-vocab in Subword Embedding and Word Embedding

## Setup
data: use spam classification cleaned dataset
Sub-word tokenizer use SentencePiece lib
Word tokenizer use Tokenizer from Keras.preprocessing.text
test data: use spam classification cleaned test dataset
vocab-size: 8000

## Subword Tokenizer
1. fit tokenizer on train set.
2. encode each sentence in test set to list of IDs, ID is int: ID of sub-word in vocab. OOV_TOKEN has id 0. Word that can't be encoded has id 0

In [1]:
import sentencepiece as spm

spm.SentencePieceTrainer.train('--input=train_sw.txt --model_prefix=m --vocab_size=8000')
sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [7]:
sp.shape

AttributeError: 'SentencePieceProcessor' object has no attribute 'shape'

In [78]:
import pandas as pd

In [164]:
test_dataset = pd.read_csv('test_clean_set.csv')
test = test_dataset['search_text'].apply(lambda x: sp.encode_as_ids(str(x)))

In [165]:
test_dataset.describe()

Unnamed: 0,label
count,10000.0
mean,0.4951
std,0.500001
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [166]:
num_oov = 0
total_len = 0
for i in test:
    num_oov += len([j for j in i if j == 0])
    total_len += len(i)
mean_len = total_len / (len(test))

In [167]:
print("Out-of-vocab: %d"%num_oov)
print("Mean length: %d"%mean_len)

Out-of-vocab: 1372
Mean length: 139


### Number word out-of-vocab with sub-word tokenizer: 1372

## Word Tokenizer
1. fit tokenizer on train set
2. encode each sentence in test set to list of IDs, ID is int: ID of sub-word in vocab. OOV_TOKEN has id 1. Word that can't be encoded has id 1

In [83]:
from tensorflow.keras.preprocessing import text

In [84]:
tokenizer = text.Tokenizer(num_words=8000, oov_token="__UNK")
train = pd.read_csv('train_clean_set.csv')
tokenizer.fit_on_texts(train.search_text.astype(str))

In [85]:
train.describe()

Unnamed: 0,label,word_count,length,avg_word
count,40000.0,40000.0,40000.0,40000.0
mean,0.501225,117.1102,519.0428,3.431374
std,0.500005,281.615687,1257.105827,0.392057
min,0.0,0.0,0.0,0.0
25%,0.0,33.0,148.0,3.204504
50%,1.0,62.0,271.0,3.4
75%,1.0,104.0,454.0,3.596491
max,1.0,18933.0,86089.0,9.0


In [86]:
test_word = tokenizer.texts_to_sequences(test_dataset.search_text.astype(str))

In [87]:
num_oov = 0
mean_len = 0
for i in test_word:
    num_oov += len([j for j in i if j == 1])
    mean_len += len(i)
mean_len = mean_len / len(test_word)

In [88]:
print("Out-of-vocab: %d"%num_oov)
print("Mean length: %d"%mean_len)

Out-of-vocab: 52295
Mean length: 116


### Number word out-of-vocab with word tokenizer: 52295

## Reduce size of vocab
Vocab-size: 4000

In [89]:
spm.SentencePieceTrainer.train('--input=train_sw.txt --model_prefix=m --vocab_size=4000')
sp = spm.SentencePieceProcessor()
sp.load('m.model')
test = test_dataset['search_text'].apply(lambda x: sp.encode_as_ids(str(x)))
num_oov = 0
for i in test:
    num_oov += len([j for j in i if j == 0])
num_oov

1372

In [168]:
new_train = pd.read_csv("data/new_training_data.csv", names=["text", "label"])
new_train.head()
new_train = new_train.dropna()
new_train = new_train.drop_duplicates(subset="text")

In [169]:
new_train.describe()

Unnamed: 0,label
count,249815.0
mean,0.180942
std,0.730299
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [170]:
new_test = pd.read_csv("data/test_dataset_1.7.csv", names=["text", "label"], skiprows=1)
new_test.head()

Unnamed: 0,text,label
0,t định tag con ý trường t :v,0.0
1,Hào Lam,0.0
2,K biết nữa 😁,0.0
3,bình thường thôi chị hương ơi,0.0
4,Lưu Huyền,0.0


In [171]:
new_test.describe()

Unnamed: 0,label
count,132770.0
mean,0.132635
std,0.695604
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [172]:
import string
import re
import emoji

In [173]:
def clean_text(x):
    htmltag = re.compile('<(?:[A-Za-z_:][\w:.-]*(?=\s)(?!(?:[^>\"\']|\"[^\"]*\"|\'[^\']*\')*?(?<=\s)(?:term|range)\s*=)(?!\s*/?>)\s+(?:\".*?\"|\'.*?\'|[^>]*?)+|/?[A-Za-z_:][\w:.-]*\s*/?)>')
    x = re.sub(htmltag, ' ', x)
    x = re.sub(r'^\W', ' ', x)
    x = re.sub(r'&lt;(.+?)&gt;|&amp;nbsp;', '', x)
    x = " ".join(str(x).lower() for x in str(x).split())
    table = str.maketrans('', '', string.punctuation)
    x = " ".join (str(x).translate(table) for x in str(x).split())
    x = emoji.get_emoji_regexp().sub(u' ',x)
    x = " ".join(x for x in str(x).split() if len(x) < 10)
    x = x.strip()
    return x

In [174]:
train = new_train['text'].apply(lambda x: clean_text(str(x)))

In [175]:
test = new_test['text'].apply(lambda x: clean_text(str(x)))

In [98]:
train.describe()

count     249815
unique    245690
top             
freq         274
Name: text, dtype: object

In [99]:
train.head()

0                              đừng cho bé ăn xương đó
1    re trẻ thông minh trong thời hiện đại liệu có ...
2    plo moodys đã xếp hạng cfr ở mức b3 cho công t...
3    dính nhiều phốt kém chất lượng các chị có thể ...
4    thập diên mai phục mình thik kiểu tóc đó haha ...
Name: text, dtype: object

In [100]:
test.describe()

count     132778
unique    127485
top             
freq        2527
Name: text, dtype: object

In [101]:
test.head()

0      t định tag con ý trường t v
1                          hào lam
2                       k biết nữa
3    bình thường thôi chị hương ơi
4                        lưu huyền
Name: text, dtype: object

In [197]:
spm.SentencePieceTrainer.train('--input=data/train_sw.txt --model_prefix=m2 --vocab_size=30000 model_type=bpe')
sp = spm.SentencePieceProcessor()
sp.load('m2.model')
test_encoded = test.apply(lambda x: sp.encode_as_ids(str(x)))

In [198]:
oov_num = 0
mean_len = 0
for i in test_encoded:
    oov_num += len([j for j in i if j == 0])
    mean_len += len(i)
mean_len = mean_len / len(test_encoded)
print("Out-of-vocab: {}\nMean length: {}".format(oov_num, mean_len))

Out-of-vocab: 33867
Mean length: 23.91427043636747


In [185]:
new_tokenizer = text.Tokenizer(num_words=30000, oov_token="__UNK")
new_tokenizer.fit_on_texts(train.astype(str))

In [186]:
test_word = new_tokenizer.texts_to_sequences(test.astype(str))

In [187]:
oov_num = 0
mean_len = 0
for i in test_word:
    oov_num += len([j for j in i if j == 1])
    mean_len += len(i)
mean_len = mean_len / len(test_encoded)
print("Out-of-vocab: {}\nMean length: {}".format(oov_num, mean_len))

Out-of-vocab: 212022
Mean length: 21.755787856421996


In [123]:
new_tokenizer.word_index

{'__UNK': 1,
 'có': 2,
 'là': 3,
 'cho': 4,
 'em': 5,
 'và': 6,
 'e': 7,
 'k': 8,
 'đi': 9,
 'mà': 10,
 'của': 11,
 'quá': 12,
 'này': 13,
 'với': 14,
 'các': 15,
 'chị': 16,
 'không': 17,
 'a': 18,
 'thì': 19,
 'bạn': 20,
 'được': 21,
 'đẹp': 22,
 'con': 23,
 'cũng': 24,
 'ạ': 25,
 'anh': 26,
 '1': 27,
 'hàng': 28,
 'giá': 29,
 'mua': 30,
 'trong': 31,
 'nhé': 32,
 'ko': 33,
 'rồi': 34,
 'mình': 35,
 'để': 36,
 'nha': 37,
 'làm': 38,
 'ơi': 39,
 'luôn': 40,
 'như': 41,
 'đó': 42,
 'nào': 43,
 'đã': 44,
 'mới': 45,
 'lại': 46,
 'người': 47,
 'sữa': 48,
 'về': 49,
 'c': 50,
 'còn': 51,
 'uống': 52,
 'ra': 53,
 'chỉ': 54,
 'từ': 55,
 'nhà': 56,
 'nhiều': 57,
 'ngày': 58,
 '2': 59,
 'thế': 60,
 'đồng': 61,
 'đến': 62,
 'gì': 63,
 'lên': 64,
 'hơn': 65,
 'cái': 66,
 'thấy': 67,
 'ở': 68,
 'hay': 69,
 'sẽ': 70,
 'lắm': 71,
 'thể': 72,
 'chất': 73,
 'phải': 74,
 'mẹ': 75,
 'cả': 76,
 'vậy': 77,
 'khi': 78,
 '3': 79,
 'công': 80,
 'bị': 81,
 'bé': 82,
 'đầu': 83,
 'thôi': 84,
 'vào': 85,
 'nê

In [200]:
new_train['text'] = train

In [201]:
new_test['text'] = test

In [204]:
new_test.to_csv("data/test_preprocessed.csv", index=False)
new_train.to_csv("data/train_preprocessed.csv", index=False)