In [1]:
# !pip -q install ../input/pysastrawi/Sastrawi-1.0.1-py2.py3-none-any.whl 

## for data
import json
import pandas as pd
import numpy as np
from sklearn .model_selection import StratifiedKFold, GroupKFold
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import string
import re
import nltk
from nltk.corpus import stopwords
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer
from lime import lime_text
## for word embedding
import gensim

import gensim.downloader as gensim_api
## for deep learning
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
## for bert language model
import transformers
from transformers import TFAutoModel, AutoTokenizer
from transformers import RobertaTokenizer, TFRobertaModel

# from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Preprocessing function helper
# replace word that concatenate with other word
def remove_concatenate_2_words(text):
    """Delete every occurrence of the listed filler words from ``text``.

    The match is a plain substring replacement, so a filler word is removed
    even when it is glued onto a neighbouring word.
    """
    glued_fillers = ('khusus',)
    cleaned = text
    for filler in glued_fillers:
        cleaned = cleaned.replace(filler, '')
    return cleaned

PUNCT_TO_REMOVE = string.punctuation
# Translation table that deletes every character in PUNCT_TO_REMOVE. It is
# built once here instead of being rebuilt on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    return text.translate(_PUNCT_TABLE)

STOPWORDS_ID = set(stopwords.words('indonesian'))
STOPWORDS_EN = set(stopwords.words('english'))
def remove_stopwords(list_text):
    """Return the tokens that are neither English nor Indonesian stopwords.

    Token order is preserved; the English set is consulted first.
    """
    kept = []
    for token in list_text:
        if token in STOPWORDS_EN or token in STOPWORDS_ID:
            continue
        kept.append(token)
    return kept

def remove_big_number(list_text):
    """Split tokens into digit / non-digit fragments and filter the numbers.

    Every token is split around its runs of digits. Non-numeric fragments are
    kept unchanged. A numeric fragment is kept (re-rendered with ``str(int)``,
    so leading zeros are dropped) only when it is below 7000 and "round"
    for its magnitude:

    * above 1000: a multiple of 100
    * 101..1000: a multiple of 10
    * 0..100: an even number

    Empty fragments, which ``re.split`` emits when a token starts or ends with
    digits, are discarded so no empty-string tokens end up in the result.

    Args:
        list_text: Iterable of string tokens.

    Returns:
        A new list of the surviving fragments, in order.
    """
    words = []
    for token in list_text:
        # The capturing group makes re.split return the digit runs as well.
        for fragment in re.split(r'(\d+)', token):
            if not fragment:
                continue
            try:
                value = int(fragment)
            except ValueError:
                # Not a number: keep the text fragment as it is.
                words.append(fragment)
                continue

            if value >= 7000:
                continue
            if value > 1000:
                keep = value % 100 == 0
            elif value > 100:
                keep = value % 10 == 0
            else:
                keep = value % 2 == 0
            if keep:
                words.append(str(value))
    return words

def remove_zero_val(list_text):
    """Return the tokens of ``list_text`` with every literal ``'0'`` dropped."""
    kept = []
    for token in list_text:
        if token == '0':
            continue
        kept.append(token)
    return kept

# Generic marketing / shipping / location words to drop from titles.
# Built once as a frozenset so membership tests are O(1) and the string is not
# re-split on every call. Splitting on a single space keeps the empty string
# produced by the double space in the list, so '' tokens are filtered as well.
_COMMON_WORDS = frozenset(
    "hari keren kere kw super baik jual jualan quality best free  kwalitas berkualitas kualitas bagus terbaik kembali dijamin beli gratis murah free diskon ongkir cek berkualitas original asli kualitas uang jaminan jamin terjamin buatan buat kirim wilayah luar kota jawa bali jakarta surabaya bulan month year day tahun hari harian anda your nikmat singapore malaysia indonesia vietnam thailand filipina bangkok jepang buy one get dapat dua two satu meriah kirim send pengiriman paket hemat uang kembali dapat guarantee buatan lokal dalam internasional karya termurah paling murah terbaik cheap murah biaya".split(' ')
)


def remove_common_words(list_text):
    """Return the tokens of ``list_text`` that are not in the common-word list.

    Args:
        list_text: Iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [w for w in list_text if w not in _COMMON_WORDS]

def remove_strange_words(list_text):
    """Filter the listed filler tokens (letter runs, 'thn', 'th', 'bln') out of ``list_text``."""
    blocked = (
        'aaa', 'aaaa', 'aaaaa',
        'abc', 'abcd',
        'bb', 'bbb', 'bbbb',
        'ccc', 'cccc',
        'thn', 'th', 'bln',
    )
    return list(filter(lambda token: token not in blocked, list_text))

def text_vectorizer(max_features, max_len, vocab):
    """Build a Keras model that turns raw strings into integer token sequences.

    Args:
        max_features: Maximum vocab size (passed as ``max_tokens``).
        max_len: Sequence length to pad the outputs to
            (passed as ``output_sequence_length``).
        vocab: Texts the ``TextVectorization`` layer is adapted on.

    Returns:
        A ``Sequential`` model made of a string input of shape ``(1,)``
        followed by the adapted ``TextVectorization`` layer.
    """
    vectorizer = TextVectorization(
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=max_len,
    )

    # Adapt the layer on the supplied texts, fed in batches of 64.
    adapt_batches = tf.data.Dataset.from_tensor_slices(vocab).batch(64)
    vectorizer.adapt(adapt_batches)

    vectorizer_model = tf.keras.models.Sequential()
    vectorizer_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    vectorizer_model.add(vectorizer)
    return vectorizer_model

In [3]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise one text: lowercase, strip punctuation, filter and rejoin tokens.

    Args:
        text: Input value; it is converted with ``str()`` before cleaning.
        flg_stemm: When truthy, apply NLTK's Porter stemmer to every token.
        flg_lemm: When truthy, apply NLTK's WordNet lemmatiser to every token.
        lst_stopwords: Optional stopwords to drop. May be a flat collection of
            words or a collection of word collections (e.g. one list per
            language). In both cases words are matched as whole tokens.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Lowercase, trim, and delete everything that is neither a word character
    # nor whitespace.
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    # Tokenise on whitespace.
    lst_text = text.split()

    if lst_stopwords is not None:
        # Flatten into a single set so matching is always whole-token. Testing
        # `word in entry` against a plain string entry would be a substring
        # test (e.g. the token 'a' would be dropped by the stopword 'and').
        stop_set = set()
        for entry in lst_stopwords:
            if isinstance(entry, str):
                stop_set.add(entry)
            else:
                stop_set.update(entry)
        lst_text = [word for word in lst_text if word not in stop_set]

    # Stemming (remove -ing, -ly, ...). English only: the Indonesian Sastrawi
    # stemmer is currently disabled in this notebook.
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    # Lemmatisation (convert each word into its root word).
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    # Drop literal '0' tokens and a fixed list of filler tokens.
    noise_tokens = {
        '0',
        'aaa', 'aaaa', 'aaaaa', 'abc', 'abcd', 'bb', 'bbb', 'bbbb',
        'ccc', 'cccc', 'thn', 'th', 'bln',
    }
    lst_text = [w for w in lst_text if w not in noise_tokens]

    # Back to a single string.
    return " ".join(lst_text)

def string_escape(s, encoding='utf-8'):
    """Turn literal backslash escapes in ``s`` into the characters they encode.

    The text is taken to bytes via latin1, run through the ``unicode-escape``
    codec to resolve the escape sequences, mapped 1:1 back to bytes via
    latin1, and finally decoded with ``encoding``.
    """
    raw_bytes = s.encode('latin1')  # bytes are required by 'unicode-escape'
    unescaped = raw_bytes.decode('unicode-escape')  # resolve the escape sequences
    payload = unescaped.encode('latin1')  # 1:1 mapping back to bytes
    return payload.decode(encoding)  # decode with the original encoding

# English and Indonesian NLTK stopword lists.
lst_stopwords_en, lst_stopwords_id = (
    nltk.corpus.stopwords.words(language) for language in ("english", "indonesian")
)

In [4]:
# Load the training table and replace label_group with its factorized integer codes.
df = pd.read_csv('../input/shopee-product-matching/train.csv')
label_codes, _ = pd.factorize(df['label_group'])
df['label_group'] = label_codes

In [5]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode ``texts`` chunk by chunk and return all token ids as one array.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable sequence of strings (NumPy array, pandas Series,
            list, ...).
        tokenizer: Object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; each returned encoding must expose ``.ids``.
        chunk_size: Number of texts passed to ``encode_batch`` per call.
        maxlen: Length handed to the tokenizer's truncation and padding setup.

    Returns:
        ``np.ndarray`` built from the per-text id lists, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    # NOTE(review): newer `tokenizers` releases appear to name this argument
    # `length` rather than `max_length`; confirm against the installed version.
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    # tqdm is not imported anywhere in this notebook, so iterate with a plain
    # range instead of a progress bar.
    for start in range(0, len(texts), chunk_size):
        chunk = texts[start:start + chunk_size]
        # Arrays / Series convert via .tolist(); plain sequences via list().
        text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)
        
def regular_encode(texts, tokenizer, maxlen=512):
    """Batch-encode ``texts`` to fixed-length id and attention-mask arrays.

    Padding and truncation are requested explicitly (``padding='max_length'``,
    ``truncation=True``) rather than through the deprecated
    ``pad_to_max_length`` flag, which made the tokenizer warn that truncation
    was not explicitly activated.

    Args:
        texts: List of strings to encode.
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        maxlen: Length every sequence is padded / truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of ``np.ndarray`` built from the
        tokenizer output.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=True,
        return_token_type_ids=True,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids']), np.array(enc_di['attention_mask'])

# Path of the pretrained Indonesian RoBERTa checkpoint, and its tokenizer.
MODEL = '../input/tfroberta-base-indonesian/roberta-base-indonesian-522M'
tokenizer = RobertaTokenizer.from_pretrained(MODEL)
# Token length used when the titles are encoded.
MAX_LEN = 105

In [6]:
# Preprocess the training titles into BERT inputs and MLP token vectors.

# Decode the escaped byte sequences in the raw titles once; both feature
# pipelines below start from the same decoded text.
title_decoded = df['title'].apply(string_escape)

# ---- BERT inputs: light normalisation only (no stemming, lemmatisation or
# stopword removal) ----
df['tmp'] = title_decoded.apply(
    lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=False, lst_stopwords=None))

ids, att_mask = regular_encode(list(df['tmp'].values), tokenizer, maxlen=MAX_LEN)
df['input_ids'] = list(ids)
df['att_mask'] = list(att_mask)
del ids, att_mask

# ---- MLP inputs: heavier cleaning down to the unique tokens of each title ----
df['tmp'] = (
    title_decoded
    .apply(remove_concatenate_2_words)
    .str.lower()
    .apply(remove_punctuation)
    .apply(lambda x: str(x).split())
    .apply(remove_stopwords)
    # .apply(remove_big_number)    # intentionally disabled
    .apply(remove_zero_val)
    # .apply(remove_common_words)  # intentionally disabled
    .apply(remove_strange_words)
    .apply(lambda x: list(np.unique(x)))
)
del title_decoded

# Title vocabulary: every distinct token across all titles.
words = list(np.unique(np.concatenate(list(df['tmp']))))
# Text vectorizer adapted on that vocabulary, then applied to the joined tokens.
model = text_vectorizer(max_features=25000, max_len=100, vocab=words)
list_text = [' '.join(x) for x in df['tmp']]
title_vec = model.predict(list_text)
df['title_vec'] = list(title_vec)
del model, list_text, title_vec, words

n_classes = df['label_group'].nunique()
print(f'n_classes: {n_classes}')

df.to_parquet('/kaggle/working/train.parquet', engine='pyarrow')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


n_classes: 11014
