In [1]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn .model_selection import StratifiedKFold, GroupKFold

from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import string
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training metadata CSV.
df = pd.read_csv('../input/shopee-product-matching/train.csv')

# Replace label_group with the integer codes returned by factorize();
# `unique` keeps the original values so that unique[code] recovers them.
label_codes, unique = df['label_group'].factorize()
df['label_group'] = label_codes
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,0
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",1
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,3
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,4


In [3]:
# Preprocessing helper functions
# Remove listed words that appear concatenated with other words
def remove_concatenate_2_words(text):
    """Delete every occurrence of the listed words from `text`.

    The match is a plain, case-sensitive substring match, so a listed word
    is removed even when it sits inside a longer word.
    """
    cleaned = text
    for filler in ('khusus',):
        cleaned = cleaned.replace(filler, '')
    return cleaned

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in PUNCT_TO_REMOVE deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# NLTK stopword sets; each name carries the language code of the list it holds.
STOPWORDS_EN = set(stopwords.words('english'))
STOPWORDS_ID = set(stopwords.words('indonesian'))
def remove_stopwords(list_text):
    """Return the tokens of `list_text` that are neither Indonesian nor English stopwords.

    list_text: iterable of word tokens.
    Returns a new list; the order of the surviving tokens is preserved.
    """
    return [
        word for word in list_text
        if word not in STOPWORDS_ID and word not in STOPWORDS_EN
    ]

# Split tokens that mix letters and digits, then drop big / irregular numbers
def remove_big_number(list_text):
    """Split each token into digit and non-digit fragments and filter the numbers.

    Non-digit fragments are always kept. A numeric fragment is kept (written
    back without leading zeros) only when it is below 7000 and
      * above 1000 and a multiple of 100, or
      * above 100 (up to 1000) and a multiple of 10, or
      * at most 100 and even.

    list_text: iterable of string tokens.
    Returns a new list of fragments in their original order.
    """
    words = []
    for token in list_text:
        for fragment in re.split(r'(\d+)', token):
            if not fragment:
                # re.split yields '' when a token starts or ends with digits;
                # those empty fragments carry no information.
                continue
            try:
                number = int(fragment)
            except ValueError:
                # Not a number: keep the text fragment unchanged.
                words.append(fragment)
                continue

            if number >= 7000:
                continue
            if number > 1000:
                keep = number % 100 == 0
            elif number > 100:
                keep = number % 10 == 0
            else:
                keep = number % 2 == 0
            if keep:
                words.append(str(number))
    return words

def remove_zero_val(list_text):
    """Return the tokens of `list_text` except those equal to the string '0'."""
    kept = []
    for token in list_text:
        if token in ('0',):
            continue
        kept.append(token)
    return kept

# Marketing / shipping / location filler words, parsed once at definition time.
# The double space in the literal makes split(' ') emit '', so empty tokens
# are part of the set as well.
_COMMON_WORDS = frozenset("hari keren kere kw super baik jual jualan quality best free  kwalitas berkualitas kualitas bagus terbaik kembali dijamin beli gratis murah free diskon ongkir cek berkualitas original asli kualitas uang jaminan jamin terjamin buatan buat kirim wilayah luar kota jawa bali jakarta surabaya bulan month year day tahun hari harian anda your nikmat singapore malaysia indonesia vietnam thailand filipina bangkok jepang buy one get dapat dua two satu meriah kirim send pengiriman paket hemat uang kembali dapat guarantee buatan lokal dalam internasional karya termurah paling murah terbaik cheap murah biaya".split(' '))

def remove_common_words(list_text):
    """Return the tokens of `list_text` that are not in the common-word set.

    list_text: iterable of string tokens.
    Returns a new list; order is preserved. Empty-string tokens are removed
    too, because '' is a member of the set.
    """
    return [w for w in list_text if w not in _COMMON_WORDS]

def remove_strange_words(list_text):
    """Return the tokens of `list_text` that are not one of the listed filler strings."""
    blocked = (
        'aaa', 'aaaa', 'aaaaa',
        'abc', 'abcd',
        'bb', 'bbb', 'bbbb',
        'ccc', 'cccc',
        'thn', 'th', 'bln',
    )
    return list(filter(lambda token: token not in blocked, list_text))

def string_escape(s, encoding='utf-8'):
    r"""Interpret backslash escapes written literally in `s` and decode the result.

    Example: the text ``\xc3\x89`` (eight literal characters) becomes the two
    bytes C3 89, which decode to a single character under UTF-8.

    s: text that may contain literal escape sequences.
    encoding: encoding used to decode the bytes produced by the escapes.

    Both Latin-1 encode steps raise UnicodeEncodeError when the text holds a
    character above U+00FF.
    """
    # 'unicode-escape' works on bytes, so go through Latin-1 first.
    escaped_bytes = s.encode('latin1')
    # Resolve the backslash escape sequences.
    unescaped_text = escaped_bytes.decode('unicode-escape')
    # Latin-1 maps each code point 0-255 back to the byte of the same value.
    payload = unescaped_text.encode('latin1')
    # Finally read those bytes in the requested encoding.
    return payload.decode(encoding)

In [4]:
def text_vectorizer(max_features, max_len, vocab):
    """Build a Keras model that turns raw strings into integer token sequences.

    max_features: maximum vocabulary size (passed to the layer as max_tokens).
    max_len: sequence length the outputs are padded to.
    vocab: strings the vectorization layer is adapted on.

    Returns a Sequential model made of a string Input followed by the
    adapted TextVectorization layer.
    """
    vectorizer = TextVectorization(
        output_mode='int',
        max_tokens=max_features,
        output_sequence_length=max_len,
    )

    # Fit the layer's vocabulary on the supplied strings, in batches of 64.
    vectorizer.adapt(tf.data.Dataset.from_tensor_slices(vocab).batch(64))

    return tf.keras.models.Sequential([
        tf.keras.Input(shape=(1,), dtype=tf.string),
        vectorizer,
    ])

In [5]:
# Preprocess the title and image phash columns of df
# NOTE(review): remove_concatenate_2_words runs before lowercasing, so only
# lowercase occurrences of its words are stripped — confirm that is intended.
df['title'] = (
    df['title']
    .apply(string_escape)
    .apply(remove_concatenate_2_words)
    .str.lower()
    .apply(remove_punctuation)
    .apply(lambda text: str(text).split())  # whitespace tokenization
    .apply(remove_stopwords)
    # remove_big_number is not applied here (the step is disabled)
    .apply(remove_zero_val)
    .apply(remove_common_words)
    .apply(remove_strange_words)
    # np.unique both de-duplicates and sorts the tokens of each title
    .apply(lambda tokens: list(np.unique(tokens)))
)

# title vocab: every distinct token across all titles
words = list(np.unique(np.concatenate(list(df['title']))))

# phash vocab: every distinct character used in the hashes
phash = list(df['image_phash'].apply(lambda value: list(str(value))))
phash = list(np.unique(np.concatenate(phash)))

# Text vectorizer (titles)
# NOTE(review): each vocab entry occurs once in `words`, so if there are more
# distinct tokens than max_features the cut-off cannot be frequency based;
# the sample output shows index 1 for tokens such as '12' — presumably the
# out-of-vocabulary id. Confirm this truncation is intended.
model = text_vectorizer(max_features=25000, max_len=100, vocab=words)
list_text = list(map(' '.join, df['title']))
title_vec = model.predict(list_text)
df['title_vec'] = list(title_vec)

# Text vectorizer (phash): joining a hash string with spaces yields one
# token per character.
model = text_vectorizer(max_features=25, max_len=25, vocab=phash)
list_text = list(map(' '.join, df['image_phash']))
phash_vec = model.predict(list_text)
df['phash_vec'] = list(phash_vec)


n_classes = df['label_group'].nunique()
print(f'n_classes: {n_classes}')

# save to file
df.to_parquet('./train.parquet', engine='pyarrow')

n_classes: 11014


In [6]:
# Read the parquet file at ./train.parquet back into a new frame and preview its first rows.
df_train = pd.read_parquet('./train.parquet', engine='pyarrow')
df_train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,title_vec,phash_vec
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,"[bag, paper, secret, victoria]",0,"[22885, 8826, 5485, 1546, 0, 0, 0, 0, 0, 0, 0,...","[8, 13, 8, 10, 13, 2, 8, 14, 10, 4, 13, 5, 15,..."
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"[12, 3m, 45, double, foam, mm, tape, vhb, x]",1,"[1, 1, 1, 18683, 17293, 10679, 3298, 1561, 878...","[7, 2, 14, 2, 8, 13, 11, 17, 5, 15, 9, 14, 9, ..."
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,"[397, canned, gr, luncheon, maling, meat, pork...",2,"[1, 20880, 16430, 11983, 11679, 11302, 7680, 2...","[6, 8, 13, 5, 6, 17, 17, 3, 4, 14, 3, 12, 17, ..."
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,"[acak, alhadi, batik, campur, daster, dpt00100...",3,"[24337, 23981, 22642, 20908, 19313, 18663, 140...","[9, 12, 16, 13, 2, 5, 12, 9, 3, 7, 2, 3, 7, 15..."
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,"[220ml, latte, nescafe, éclair]",4,"[1, 12631, 9898, 271, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7, 11, 2, 14, 16, 8, 2, 8, 15, 13, 7, 4, 10, ..."
