In [1]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import nltk
nltk.download('punkt')
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import time
from datetime import timedelta

2024-12-30 02:31:09.887382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-30 02:31:09.904696: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-30 02:31:09.910615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 02:31:09.923278: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

In [2]:
# Query PyTorch once for CUDA visibility and the number of visible GPUs,
# then report both.
cuda_available = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()

print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {num_gpus}")

# range(0) is empty, so no explicit "num_gpus > 0" guard is needed.
for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

CUDA available: True
CUDA version: 11.7
Number of GPUs: 1
GPU 0: NVIDIA A100-SXM4-80GB


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load tokenizer pickles that were produced by a trusted source.
tokenizer_pickle_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/indo_keras_tokenizer.pickle'
with open(tokenizer_pickle_path, mode='rb') as handle:
    keras_tokenizer = pickle.load(handle)

# One more slot than the number of known words -- presumably because the
# tokenizer's word indices start at 1, leaving index 0 unused (TODO confirm).
vocab_size = 1 + len(keras_tokenizer.word_index)
print('Vocab size:', vocab_size)

Vocab size: 2521


In [5]:
def check_coverage(vocab_list, tokenizer, is_bert=True):
    """Report how many words of ``vocab_list`` the given tokenizer can represent.

    A word is classified as "completely unknown" when the tokenizer turns it
    into exactly one token and that token is either the unknown-token marker
    or is absent from the tokenizer's vocabulary. Every other word (a single
    known token, or any multi-token split) counts as recognized.

    A short summary is printed; percentages are reported as 0.00% when
    ``vocab_list`` is empty instead of dividing by zero.

    Args:
        vocab_list: Sized iterable of words to check.
        tokenizer: Object exposing ``tokenize(word)`` and either a ``vocab``
            mapping or a ``get_vocab()`` method. When it carries a non-empty
            ``unk_token`` attribute, that value is used as the unknown marker.
        is_bert: Fallback used only when the tokenizer exposes no
            ``unk_token``: ``'[UNK]'`` if true, ``'<UNK>'`` otherwise.

    Returns:
        list: The words classified as completely unknown, in input order.
    """
    # Trust the tokenizer's own unknown marker first; a hard-coded guess would
    # miss markers such as '<unk>', which are themselves part of the vocabulary
    # and would therefore be counted as "recognized".
    fallback_unk = '[UNK]' if is_bert else '<UNK>'
    unk_token = getattr(tokenizer, 'unk_token', None) or fallback_unk

    # Collect every vocabulary entry known to the LLM tokenizer.
    if hasattr(tokenizer, 'vocab'):
        tokenizer_vocab = set(tokenizer.vocab.keys())
    else:
        tokenizer_vocab = set(tokenizer.get_vocab().keys())

    true_oov = []
    for word in tqdm(vocab_list, desc="Checking tokenizer coverage"):
        # Tokenize each Keras-tokenizer vocabulary word with the LLM tokenizer.
        tokens = tokenizer.tokenize(word)

        if (len(tokens) == 1 and
                (tokens[0] == unk_token or tokens[0] not in tokenizer_vocab)):
            true_oov.append(word)

    total_words = len(vocab_list)
    num_words_not_found = len(true_oov)
    num_words_found = total_words - num_words_not_found

    # Guard the ratios so an empty vocabulary does not raise ZeroDivisionError.
    found_ratio = num_words_found / total_words if total_words else 0.0
    not_found_ratio = num_words_not_found / total_words if total_words else 0.0

    print('\nTokenizer Coverage Analysis:')
    print(f'Words recognized (whole/subwords): {num_words_found}/{total_words} ({found_ratio:.2%})')
    print(f'Words completely unknown: {num_words_not_found}/{total_words} ({not_found_ratio:.2%})')

    return true_oov

In [6]:
def initialize_embedding_matrix(keras_tokenizer, tokenizer, model, embedding_matrix, model_type):
    """Fill ``embedding_matrix`` with one mean-pooled model embedding per Keras vocabulary word.

    Each word in ``keras_tokenizer.word_index`` is tokenized on its own
    (without special tokens), passed through ``model``, and the last hidden
    state is averaged over the token axis. The resulting vector is written to
    row ``i`` of ``embedding_matrix``, where ``i`` is the word's Keras index.

    Words that fail (tokenization, forward pass, or row assignment) are
    reported with ``print`` and skipped; their rows are left untouched.
    Total elapsed time is printed when the function exits.

    Args:
        keras_tokenizer: Object with a ``word_index`` mapping of word -> index.
        tokenizer: Callable tokenizer; called as
            ``tokenizer(word, return_tensors="pt", add_special_tokens=False)``
            and expected to return a mapping with a ``.to(device)`` method.
        model: Model called as ``model(**inputs)``. Inputs are moved to the
            device of the model's first parameter (CPU if it has none).
        embedding_matrix: Array indexed by Keras word index; MODIFIED IN PLACE.
        model_type: ``'encoder'`` (reads ``.last_hidden_state``) or
            ``'decoder'`` (requests ``output_hidden_states=True`` and reads
            ``.hidden_states[-1]``); case-insensitive.

    Returns:
        tuple: ``(embedding_matrix, word_embeddings)`` where the first item is
        the same object that was passed in and ``word_embeddings`` maps
        index -> ``(word, embedding)``.

    Raises:
        ValueError: If ``model_type`` is neither ``'encoder'`` nor ``'decoder'``.
    """
    # Validate once, up front. Inside the per-word handler below this error
    # would be swallowed and the caller would silently get an unfilled matrix.
    kind = model_type.lower()
    if kind not in ('encoder', 'decoder'):
        raise ValueError("model_type must 'encoder' or 'decoder'")

    # Follow the model's own placement instead of relying on a notebook-global
    # `device` variable that may be undefined or stale.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = torch.device("cpu")

    # Dropout must be inactive for deterministic embeddings; remember the
    # previous mode so the caller's model is handed back unchanged.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    start_time = time.time()
    word_embeddings = {}

    try:
        for word, i in tqdm(keras_tokenizer.word_index.items(), desc="Processing words"):
            try:
                # Tokenize the Keras-tokenizer vocabulary word with the LLM tokenizer.
                inputs = tokenizer(word, return_tensors="pt", add_special_tokens=False).to(model_device)

                with torch.no_grad():
                    # Take the last hidden state.
                    if kind == 'encoder':
                        last_hidden_state = model(**inputs).last_hidden_state
                    else:
                        outputs = model(**inputs, output_hidden_states=True)
                        last_hidden_state = outputs.hidden_states[-1]
                    # First (only) batch item, averaged over the token axis.
                    embedding = last_hidden_state[0].mean(dim=0).cpu().numpy()

                # Store the embedding at the row given by the Keras tokenizer index.
                embedding_matrix[i] = embedding
                word_embeddings[i] = (word, embedding)

            except Exception as e:
                # Best effort: report the word and move on to the next one.
                print(f"Error processing word '{word}': {str(e)}")

    finally:
        if was_training:
            model.train()

        # Release cached GPU blocks once at the end instead of on every word,
        # which only slows the loop down.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        end_time = time.time()
        elapsed_time = end_time - start_time
        elapsed_time_formatted = str(timedelta(seconds=int(elapsed_time)))

        print(f"\nTotal waktu eksekusi: {elapsed_time_formatted}")
        print(f"Total waktu dalam detik: {elapsed_time:.2f} detik")

    return embedding_matrix, word_embeddings

In [7]:
# Load the cleaned SMS dataset from its Excel workbook and display it.
dataset_path = '/home/basilmusyaffa19/Skripsi Basil/Dataset/FIX/clean_SMS_22112024.xlsx'
df = pd.read_excel(dataset_path, engine='openpyxl')
df

Unnamed: 0,teks,label
0,promo beli paket flash mulai aplikasi my telko...,1
1,gb hari rp ribu spesial buat kamu pilih aktif ...,1
2,plg hormat sisa kuota flash download aplikasi ...,1
3,mohon hormat sisa kuota flash download aplikas...,1
4,hari rp ribu khusus buat kamu pilih aktif seka...,1
...,...,...
1138,yooo sama sama aku umum kelompok kelas,0
1139,pernah tulis cadar belum pikir warna jeans,0
1140,bu mau kirim,0
1141,berangkat pagi mau tunai transfer,0


In [8]:
print("Jumlah data sebelum:", len(df))

df = (
    df
    # Remove empty data: empty strings become NaN, then any row with a NaN is dropped
    .replace('', np.nan)
    .dropna()
    # Drop rows whose text is NaN
    .dropna(subset=['teks'])
    # Remove rows whose text value is a float rather than a string
    .loc[lambda frame: ~frame['teks'].apply(lambda value: isinstance(value, float))]
    # Remove duplicated texts, keeping the first occurrence
    .drop_duplicates(subset=['teks'], keep='first')
)

print("Jumlah data setelah:", len(df))

Jumlah data sebelum: 1143
Jumlah data setelah: 1138


In [9]:
from transformers import AutoModel, AutoTokenizer

# Tokenizer and model are loaded from the same checkpoint name, stated once.
bert_checkpoint = "indobenchmark/indobert-large-p1"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert_model = AutoModel.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(device)

  return self.fget.__get__(instance, owner)()


In [10]:
# Iterating the word_index mapping yields its keys, i.e. every Keras vocabulary word.
vocab_list = list(keras_tokenizer.word_index)
oov_words = check_coverage(vocab_list, tokenizer=bert_tokenizer, is_bert=True)

# Show a few examples of words that are not recognized at all
print("\nContoh kata yang tidak dikenali sama sekali:")
for word in oov_words[:10]:
    tokens = bert_tokenizer.tokenize(word)
    print(f"'{word}' -> {tokens}")

Checking tokenizer coverage: 100%|██████████| 2520/2520 [00:00<00:00, 27303.49it/s]


Tokenizer Coverage Analysis:
Words recognized (whole/subwords): 2520/2520 (100.00%)
Words completely unknown: 0/2520 (0.00%)

Contoh kata yang tidak dikenali sama sekali:





In [11]:
# Print the loaded model's configuration; its hidden_size is what the
# notebook later uses as the embedding dimension.
print(bert_model.config)

BertConfig {
  "_name_or_path": "indobenchmark/indobert-large-p1",
  "_num_labels": 5,
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_vers

In [12]:
# One row per Keras word index, one column per hidden unit of the model;
# rows start as zeros and stay zero for any index that is never filled.
embedding_dim = bert_model.config.hidden_size
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

In [13]:
# Sanity check: the matrix shape should be (vocab_size, embedding_dim).
print("Dimensi embedding:", embedding_dim)
print("Dimensi embedding matrix:", embedding_matrix.shape)

Dimensi embedding: 1024
Dimensi embedding matrix: (2521, 1024)


In [14]:
# The function writes into `embedding_matrix` in place and returns that same
# array, so `embedding_matrix_BERT` and `embedding_matrix` refer to one object.
embedding_matrix_BERT, word_embeddings_BERT = initialize_embedding_matrix(
    keras_tokenizer=keras_tokenizer,
    tokenizer=bert_tokenizer,
    model=bert_model,
    embedding_matrix=embedding_matrix,
    model_type='encoder',
)

Processing words: 100%|██████████| 2520/2520 [00:38<00:00, 65.71it/s]


Total waktu eksekusi: 0:00:38
Total waktu dalam detik: 38.35 detik





In [15]:
# Display the filled matrix (NumPy abbreviates large arrays with "...").
# Row 0 is expected to remain all zeros -- presumably because no word has
# index 0 in the Keras tokenizer (TODO confirm).
embedding_matrix_BERT

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.40297502, -0.25745982,  1.86437404, ..., -0.47794813,
         0.66742909, -2.65763545],
       [ 0.40340644, -0.26850891,  1.87049556, ..., -0.48913869,
         0.66734242, -2.6521647 ],
       ...,
       [ 0.25861955, -0.00424888,  1.83511174, ..., -0.47858012,
         0.60898656, -2.57891917],
       [ 0.4075101 , -0.35284889,  1.89335084, ..., -0.4616366 ,
         0.68235916, -2.71083069],
       [ 0.53624713, -0.94921613,  1.73413193, ..., -0.02876114,
         0.95431006, -2.94930148]])

In [16]:
# Peek at the first five index -> (word, vector) entries as a sanity check.
first_entries = list(word_embeddings_BERT.items())[:5]
for idx, (word, vector) in first_entries:
    # Arrays pass through unchanged; anything else is converted to an array.
    vector_np = np.asarray(vector)
    print(f"Index: {idx}")
    print(f"Word: {word}")
    print(f"Vector (first 10 values): {vector_np[:10]}")
    print("---")

Index: 1
Word: dapat
Vector (first 10 values): [ 0.40297502 -0.25745982  1.864374    0.33416057 -1.9402444  -1.0983953
 -0.9724231  -1.3997627   0.55408114 -1.6336396 ]
---
Index: 2
Word: info
Vector (first 10 values): [ 0.40340644 -0.2685089   1.8704956   0.33167115 -1.9378407  -1.0883396
 -0.9734533  -1.4064689   0.5547063  -1.6310918 ]
---
Index: 3
Word: rp
Vector (first 10 values): [ 0.11514442  0.25567952  1.8145393   0.62990904 -1.9298229  -1.2689822
 -1.2203645  -1.3089141   0.54101557 -1.6077067 ]
---
Index: 4
Word: pin
Vector (first 10 values): [ 0.33033907 -0.11559957  1.8480192   0.4054546  -1.9489185  -1.1717795
 -1.0442709  -1.3714694   0.5616242  -1.6209968 ]
---
Index: 5
Word: sms
Vector (first 10 values): [ 0.38031676 -0.20306265  1.8663999   0.36625448 -1.94487    -1.1321994
 -1.0027246  -1.395241    0.5569555  -1.619472  ]
---


In [17]:
#index_word = {v: k for k, v in keras_tokenizer.word_index.items()}

#def get_word_from_id(word_id):
    #return index_word.get(word_id, "Unknown ID")

#word_id = 2898 
#word = get_word_from_id(word_id)
#print(f"Word for ID {word_id}: {word}")

#index = 2898
#embedding_matrix_BERT[index-1]

#with open("tokenizer_output.txt", "w") as file:
    #for word, index in keras_tokenizer.word_index.items():
        #file.write(f"ID: {index}, Word: {word}\n")

In [18]:
# Persist the embedding matrix as a .npy file at a fixed location.
path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/Hasil Embedding/21 Nov/embedding_matrix_indoBERTLarge_1024D_21112024.npy'
np.save(file=path, arr=embedding_matrix_BERT)

In [19]:
import os

# Measure the file at `path` (the location the matrix was saved to) rather
# than repeating the long literal, so the reported size can never refer to a
# different file if the save location is edited.
file_size = os.path.getsize(path)
# Convert bytes to mebibytes for display.
print(f"Ukuran file: {file_size/1024/1024:.2f} MB")

Ukuran file: 19.70 MB
