In [1]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import nltk
nltk.download('punkt')
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import time
from datetime import timedelta

2024-12-10 16:21:30.916694: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 16:21:30.933758: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-10 16:21:30.939240: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-10 16:21:30.953608: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

In [2]:
# Report whether PyTorch can use CUDA and list every GPU it can see.
cuda_available = torch.cuda.is_available()
num_gpus = torch.cuda.device_count()

print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {num_gpus}")

# range(0) is empty, so no explicit "is there any GPU?" guard is required.
for gpu_index in range(num_gpus):
    print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

CUDA available: True
CUDA version: 11.7
Number of GPUs: 1
GPU 0: NVIDIA A100-SXM4-80GB


In [3]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
import pickle

# Restore the pickled Keras tokenizer from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
tokenizer_pickle_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/indo_keras_tokenizer.pickle'
with open(tokenizer_pickle_path, 'rb') as tokenizer_file:
    keras_tokenizer = pickle.load(tokenizer_file)

# One slot more than the number of known words (presumably index 0 is
# reserved for padding and word indices start at 1 -- confirm).
vocab_size = 1 + len(keras_tokenizer.word_index)
print('Vocab size:', vocab_size)

Vocab size: 2521


In [5]:
def check_coverage(vocab_list, tokenizer, is_bert=True):
    """Report how many words of ``vocab_list`` the tokenizer can represent.

    A word is counted as completely unknown when the tokenizer produces no
    tokens for it, or produces exactly one token that is either the
    tokenizer's unknown-token marker or absent from its vocabulary. Every
    other word (one known token, or a split into several pieces) is counted
    as recognized.

    Args:
        vocab_list: Sized collection of word strings to check.
        tokenizer: Object with ``tokenize(word)`` and either a ``vocab``
            mapping or a ``get_vocab()`` method. If it exposes ``unk_token``
            that marker is used.
        is_bert: Fallback only, used when the tokenizer has no ``unk_token``:
            ``True`` selects ``'[UNK]'``, ``False`` selects ``'<UNK>'``.

    Returns:
        List of the completely unknown words, in input order.
    """
    # Ask the tokenizer for its real unknown marker (e.g. '<unk>' for
    # SentencePiece models); the hard-coded guess is only a fallback.
    unk_token = getattr(tokenizer, 'unk_token', None)
    if unk_token is None:
        unk_token = '[UNK]' if is_bert else '<UNK>'
    unk_token = str(unk_token)

    if hasattr(tokenizer, 'vocab'):
        tokenizer_vocab = set(tokenizer.vocab.keys())
    else:
        tokenizer_vocab = set(tokenizer.get_vocab().keys())

    true_oov = []
    for word in tqdm(vocab_list, desc="Checking tokenizer coverage"):
        tokens = tokenizer.tokenize(word)

        produced_nothing = len(tokens) == 0
        single_unknown = (
            len(tokens) == 1
            and (tokens[0] == unk_token or tokens[0] not in tokenizer_vocab)
        )
        if produced_nothing or single_unknown:
            true_oov.append(word)

    total_words = len(vocab_list)
    num_words_not_found = len(true_oov)
    num_words_found = total_words - num_words_not_found

    def _share(count):
        # Avoid ZeroDivisionError when the vocabulary is empty.
        return count / total_words if total_words else 0.0

    print('\nTokenizer Coverage Analysis:')
    print(f'Words recognized (whole/subwords): {num_words_found}/{total_words} ({_share(num_words_found):.2%})')
    print(f'Words completely unknown: {num_words_not_found}/{total_words} ({_share(num_words_not_found):.2%})')

    return true_oov

In [6]:
def initialize_embedding_matrix(keras_tokenizer, tokenizer, model, embedding_matrix, model_type):
    """Fill ``embedding_matrix`` with one mean-pooled model vector per word.

    Each word in ``keras_tokenizer.word_index`` is tokenized on its own
    (without special tokens), passed through ``model`` under
    ``torch.no_grad()``, and the final hidden states of its tokens are
    averaged into a single vector stored at the word's index.

    Args:
        keras_tokenizer: Object whose ``word_index`` maps word -> row index.
        tokenizer: Callable returning model inputs that support ``.to(device)``
            and ``**`` unpacking.
        model: Callable model. ``model.device`` is used when available,
            otherwise the notebook-level ``device`` variable.
        embedding_matrix: Array indexed by row; modified IN PLACE and also
            returned.
        model_type: ``'encoder'`` (reads ``last_hidden_state``) or
            ``'decoder'`` (reads ``hidden_states[-1]``); case-insensitive.

    Returns:
        Tuple ``(embedding_matrix, word_embeddings)`` where
        ``word_embeddings`` maps index -> ``(word, vector)`` for every word
        that was processed successfully.

    Raises:
        ValueError: If ``model_type`` is not 'encoder' or 'decoder'.

    Notes:
        A failure on an individual word is printed and that word is skipped,
        leaving its row untouched. The model's train/eval mode is not changed
        here.
    """
    # Validate once, up front: inside the per-word handler a bad value would
    # be swallowed and reported once per word while returning an empty matrix.
    if not isinstance(model_type, str) or model_type.lower() not in ('encoder', 'decoder'):
        raise ValueError("model_type must be 'encoder' or 'decoder'")
    use_hidden_states = model_type.lower() == 'decoder'

    # Send inputs to wherever the model lives; fall back to the notebook
    # global when the model does not expose a device.
    target_device = getattr(model, 'device', None)
    if target_device is None:
        target_device = device

    start_time = time.time()
    word_embeddings = {}

    try:
        for word, i in tqdm(keras_tokenizer.word_index.items(), desc="Processing words"):
            try:
                inputs = tokenizer(word, return_tensors="pt", add_special_tokens=False).to(target_device)

                with torch.no_grad():
                    if use_hidden_states:
                        outputs = model(**inputs, output_hidden_states=True)
                        token_states = outputs.hidden_states[-1][0]
                    else:
                        token_states = model(**inputs).last_hidden_state[0]

                    # Averaging over zero tokens would silently store NaNs.
                    if token_states.shape[0] == 0:
                        raise ValueError("tokenizer produced no tokens")

                    embedding = token_states.mean(dim=0).cpu().numpy()

                embedding_matrix[i] = embedding
                word_embeddings[i] = (word, embedding)

            except Exception as e:
                # Best effort: report the word and keep going.
                print(f"Error processing word '{word}': {str(e)}")
                continue

    finally:
        # Release cached GPU blocks once, rather than after every word.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        end_time = time.time()
        elapsed_time = end_time - start_time
        elapsed_time_formatted = str(timedelta(seconds=int(elapsed_time)))

        print(f"\nTotal waktu eksekusi: {elapsed_time_formatted}")
        print(f"Total waktu dalam detik: {elapsed_time:.2f} detik")

    return embedding_matrix, word_embeddings

In [7]:
# Read the cleaned SMS spreadsheet into a DataFrame and display it.
sms_dataset_path = '/home/basilmusyaffa19/Skripsi Basil/Dataset/FIX/clean_SMS_22112024.xlsx'
df = pd.read_excel(sms_dataset_path, engine='openpyxl')
df

Unnamed: 0,teks,label
0,promo beli paket flash mulai aplikasi my telko...,1
1,gb hari rp ribu spesial buat kamu pilih aktif ...,1
2,plg hormat sisa kuota flash download aplikasi ...,1
3,mohon hormat sisa kuota flash download aplikas...,1
4,hari rp ribu khusus buat kamu pilih aktif seka...,1
...,...,...
1138,yooo sama sama aku umum kelompok kelas,0
1139,pernah tulis cadar belum pikir warna jeans,0
1140,bu mau kirim,0
1141,berangkat pagi mau tunai transfer,0


In [8]:
print("Jumlah data sebelum:", len(df))

df = (
    df.replace('', np.nan)        # treat empty strings as missing values
      .dropna()                   # drop rows with a missing value in any column
      .dropna(subset=['teks'])    # make sure the text column itself is present
)

# Discard rows whose text cell holds a float instead of a string
text_is_float = df['teks'].apply(lambda value: isinstance(value, float))
df = df[~text_is_float]

# Keep only the first occurrence of every distinct text
df = df.drop_duplicates(subset=['teks'], keep='first')

print("Jumlah data setelah:", len(df))

Jumlah data sebelum: 1143
Jumlah data setelah: 1138


In [9]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel

# Load the tokenizer and the model from the same checkpoint, then move the
# model to the selected device.
xlmr_checkpoint = 'xlm-roberta-large'
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained(xlmr_checkpoint)
xlmr_model = XLMRobertaModel.from_pretrained(xlmr_checkpoint)
xlmr_model = xlmr_model.to(device)



In [10]:
# Check how much of the Keras vocabulary the XLM-R tokenizer can represent.
vocab_list = list(keras_tokenizer.word_index.keys())
oov_words = check_coverage(vocab_list, xlmr_tokenizer, is_bert=False)

# Show a few examples of words that are not recognized at all.
# Uses the same tokenizer that was checked above; the previous name
# `bert_tokenizer` is not defined in this notebook and would raise NameError
# as soon as `oov_words` is non-empty.
print("\nContoh kata yang tidak dikenali sama sekali:")
for word in oov_words[:10]:
    tokens = xlmr_tokenizer.tokenize(word)
    print(f"'{word}' -> {tokens}")

Checking tokenizer coverage: 100%|██████████| 2520/2520 [00:00<00:00, 90480.38it/s]


Tokenizer Coverage Analysis:
Words recognized (whole/subwords): 2520/2520 (100.00%)
Words completely unknown: 0/2520 (0.00%)

Contoh kata yang tidak dikenali sama sekali:





In [11]:
# Print the loaded model's configuration; hidden_size from it is used for the embedding width.
print(xlmr_model.config)

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-large",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.45.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



In [12]:
# Allocate a zero matrix with one row per vocabulary index and one column
# per hidden unit of the model.
embedding_dim = xlmr_model.config.hidden_size
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

In [13]:
# Sanity check: show the embedding width and the (rows, columns) shape of the allocated matrix.
print("Dimensi embedding:", embedding_dim)
print("Dimensi embedding matrix:", embedding_matrix.shape)

Dimensi embedding: 1024
Dimensi embedding matrix: (2521, 1024)


In [17]:
# Fill the matrix with XLM-R vectors. The function writes into
# `embedding_matrix` in place and returns that same array, so
# `embedding_matrix_xlmr` and `embedding_matrix` refer to one object.
embedding_matrix_xlmr, word_embeddings_xlmr = initialize_embedding_matrix(
    keras_tokenizer=keras_tokenizer,
    tokenizer=xlmr_tokenizer,
    model=xlmr_model,
    embedding_matrix=embedding_matrix,
    model_type='encoder',
)

Processing words: 100%|██████████| 2520/2520 [00:43<00:00, 57.57it/s]


Total waktu eksekusi: 0:00:43
Total waktu dalam detik: 43.78 detik





In [15]:
# Display the filled matrix (NumPy abbreviates large arrays in the output).
embedding_matrix_xlmr

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0093585 , -0.02203197,  0.18635076, ...,  0.06333438,
         0.06612749, -0.07377793],
       [-0.02798257, -0.01775952,  0.26358303, ...,  0.33430958,
         0.22786261, -0.0574947 ],
       ...,
       [ 0.04638209, -0.04529408,  0.07211487, ...,  0.29261225,
         0.06969289, -0.31357098],
       [ 0.23080245, -0.29851806,  0.19955362, ...,  0.72080779,
         0.12128031, -0.13525267],
       [-0.25753355, -0.30937615,  0.13065648, ..., -0.03193025,
        -0.2279087 , -0.13841757]])

In [16]:
# Preview the first five stored (word, vector) entries.
first_entries = list(word_embeddings_xlmr.items())[:5]
for token_index, (token_text, token_vector) in first_entries:
    print(f"Index: {token_index}")
    print(f"Word: {token_text}")
    # Convert only when the stored vector is not already a NumPy array.
    vector_np = token_vector if isinstance(token_vector, np.ndarray) else np.array(token_vector)
    print(f"Vector (first 10 values): {vector_np[:10]}")
    print("---")

Index: 1
Word: dapat
Vector (first 10 values): [ 0.0093585  -0.02203197  0.18635076 -0.21273486  0.27472043 -0.11556941
 -0.5545718  -0.51569575  0.01428739  0.08894514]
---
Index: 2
Word: info
Vector (first 10 values): [-0.02798257 -0.01775952  0.26358303 -0.08474933  0.23249489  0.22397552
 -0.45022127 -0.23172297  0.03183522 -0.10485303]
---
Index: 3
Word: rp
Vector (first 10 values): [-0.2264594   0.3285705   0.11480813 -0.20484537  0.10054754 -0.06819888
 -0.25244507 -0.69213605 -0.21278243  0.09682664]
---
Index: 4
Word: pin
Vector (first 10 values): [ 0.3447512   0.10992728 -0.17589007 -0.5994565  -0.35437897  0.41128403
 -0.64616644 -0.3930797  -0.08763904 -0.05837632]
---
Index: 5
Word: sms
Vector (first 10 values): [ 0.01071113 -0.48916233  0.21118851 -0.591316   -0.24247213  0.27745128
 -0.56149185 -0.00657092  0.02028463 -0.10136258]
---


In [18]:
# Persist the filled embedding matrix as a .npy file at a fixed absolute location.
path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/Hasil Embedding/21 Nov/embedding_matrix_XLMroBERTaLarge_1024D_21112024.npy'
np.save(path, embedding_matrix_xlmr)

In [19]:
import os
# Report the on-disk size of the saved matrix in MB. Reuses `path` from the
# save cell so the file that is measured is always the file that was written,
# instead of a second copy of the literal that could drift out of sync.
file_size = os.path.getsize(path)
print(f"Ukuran file: {file_size/1024/1024:.2f} MB")

Ukuran file: 19.70 MB
