In [1]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import nltk
nltk.download('punkt')
from tqdm import tqdm

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import time
from datetime import timedelta

2024-11-21 20:59:40.344503: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-21 20:59:40.369828: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-21 20:59:40.375374: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-21 20:59:40.391948: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

In [2]:
# Report whether PyTorch can use CUDA and list every device it exposes.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA version: {torch.version.cuda}")

num_gpus = torch.cuda.device_count()
print(f"Number of GPUs: {num_gpus}")
# An empty range simply skips the loop, so no separate "> 0" guard is needed.
for gpu_index in range(num_gpus):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"GPU {gpu_index}: {gpu_name}")

CUDA available: True
CUDA version: 11.7
Number of GPUs: 1
GPU 0: NVIDIA A100-SXM4-80GB


In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
import pickle

# Location of the pickled Keras tokenizer fitted on the English dataset.
tokenizer_pickle_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Inggris/english_keras_tokenizer.pickle'

# NOTE(review): pickle.load can execute arbitrary code from the file it
# reads, so only point this at a file you created or otherwise trust.
with open(tokenizer_pickle_path, 'rb') as handle:
    keras_tokenizer = pickle.load(handle)

# One more slot than there are words in word_index.
vocab_size = 1 + len(keras_tokenizer.word_index)
print('Vocab size:', vocab_size)

Vocab size: 6588


In [5]:
def check_coverage(vocab_list, tokenizer, is_bert=True):
    """Report how many words the tokenizer can represent and return the rest.

    A word is classified as unknown only when it tokenizes to exactly one
    token and that token is an unknown-token marker or is absent from the
    tokenizer's vocabulary. Every other outcome, including a split into
    several sub-tokens, counts as recognized.

    Args:
        vocab_list: Sized iterable of word strings to check.
        tokenizer: Object exposing ``tokenize(word)`` and either a ``vocab``
            mapping or a ``get_vocab()`` method.
        is_bert: Selects the fallback unknown marker: ``'[UNK]'`` when true,
            ``'<UNK>'`` otherwise. If the tokenizer declares its own
            ``unk_token`` attribute, that marker is honoured as well.

    Returns:
        List of the words classified as unknown, in input order.
    """
    # The hard-coded marker is only a guess; also trust what the tokenizer
    # itself declares as its unknown token, when it declares one.
    unk_markers = {'[UNK]' if is_bert else '<UNK>'}
    declared_unk = getattr(tokenizer, 'unk_token', None)
    if declared_unk is not None:
        unk_markers.add(declared_unk)

    if hasattr(tokenizer, 'vocab'):
        tokenizer_vocab = set(tokenizer.vocab.keys())
    else:
        tokenizer_vocab = set(tokenizer.get_vocab().keys())

    true_oov = []
    for word in tqdm(vocab_list, desc="Checking tokenizer coverage"):
        tokens = tokenizer.tokenize(word)
        if len(tokens) == 1 and (tokens[0] in unk_markers or tokens[0] not in tokenizer_vocab):
            true_oov.append(word)

    total_words = len(vocab_list)
    num_words_not_found = len(true_oov)
    num_words_found = total_words - num_words_not_found

    def _share(count):
        # An empty vocabulary would otherwise raise ZeroDivisionError.
        return count / total_words if total_words else 0.0

    print('\nTokenizer Coverage Analysis:')
    print(f'Words recognized (whole/subwords): {num_words_found}/{total_words} ({_share(num_words_found):.2%})')
    print(f'Words completely unknown: {num_words_not_found}/{total_words} ({_share(num_words_not_found):.2%})')

    return true_oov

In [6]:
def initialize_embedding_matrix(keras_tokenizer, tokenizer, model, embedding_matrix, model_type):
    """Fill ``embedding_matrix`` with one mean-pooled model vector per word.

    Each word in ``keras_tokenizer.word_index`` is tokenized on its own
    (without special tokens), passed through ``model`` with gradients
    disabled, and the final hidden states of its tokens are averaged into a
    single vector that is written to the row given by the word's index.

    Args:
        keras_tokenizer: Object whose ``word_index`` maps word -> row index.
        tokenizer: Callable tokenizer returning a batch that supports
            ``.to(device)`` and ``**`` unpacking into the model call.
        model: Model called as ``model(**inputs)``. For ``'encoder'`` its
            output must expose ``last_hidden_state``; for ``'decoder'`` it is
            called with ``output_hidden_states=True`` and the last entry of
            ``hidden_states`` is used.
        embedding_matrix: Array indexed by row; it is modified in place.
        model_type: ``'encoder'`` or ``'decoder'`` (case-insensitive).

    Returns:
        Tuple ``(embedding_matrix, word_embeddings)`` where the first item is
        the same array object that was passed in and the second maps row
        index -> ``(word, vector)``.

    Raises:
        ValueError: If ``model_type`` is neither ``'encoder'`` nor ``'decoder'``.

    Notes:
        Inputs are moved to the notebook-level ``device`` variable. A failure
        on an individual word is printed and that word is skipped, leaving its
        row untouched.
    """
    # Validate once, outside the per-word try/except; inside it the error was
    # swallowed and printed for every word instead of reaching the caller.
    mode = model_type.lower()
    if mode not in ('encoder', 'decoder'):
        raise ValueError("model_type must 'encoder' or 'decoder'")

    start_time = time.time()
    word_embeddings = {}

    try:
        for word, i in tqdm(keras_tokenizer.word_index.items(), desc="Processing words"):
            try:
                inputs = tokenizer(word, return_tensors="pt", add_special_tokens=False).to(device)

                with torch.no_grad():
                    if mode == 'encoder':
                        hidden = model(**inputs).last_hidden_state
                    else:
                        hidden = model(**inputs, output_hidden_states=True).hidden_states[-1]
                    # First (only) batch item; average over its token positions.
                    embedding = hidden[0].mean(dim=0).cpu().numpy()

                embedding_matrix[i] = embedding
                word_embeddings[i] = (word, embedding)

            except Exception as e:
                # Best effort: report the word and carry on with the rest.
                print(f"Error processing word '{word}': {str(e)}")
                continue

    finally:
        # Release cached GPU memory once, not after every single word.
        torch.cuda.empty_cache()

        end_time = time.time()
        elapsed_time = end_time - start_time
        elapsed_time_formatted = str(timedelta(seconds=int(elapsed_time)))

        print(f"\nTotal waktu eksekusi: {elapsed_time_formatted}")
        print(f"Total waktu dalam detik: {elapsed_time:.2f} detik")

    return embedding_matrix, word_embeddings

In [7]:
# Read the cleaned dataset spreadsheet into a DataFrame and display it.
dataset_path = '/home/basilmusyaffa19/Skripsi Basil/Dataset/FIX/clean_UCI_22112024.xlsx'
df = pd.read_excel(dataset_path, engine='openpyxl')
df

Unnamed: 0,label,teks
0,1,go jurong point crazy available bugis great wo...
1,1,ok lar joking wif oni
2,1,free entry wkly comp win fa cup final tkts may...
3,1,dun say early hor already say
4,1,nah think go usf life around though
...,...,...
4994,1,time tried contact pound prize claim easy call...
4995,1,going esplanade fr home
4996,1,pity mood suggestion
4997,1,guy bitching acted like interested buying some...


In [8]:
print("Jumlah data sebelum:", len(df))

# Treat empty strings as missing, drop every row with any missing value,
# then make sure no row is left without a value in 'teks'.
df = (
    df.replace('', np.nan)
      .dropna()
      .dropna(subset=['teks'])
)

# Discard rows whose 'teks' cell holds a float instead of text.
teks_is_float = df['teks'].apply(lambda value: isinstance(value, float))
df = df[~teks_is_float]

# Keep only the first occurrence of each distinct 'teks' value.
df = df.drop_duplicates(subset=['teks'], keep='first')

print("Jumlah data setelah:", len(df))

Jumlah data sebelum: 4999
Jumlah data setelah: 4998


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source for the checkpoint name so tokenizer and model always match.
gpt_checkpoint = "gpt2-medium"

tokenizer_gpt = AutoTokenizer.from_pretrained(gpt_checkpoint)
# Load the causal-LM weights and move them to the selected device.
model_gpt = AutoModelForCausalLM.from_pretrained(gpt_checkpoint)
model_gpt = model_gpt.to(device)



In [10]:
# Measure how much of the Keras vocabulary the GPT-2 tokenizer can represent.
vocab_list = list(keras_tokenizer.word_index.keys())
oov_words = check_coverage(vocab_list, tokenizer_gpt, is_bert=False)

# Show a few of the words that were not recognized at all, together with the
# tokens the same tokenizer produced for them. This previously referenced an
# undefined `bert_tokenizer`, which raises NameError as soon as the list of
# unknown words is non-empty.
print("\nContoh kata yang tidak dikenali sama sekali:")
for word in oov_words[:10]:
    tokens = tokenizer_gpt.tokenize(word)
    print(f"'{word}' -> {tokens}")

Checking tokenizer coverage: 100%|██████████| 6587/6587 [00:00<00:00, 21478.72it/s]


Tokenizer Coverage Analysis:
Words recognized (whole/subwords): 6587/6587 (100.00%)
Words completely unknown: 0/6587 (0.00%)

Contoh kata yang tidak dikenali sama sekali:





In [11]:
# Print the loaded checkpoint's configuration (hidden size, layer count, ...).
gpt_config = model_gpt.config
print(gpt_config)

GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.45.0",
  "use_cache": true,
  "vocab_size": 50257
}



In [12]:
# Width of each row comes from the model's hidden size (config.n_embd).
embedding_dim = model_gpt.config.n_embd
# One zero-initialised row per vocabulary slot; rows that are never written
# stay all-zero (presumably row 0 is the padding index - TODO confirm).
matrix_shape = (vocab_size, embedding_dim)
embedding_matrix = np.zeros(matrix_shape)

In [13]:
# Sanity-check the embedding width and the allocated matrix shape.
print(f"Dimensi embedding: {embedding_dim}")
print(f"Dimensi embedding matrix: {embedding_matrix.shape}")

Dimensi embedding: 1024
Dimensi embedding matrix: (6588, 1024)


In [14]:
# Fill the matrix using GPT-2's last hidden layer ('decoder' mode).
# The returned matrix is the same array object as `embedding_matrix`,
# which the function fills in place.
gpt_embedding_result = initialize_embedding_matrix(
    keras_tokenizer=keras_tokenizer,
    tokenizer=tokenizer_gpt,
    model=model_gpt,
    embedding_matrix=embedding_matrix,
    model_type='decoder',
)
embedding_matrix_gpt, word_embeddings_gpt = gpt_embedding_result

Processing words: 100%|██████████| 6587/6587 [02:26<00:00, 44.86it/s]


Total waktu eksekusi: 0:02:26
Total waktu dalam detik: 146.84 detik





In [15]:
# Display the filled matrix; row 0 is expected to remain all zeros because
# no word in word_index was written to it.
embedding_matrix_gpt

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.0522459 ,  0.04767202, -0.13080455, ..., -0.00874161,
         0.05193056, -0.27072704],
       [-0.2418468 ,  0.08761762, -0.09614614, ..., -0.05747351,
         0.17983393, -0.43582755],
       ...,
       [-0.08425059,  0.23326066,  0.33585617, ...,  0.02881989,
         0.02648362, -0.11299706],
       [ 0.0385328 ,  0.38246351, -0.03597653, ..., -0.23896097,
        -0.23418753, -0.22373393],
       [-0.15060945,  0.32495287,  0.20788555, ..., -0.17197423,
        -0.39646959, -0.00106618]])

In [16]:
# Preview the first five (index -> (word, vector)) entries.
preview_entries = list(word_embeddings_gpt.items())[:5]
for idx, entry in preview_entries:
    word, vector = entry
    # Make sure we slice a NumPy array even if the stored vector is not one.
    if isinstance(vector, np.ndarray):
        vector_np = vector
    else:
        vector_np = np.array(vector)
    print(f"Index: {idx}")
    print(f"Word: {word}")
    print(f"Vector (first 10 values): {vector_np[:10]}")
    print("---")

Index: 1
Word: call
Vector (first 10 values): [-0.0522459   0.04767202 -0.13080455  0.30200303 -0.18411675 -0.02986437
 -0.09928483  0.32701287 -0.16883808 -1.6366733 ]
---
Index: 2
Word: get
Vector (first 10 values): [-0.2418468   0.08761762 -0.09614614  0.17855516  0.28875348 -0.03287686
  0.461701    0.21888737  0.21284316 -1.547617  ]
---
Index: 3
Word: ur
Vector (first 10 values): [ 0.12136213 -0.37856215 -0.49205014  0.16591899 -0.05338626 -0.05610901
  0.17835748 -0.11285753  0.04102818 -1.5577227 ]
---
Index: 4
Word: go
Vector (first 10 values): [ 0.20680365 -0.12432213 -0.4014406   0.4502381  -0.2625269   0.0282814
  0.0655566   0.6958488   0.20288402 -1.590834  ]
---
Index: 5
Word: ok
Vector (first 10 values): [ 0.49521017 -0.21528961 -0.49980488  0.36692864 -0.53749883  0.14089376
 -0.3415411   0.5002537   0.07695413 -1.4523376 ]
---


In [17]:
# Destination of the saved GPT-2 embedding matrix (.npy format).
path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Inggris/Hasil Embedding/21 Nov/embedding_matrix_gpt2_1024D_21112024.npy'
np.save(file=path, arr=embedding_matrix_gpt)

In [18]:
import os

# Measure the file that was just written by reusing `path` from the save
# step, rather than repeating the literal (which could silently drift from
# the location actually saved to).
file_size = os.path.getsize(path)
# Convert bytes to mebibytes for display.
print(f"Ukuran file: {file_size/1024/1024:.2f} MB")

Ukuran file: 51.47 MB
