In [1]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import nltk
import pickle
nltk.download('punkt')
from tqdm import tqdm

#from nltk.tokenize import word_tokenize
#from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import time
from datetime import timedelta

2024-12-12 12:06:11.664602: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-12 12:06:11.682581: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-12 12:06:11.688078: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 12:06:11.702282: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

In [2]:
# Record the NumPy and PyTorch versions used for this run.
for library in (np, torch):
    print(library.__version__)

2.0.1
1.13.0+cu117


In [3]:
# Report what PyTorch can see: CUDA availability, the CUDA version it was
# built against, and the name of every visible GPU.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA version: {torch.version.cuda}")

num_gpus = torch.cuda.device_count()
print(f"Number of GPUs: {num_gpus}")

# range(0) yields nothing, so the loop is already a no-op without GPUs.
for gpu_index in range(num_gpus):
    print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

CUDA available: True
CUDA version: 11.7
Number of GPUs: 1
GPU 0: NVIDIA A100-SXM4-80GB


In [4]:
# Use the GPU when PyTorch can reach one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Load the cleaned SMS dataset from its Excel export.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks
# like a row index written out by an earlier export - confirm whether it should
# be read with index_col=0 or dropped.
dataset_path = '/home/basilmusyaffa19/Skripsi Basil/Dataset/FIX/clean_SMS_22112024.xlsx'
df = pd.read_excel(dataset_path, engine='openpyxl')
df

Unnamed: 0,teks,label
0,promo beli paket flash mulai aplikasi my telko...,1
1,gb hari rp ribu spesial buat kamu pilih aktif ...,1
2,plg hormat sisa kuota flash download aplikasi ...,1
3,mohon hormat sisa kuota flash download aplikas...,1
4,hari rp ribu khusus buat kamu pilih aktif seka...,1
...,...,...
1138,yooo sama sama aku umum kelompok kelas,0
1139,pernah tulis cadar belum pikir warna jeans,0
1140,bu mau kirim,0
1141,berangkat pagi mau tunai transfer,0


In [6]:
def load_glove_embeddings(file_path):
    """Load GloVe-style text embeddings from ``file_path``.

    Every non-blank line is expected to contain a token followed by its
    whitespace-separated vector components.

    Args:
        file_path: Path to a UTF-8 encoded embedding text file.

    Returns:
        A tuple ``(embeddings_dict, words_set)``: ``embeddings_dict`` maps each
        token to a ``float32`` NumPy vector, and ``words_set`` is the set of all
        tokens (handy for fast membership checks).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_dict = {}

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # carry no token; indexing values[0] would raise IndexError.
                continue
            word = values[0]  # the token is the first field
            # the remaining fields are the embedding components
            embeddings_dict[word] = np.asarray(values[1:], dtype='float32')

    # The set of tokens is exactly the set of dictionary keys.
    return embeddings_dict, set(embeddings_dict)

In [7]:
# Parse the 50-dimensional Indonesian Wikipedia GloVe file into a lookup dict
# plus the set of words it covers.
glove_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/glove_50dim_wiki.id.case.text.txt'
glove, words_set = load_glove_embeddings(file_path=glove_path)

In [8]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load tokenizer pickles that come from a trusted source.
tokenizer_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/indo_keras_tokenizer.pickle'
with open(tokenizer_path, 'rb') as tokenizer_file:
    keras_tokenizer = pickle.load(tokenizer_file)

# One slot more than the number of known words, so index 0 stays addressable
# (presumably reserved for padding - confirm against how the tokenizer was fit).
vocab_size = 1 + len(keras_tokenizer.word_index)
print('Vocab size:', vocab_size)

Vocab size: 2521


In [9]:
print("Jumlah data sebelum:", len(df))

# Cleaning pipeline, applied in the same order as before:
#   1. turn empty strings into NaN and drop every row that has any NaN
#   2. drop rows whose 'teks' is NaN
#   3. drop rows whose 'teks' is a float rather than text
#   4. keep only the first occurrence of each duplicated 'teks'
df = (
    df.replace('', np.nan)
    .dropna()
    .dropna(subset=['teks'])
    .loc[lambda frame: ~frame['teks'].apply(lambda x: isinstance(x, float))]
    .drop_duplicates(subset=['teks'], keep='first')
)

print("Jumlah data setelah:", len(df))

Jumlah data sebelum: 1143
Jumlah data setelah: 1138


In [10]:
def check_coverage(vocab_list, words_set):
    """Report how much of a vocabulary is covered by a set of known words.

    Prints the fraction of ``vocab_list`` found in ``words_set`` and the number
    of missing words.

    Args:
        vocab_list: Sized iterable of vocabulary words to check.
        words_set: Collection of words that have an embedding. A list or tuple
            is converted to a set so each lookup is O(1).

    Returns:
        list: The out-of-vocabulary words, in the order they appear in
        ``vocab_list``.
    """
    if isinstance(words_set, (list, tuple)):
        # Membership tests on a sequence are linear; a set makes them constant.
        words_set = set(words_set)

    oov = []
    for word in tqdm(vocab_list):
        if word not in words_set:
            oov.append(word)

    total_words = len(vocab_list)
    num_words_found = total_words - len(oov)
    # An empty vocabulary has no coverage to speak of; report 0% rather than
    # dividing by zero.
    coverage = num_words_found / total_words if total_words else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(coverage))
    print('Number of words not found: {}'.format(len(oov)))

    return oov

In [11]:
# Iterating a dict yields its keys, i.e. every word the tokenizer knows.
vocab_list = list(keras_tokenizer.word_index)
oov = check_coverage(vocab_list, words_set)

# Show a small sample of the words that have no GloVe vector.
print("\nBeberapa kata yang tidak ditemukan:")
for missing_word in oov[:10]:
    print(f"'{missing_word}'")

# Export the full OOV list to Excel.
oov_report_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/kata_oov_indo.xlsx'
df_oov = pd.DataFrame(oov, columns=['Kata OOV'])
df_oov.to_excel(oov_report_path, index=False)

100%|██████████| 2520/2520 [00:00<00:00, 551306.39it/s]

Found embeddings for 79.37% of vocab
Number of words not found: 520

Beberapa kata yang tidak ditemukan:
'langgan'
'nelpon'
'tsel'
'silah'
'tcash'
'simcard'
'ooredoo'
'yaris'
'dakota'
'bni'





In [12]:
# Allocate an all-zero matrix with one row per tokenizer index and one column
# per embedding dimension; the rows are filled in afterwards.
embedding_dim = 50
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))
embedding_matrix.shape

(2521, 50)

In [13]:
def initialize_embedding_matrix(tokenizer, oov_words, embedding_matrix, model, embedding_dim=50, seed=None):
    """Fill ``embedding_matrix`` with one vector per tokenizer word.

    Words listed in ``oov_words`` - and words the pretrained ``model`` turns out
    not to contain - receive a random vector rescaled to an L2 norm of
    ``sqrt(embedding_dim)``. Every other word receives ``model[word]``.

    Args:
        tokenizer: Object exposing ``word_index``, a mapping of word -> row index.
        oov_words: Iterable of words that have no pretrained vector (may be
            ``None`` for "no known OOV words").
        embedding_matrix: 2-D array that is written **in place**, row ``i`` for
            the word whose index is ``i``. Rows not referenced by
            ``word_index`` are left untouched.
        model: Mapping-like object returning a vector for ``model[word]`` and
            raising ``KeyError`` for unknown words.
        embedding_dim: Length of the random vectors generated for OOV words.
        seed: Optional seed for a dedicated NumPy ``Generator`` so the random
            OOV vectors are reproducible. With ``None`` (default) the global
            ``np.random`` state is used, as before.

    Returns:
        tuple: ``(embedding_matrix, word_embeddings)`` where ``embedding_matrix``
        is the same array object that was passed in and ``word_embeddings`` maps
        index -> ``(word, vector)``.
    """
    start_time = time.perf_counter()
    word_embeddings = {}

    # A set makes each "is this word OOV?" check O(1) instead of scanning a list.
    oov_lookup = set(oov_words) if oov_words is not None else set()

    # Seeded generator when requested, legacy global state otherwise.
    draw_uniform = np.random.uniform if seed is None else np.random.default_rng(seed).uniform

    def _random_vector():
        # The rescaling fixes the norm at sqrt(embedding_dim); the uniform draw
        # therefore only determines the direction of the vector.
        vec = draw_uniform(-0.25, 0.25, embedding_dim)
        return vec / np.linalg.norm(vec) * np.sqrt(embedding_dim)

    for word, i in tqdm(tokenizer.word_index.items(), desc="Creating embedding matrix"):
        if word in oov_lookup:
            embedding_vector = _random_vector()
        else:
            try:
                embedding_vector = model[word]
            except KeyError:
                # The OOV list was incomplete or stale for this word: treat it
                # as OOV instead of aborting the whole build.
                embedding_vector = _random_vector()
        embedding_matrix[i] = embedding_vector
        word_embeddings[i] = (word, embedding_vector)

    elapsed_time = time.perf_counter() - start_time

    print(f"\nTotal waktu pembuatan embedding matrix: {elapsed_time}s")

    return embedding_matrix, word_embeddings

In [14]:
# Build the GloVe-backed matrix. The function writes into the array it is
# given and returns that same array, so `embedding_matrix_glove` and
# `embedding_matrix` refer to one object afterwards.
embedding_matrix_glove, word_embeddings_glove = initialize_embedding_matrix(
    tokenizer=keras_tokenizer,
    oov_words=oov,
    embedding_matrix=embedding_matrix,
    model=glove,
)

Creating embedding matrix: 100%|██████████| 2520/2520 [00:00<00:00, 91639.83it/s]


Total waktu pembuatan embedding matrix: 0.02921009063720703s





In [17]:
# Display the filled embedding matrix (NumPy abbreviates large arrays in the repr).
embedding_matrix_glove

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.059917  , -1.13609898,  0.55175501, ...,  0.81516302,
         2.64219594,  1.23838198],
       [-0.51291502, -0.117982  ,  0.019041  , ..., -0.15569399,
         0.212929  , -0.38389   ],
       ...,
       [-0.30222201,  0.47868001,  0.423637  , ..., -0.77613503,
         0.97546601,  0.42256501],
       [ 0.72796738, -0.61885576,  1.12429665, ...,  1.49172576,
        -0.58190233, -1.56130441],
       [ 0.45726499, -0.70696402, -0.124477  , ...,  0.52218401,
        -0.29743001, -0.57734901]])

In [18]:
# Preview the first five (index, word, vector) entries of the lookup table.
preview_items = list(word_embeddings_glove.items())[:5]
for idx, (word, vector) in preview_items:
    # np.asarray returns an ndarray unchanged and converts anything else.
    vector_np = np.asarray(vector)
    print(
        f"Index: {idx}",
        f"Word: {word}",
        f"Vector (first 10 values): {vector_np[:10]}",
        "---",
        sep="\n",
    )

Index: 1
Word: dapat
Vector (first 10 values): [ 0.059917 -1.136099  0.551755 -0.572297  0.11707   0.925134  0.410752
  0.32349  -0.679779  1.096869]
---
Index: 2
Word: info
Vector (first 10 values): [-0.512915 -0.117982  0.019041 -0.644253  0.023331 -1.118993  0.236992
 -1.458259 -0.473929  0.402766]
---
Index: 3
Word: rp
Vector (first 10 values): [ 0.306585  0.516463  0.768335 -0.13336  -0.433293  0.572058 -0.14886
 -0.548269 -0.513062 -0.245573]
---
Index: 4
Word: pin
Vector (first 10 values): [ 0.026798 -0.651772  0.408602 -0.512624 -0.181467  0.222163  0.619997
 -1.22604   0.331878  0.049833]
---
Index: 5
Word: sms
Vector (first 10 values): [-1.345279 -0.931068  0.340035 -0.479427 -0.135056  0.04676   1.492784
 -0.725813 -0.659199  0.020117]
---


In [19]:
# Look up a single word in the index -> (word, vector) table and print its entry.
target_word = "nelpon"

word_found = False
for idx, (word, vector) in word_embeddings_glove.items():
    if word != target_word:
        continue
    word_found = True
    print(f"Index: {idx}")
    print(f"Word: {word}")
    print(f"Vector: {vector}")
    break
else:
    # Reached only when the loop finished without a match.
    print(f"Kata '{target_word}' tidak ditemukan")

Index: 66
Word: nelpon
Vector: [-0.7929115  -0.06483223  1.23008954 -1.12152042  0.08474467  0.43574067
  1.52861666 -0.53303626 -1.41711722 -1.23748557  0.38321144  0.98999276
  0.84256983 -0.57486092 -1.37248084  1.73947026 -0.18415939  0.51975222
  1.61336982 -0.17190405  0.72490277 -0.71027314 -0.38127465 -1.69477436
  1.39655164 -1.30033129 -1.51753616 -0.67207073 -1.66574078  1.59974042
 -0.59118214 -0.43634364  1.75311939  0.7593595   1.35436137 -1.02980878
 -1.48101734  0.5230628   1.30426909 -0.38585012  0.61321928 -0.19976665
  0.48616177  0.43719156  1.16700974 -0.24616884 -0.42664793  0.38214427
  0.06694159  0.66921197]


In [23]:
# Persist the finished embedding matrix in NumPy's binary .npy format.
path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/Hasil Embedding/21 Nov/embedding_matrix_gloveWiki_50D_21112024.npy'
np.save(file=path, arr=embedding_matrix_glove)

In [24]:
import os

# Report the on-disk size of the saved embedding matrix (bytes / 1024 / 1024).
saved_matrix_path = '/home/basilmusyaffa19/Skripsi Basil/Embedding Matrix/Dataset Indo/Hasil Embedding/21 Nov/embedding_matrix_gloveWiki_50D_21112024.npy'
file_size = os.path.getsize(saved_matrix_path)
print(f"Ukuran file: {file_size/1024/1024:.2f} MB")

Ukuran file: 0.96 MB
