# **Ujian Akhir Semester - Pengolahan Bahasa Alami**

## Anggota Kelompok:
- **Kevin Philips Tanamas** (220711789)  
- **Richard Angelico** (220711747)
- **Anthony Alvin Nathaniel** (220711773)
- **Nicholas Raymond Thosimaru** (220712111)
- **Maria** (220711969)

Library Import

In [2]:
import os
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

import gc
import fasttext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.metrics import confusion_matrix
from seqeval.scheme import BILOU
from seqeval.metrics import classification_report, f1_score

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Bidirectional, LSTM, GRU, Dense, TimeDistributed, Dropout

tqdm.pandas()

2025-06-23 20:14:48.617717: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-23 20:14:49.838740: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750684490.213756  316832 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750684490.297103  316832 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-23 20:14:51.164142: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Data Loading Function

In [3]:
# Force CPU usage for memory-intensive operations
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Use CPU for data preparation

# Data loading function
def load_data(filename):
    """Read a whitespace-separated ``word tag`` file into sentences and labels.

    Every non-blank line is split on whitespace. Lines that yield exactly two
    fields are taken as ``word tag``; lines with any other number of fields
    are skipped. A blank (or whitespace-only) line closes the current sentence.

    Args:
        filename: Path to a UTF-8 encoded text file.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists: ``sentences[i]`` is
        the list of words of sentence ``i`` and ``labels[i]`` the list of tags
        for those words.
    """
    sentences, labels = [], []
    sentence, label = [], []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line marks a sentence boundary; ignore repeated blanks.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            elif len(parts) == 2:
                word, tag = parts
                sentence.append(word)
                label.append(tag)
    # Flush the last sentence when the file does not end with a blank line;
    # without this the final sentence would be silently dropped.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels


def simple_rnn_model(max_len_seq, emb_size, rnn_units, num_tags, lr=0.001):
    """
    Build and compile a unidirectional SimpleRNN sequence tagger.

    Layer stack: Dense(128, relu) projection -> SimpleRNN -> Dropout(0.2) ->
    per-timestep softmax over the tag set.

    Args:
        max_len_seq: Number of timesteps in each input sequence
        emb_size: Size of each input token vector
        rnn_units: Hidden size of the SimpleRNN layer
        num_tags: Number of classes predicted at every timestep
        lr: Learning rate passed to the Adam optimizer

    Returns:
        A compiled Keras Model mapping (batch, max_len_seq, emb_size) float32
        inputs to (batch, max_len_seq, num_tags) tag probabilities.
    """
    token_vectors = Input(shape=(max_len_seq, emb_size), dtype='float32')

    # Project every token vector down to 128 features before the recurrence
    projection = Dense(128, activation='relu')
    hidden = projection(token_vectors)

    # return_sequences=True keeps one hidden state per timestep for tagging
    recurrent = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True)
    hidden = recurrent(hidden)

    # Regularize the recurrent outputs
    regularizer = Dropout(0.2)
    hidden = regularizer(hidden)

    # Same softmax classifier applied independently at every timestep
    tag_classifier = Dense(num_tags, activation='softmax')
    tag_probs = TimeDistributed(tag_classifier)(hidden)

    model = Model(inputs=token_vectors, outputs=tag_probs)

    # Sparse loss: targets are expected as integer tag ids, not one-hot rows
    optimizer = Adam(learning_rate=lr)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [4]:
def bilstm_model(max_len_seq, emb_size, rnn_units, num_tags, lr=0.001):
    """
    Build and compile a bidirectional LSTM sequence tagger.

    Layer stack: Dense(128, relu) projection -> Bidirectional(LSTM) ->
    Dropout(0.3) -> per-timestep softmax over the tag set.

    Args:
        max_len_seq: Number of timesteps in each input sequence
        emb_size: Size of each input token vector
        rnn_units: Hidden size of the LSTM in each direction
        num_tags: Number of classes predicted at every timestep
        lr: Learning rate passed to the Adam optimizer

    Returns:
        A compiled Keras Model mapping (batch, max_len_seq, emb_size) float32
        inputs to (batch, max_len_seq, num_tags) tag probabilities.
    """
    token_vectors = Input(shape=(max_len_seq, emb_size), dtype='float32')

    # Project every token vector down to 128 features before the recurrence
    projection = Dense(128, activation='relu')
    hidden = projection(token_vectors)

    # One LSTM reads left-to-right; Bidirectional adds a right-to-left copy
    directional_lstm = LSTM(
        rnn_units,
        return_sequences=True,   # keep one hidden state per timestep
        recurrent_dropout=0.0,   # no dropout on the recurrent connections
        # NOTE(review): `implementation` is a legacy Keras option; confirm the
        # installed Keras version still honors it.
        implementation=2,
    )
    encoder = Bidirectional(directional_lstm)
    hidden = encoder(hidden)

    # Regularize the encoder outputs
    regularizer = Dropout(0.3)
    hidden = regularizer(hidden)

    # Same softmax classifier applied independently at every timestep
    tag_classifier = Dense(num_tags, activation='softmax')
    tag_probs = TimeDistributed(tag_classifier)(hidden)

    model = Model(inputs=token_vectors, outputs=tag_probs)

    # Sparse loss: targets are expected as integer tag ids, not one-hot rows
    optimizer = Adam(learning_rate=lr)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


def bigru_model(max_len_seq, emb_size, rnn_units, num_tags, lr=0.001):
    """
    Build and compile a bidirectional GRU sequence tagger.

    Layer stack: Dense(128, relu) projection -> Bidirectional(GRU) ->
    Dropout(0.2) -> per-timestep softmax over the tag set.

    Args:
        max_len_seq: Number of timesteps in each input sequence
        emb_size: Size of each input token vector
        rnn_units: Hidden size of the GRU in each direction
        num_tags: Number of classes predicted at every timestep
        lr: Learning rate passed to the Adam optimizer

    Returns:
        A compiled Keras Model mapping (batch, max_len_seq, emb_size) float32
        inputs to (batch, max_len_seq, num_tags) tag probabilities.
    """
    token_vectors = Input(shape=(max_len_seq, emb_size), dtype='float32')

    # Project every token vector down to 128 features before the recurrence
    projection = Dense(128, activation='relu')
    hidden = projection(token_vectors)

    # One GRU reads left-to-right; Bidirectional adds a right-to-left copy
    directional_gru = GRU(
        rnn_units,
        return_sequences=True,   # keep one hidden state per timestep
        recurrent_dropout=0.0,   # no dropout on the recurrent connections
        reset_after=True,        # apply the reset gate after the recurrent matmul
    )
    encoder = Bidirectional(directional_gru)
    hidden = encoder(hidden)

    # Regularize the encoder outputs
    regularizer = Dropout(0.2)
    hidden = regularizer(hidden)

    # Same softmax classifier applied independently at every timestep
    tag_classifier = Dense(num_tags, activation='softmax')
    tag_probs = TimeDistributed(tag_classifier)(hidden)

    model = Model(inputs=token_vectors, outputs=tag_probs)

    # Sparse loss: targets are expected as integer tag ids, not one-hot rows
    optimizer = Adam(learning_rate=lr)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


Load Data

In [None]:
# Locations of the three dataset splits
train_file, valid_file, test_file = "./train-70.txt", "./dev-30.txt", "./test-data.txt"

# Parse each split into parallel (sentences, labels) lists
train_sentences, train_labels = load_data(train_file)
valid_sentences, valid_labels = load_data(valid_file)
test_sentences, test_labels = load_data(test_file)

# One row per sentence: 'tokens' holds the words, 'tags' the matching labels
train_df = pd.DataFrame(dict(tokens=train_sentences, tags=train_labels))
valid_df = pd.DataFrame(dict(tokens=valid_sentences, tags=valid_labels))
test_df = pd.DataFrame(dict(tokens=test_sentences, tags=test_labels))

# Report how many sentences each split contains
print(f"Train samples: {len(train_df)}, Validation samples: {len(valid_df)}, Test samples: {len(test_df)}")


Data Checking (Lihat Semua Tags)

In [None]:
# Flatten the per-sentence tag lists of the training split into a single list
all_tags_flat = [
    label
    for sentence_tags in train_labels
    for label in sentence_tags
]

# Total number of tag occurrences across all training sentences
print(f"Total tags in training: {len(all_tags_flat)}")

# Frequency of each tag, most frequent first
print("Tag distribution in training set:")
tag_counts = pd.Series(all_tags_flat).value_counts()
print(tag_counts.iloc[:10])  # only the 10 most frequent tags

Load FastText Model

In [None]:
import os

# Check for the FastText model and download it if it is not available yet
if not os.path.exists('./cc.id.300.bin'):
    print("FastText model not found. Downloading model (this may take a while)...")
    for cmd in (
        # -c resumes a partially downloaded archive instead of starting over
        "wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz",
        "gunzip cc.id.300.bin.gz",
    ):
        status = os.system(cmd)
        # Stop here on failure rather than reporting success and failing
        # later inside fasttext.load_model with a less helpful error.
        if status != 0:
            raise RuntimeError(f"Command failed (status {status}): {cmd}")
    print("Download complete.")

# Load FastText model
print("Loading FastText model...")
ft = fasttext.load_model('./cc.id.300.bin')
print("FastText model loaded successfully.")

# Set embedding size and maximum sequence length
emb_size = 300
max_seq_len = 128

# The embedding matrices built later are shaped with emb_size, so it must
# match the dimension of the vectors the loaded model produces.
assert ft.get_dimension() == emb_size, (
    f"FastText dimension {ft.get_dimension()} does not match emb_size {emb_size}"
)


Word Embeddings Function

In [None]:
# Function that turns a token list into a fixed-size word-embedding matrix
def get_tok_emb(tokens):
    """Embed one tokenized sentence as a ``(max_seq_len, emb_size)`` matrix.

    Row ``i`` holds the FastText vector of ``tokens[i]``. Sentences longer
    than ``max_seq_len`` are truncated; rows past the end of a shorter
    sentence stay all-zero. Relies on the module-level ``ft``, ``max_seq_len``
    and ``emb_size``.

    Args:
        tokens: Sequence of word strings.

    Returns:
        A float32 ``numpy.ndarray`` of shape ``(max_seq_len, emb_size)``.
    """
    # float32 matches the dtype of the models' Input layers and takes half
    # the memory of NumPy's float64 default; FastText vectors are
    # single-precision, so no information is lost.
    emb = np.zeros((max_seq_len, emb_size), dtype=np.float32)
    for i, word in enumerate(tokens[:max_seq_len]):
        emb[i] = ft.get_word_vector(word)
    return emb

# Build the embedding matrix for every sentence in each split and store it
# in a new 'tokens_embedding' column (progress_apply shows a tqdm bar)
print("Creating word embeddings...")
for split_df in (train_df, valid_df, test_df):
    split_df['tokens_embedding'] = split_df['tokens'].progress_apply(get_tok_emb)