In [1]:
# !pip install numpy tensorflow keras-tuner

In [4]:
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt  # For hyperparameter tuning

In [3]:
def load_data(data_dir):
    train_file = os.path.join(data_dir, "train.json")
    val_file = os.path.join(data_dir, "val.json")
    test_file = os.path.join(data_dir, "test.json")

    with open(train_file, "r") as f:
        train_data = json.load(f)
    with open(val_file, "r") as f:
        val_data = json.load(f)
    with open(test_file, "r") as f:
        test_data = json.load(f)

    return train_data, val_data, test_data


data_dir = "../dataset"  # Move out of 'models' and into 'dataset'
train_data, val_data, test_data = load_data(data_dir)

print(f"Loaded {len(train_data)} training relations, {len(val_data)} validation, {len(test_data)} test.")

Loaded 81 training relations, 81 validation, 81 test.


In [4]:
def preprocess_data(data):
    texts, labels = [], []
    label_dict = {}
    label_index = 0

    for relation, samples in data.items():
        if relation not in label_dict:
            label_dict[relation] = label_index
            label_index += 1

        for sample in samples:
            # No need to join, it's already a string
            texts.append(sample["tokens"])
            labels.append(label_dict[relation])

    return texts, np.array(labels), label_dict


train_texts, train_labels, label_dict = preprocess_data(train_data)
val_texts, val_labels, _ = preprocess_data(val_data)
test_texts, test_labels, _ = preprocess_data(test_data)

print(f"Sample text: {train_texts[0]}")
print(f"Label dictionary: {label_dict}")

Sample text: Employed by Australian National Airways (ANA) after leaving the Air Force, Lukis become airfield manager at [E1S] Essendon [E1E], [E2S] Melbourne [E2E].
Label dictionary: {'P931': 0, 'P4552': 1, 'P140': 2, 'P1923': 3, 'P150': 4, 'P6': 5, 'P27': 6, 'P449': 7, 'P1435': 8, 'P175': 9, 'P1344': 10, 'P39': 11, 'P527': 12, 'P740': 13, 'P706': 14, 'P84': 15, 'P495': 16, 'P123': 17, 'P57': 18, 'P22': 19, 'P178': 20, 'P241': 21, 'P403': 22, 'P1411': 23, 'P135': 24, 'P991': 25, 'P156': 26, 'P176': 27, 'P31': 28, 'P1877': 29, 'P102': 30, 'P1408': 31, 'P159': 32, 'P3373': 33, 'P1303': 34, 'P17': 35, 'P106': 36, 'P551': 37, 'P937': 38, 'P355': 39, 'P710': 40, 'P137': 41, 'P674': 42, 'P466': 43, 'P136': 44, 'P306': 45, 'P127': 46, 'P400': 47, 'P974': 48, 'P1346': 49, 'P460': 50, 'P86': 51, 'P118': 52, 'P264': 53, 'P750': 54, 'P58': 55, 'P3450': 56, 'P105': 57, 'P276': 58, 'P101': 59, 'P407': 60, 'P1001': 61, 'P800': 62, 'P131': 63, 'P177': 64, 'P364': 65, 'P2094': 66, 'P361': 67, 'P641':

In [5]:
MAX_VOCAB_SIZE = 10000
MAX_SEQ_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>", filters="")
tokenizer.fit_on_texts(train_texts)

train_sequences = pad_sequences(tokenizer.texts_to_sequences(
    train_texts), maxlen=MAX_SEQ_LENGTH, padding="post")
val_sequences = pad_sequences(tokenizer.texts_to_sequences(
    val_texts), maxlen=MAX_SEQ_LENGTH, padding="post")
test_sequences = pad_sequences(tokenizer.texts_to_sequences(
    test_texts), maxlen=MAX_SEQ_LENGTH, padding="post")

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Sample tokenized sequence: {train_sequences[0]}")

Vocabulary size: 115836
Sample tokenized sequence: [3682   14  347   45 3242    1   42 1862    2  181 6594    1  602 3801
  803   25    4    1   19    3 1283   18    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [6]:
def build_model(hp):
    inputs = Input(shape=(MAX_SEQ_LENGTH,))

    # Embedding Layer
    embedding = Embedding(
        input_dim=MAX_VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQ_LENGTH
    )(inputs)

    # First Conv1D Layer
    conv1 = Conv1D(
        filters=hp.Int("filters_1", min_value=64, max_value=256, step=64),
        kernel_size=hp.Choice("kernel_size_1", values=[3, 5, 7]),
        activation='relu'
    )(embedding)

    conv1 = BatchNormalization()(conv1)
    pooled1 = GlobalMaxPooling1D()(conv1)

    # Second Conv1D Layer
    conv2 = Conv1D(
        filters=hp.Int("filters_2", min_value=64, max_value=256, step=64),
        kernel_size=hp.Choice("kernel_size_2", values=[3, 5, 7]),
        activation='relu'
    )(embedding)  # Use embedding as input again

    conv2 = BatchNormalization()(conv2)
    pooled2 = GlobalMaxPooling1D()(conv2)

    # Concatenate both pooled layers
    merged = tf.keras.layers.Concatenate()([pooled1, pooled2])

    dense = Dense(hp.Int("dense_units", min_value=32,
                  max_value=128, step=32), activation='relu')(merged)

    dropout = Dropout(hp.Float("dropout_rate", min_value=0.2,
                      max_value=0.5, step=0.1))(dense)

    outputs = Dense(len(label_dict), activation='softmax')(dropout)

    model = keras.Model(inputs, outputs)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=keras.optimizers.Adam(learning_rate=hp.Choice(
            "learning_rate", values=[1e-2, 1e-3, 1e-4])),
        metrics=['accuracy']
    )

    return model

In [7]:
# print(f"Train Sequences Shape: {train_sequences.shape}")  # DEBUG
# print(f"Train Labels Shape: {train_labels.shape}")  # DEBUG

# print(f"Validation Sequences Shape: {val_sequences.shape}")  # DEBUG
# print(f"Validation Labels Shape: {val_labels.shape}")  # DEBUG

In [8]:
tuner = kt.RandomSearch(
    build_model,
    objective="val_accuracy",
    max_trials=3,
    executions_per_trial=1,
    directory="models/hyperparameter_tuning",
    project_name="cnn_relation_extraction"
)

tuner.search(train_sequences, train_labels, validation_data=(
    val_sequences, val_labels), epochs=5, batch_size=32)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"Best hyperparameters: {best_hps.values}")

Trial 4 Complete [00h 03m 12s]
val_accuracy: 0.6784763932228088

Best val_accuracy So Far: 0.6784763932228088
Total elapsed time: 00h 15m 34s
Best hyperparameters: {'filters_1': 256, 'kernel_size_1': 5, 'filters_2': 192, 'kernel_size_2': 3, 'dense_units': 128, 'dropout_rate': 0.2, 'learning_rate': 0.001}


In [9]:
best_model = tuner.hypermodel.build(best_hps)

best_model.fit(
    train_sequences, train_labels,
    validation_data=(val_sequences, val_labels),
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 25ms/step - accuracy: 0.2580 - loss: 3.1798 - val_accuracy: 0.6094 - val_loss: 1.3763
Epoch 2/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 26ms/step - accuracy: 0.6347 - loss: 1.2699 - val_accuracy: 0.6570 - val_loss: 1.2381
Epoch 3/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 29ms/step - accuracy: 0.7299 - loss: 0.9043 - val_accuracy: 0.6758 - val_loss: 1.1832
Epoch 4/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 27ms/step - accuracy: 0.7868 - loss: 0.6885 - val_accuracy: 0.6796 - val_loss: 1.2950
Epoch 5/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 26ms/step - accuracy: 0.8160 - loss: 0.5818 - val_accuracy: 0.6662 - val_loss: 1.4156
Epoch 6/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 26ms/step - accuracy: 0.8434 - loss: 0.4922 - val_accuracy: 0.6672 - val_loss: 1.5694
Epoc

<keras.src.callbacks.history.History at 0x36239afd0>

In [10]:
test_loss, test_acc = best_model.evaluate(test_sequences, test_labels)
print(f"Test Accuracy: {test_acc:.4f}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7042 - loss: 1.7563
Test Accuracy: 0.6720


In [12]:
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

best_model.save(os.path.join(models_dir, "cnn_relation_extraction.h5"))

with open(os.path.join(models_dir, "tokenizer.json"), "w") as f:
    json.dump(tokenizer.word_index, f)

print("Best model and tokenizer saved successfully in 'models/' folder.")



Best model and tokenizer saved successfully in 'models/' folder.
