In [None]:
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def load_data(data_dir):
    train_file = os.path.join(data_dir, "train.json")
    val_file = os.path.join(data_dir, "val.json")
    test_file = os.path.join(data_dir, "test.json")

    with open(train_file, "r") as f:
        train_data = json.load(f)
    with open(val_file, "r") as f:
        val_data = json.load(f)
    with open(test_file, "r") as f:
        test_data = json.load(f)

    return train_data, val_data, test_data


data_dir = "pre-processed"  # Change this if necessary
train_data, val_data, test_data = load_data(data_dir)

print(f"Loaded {len(train_data)} training relations, {len(val_data)} validation, {len(test_data)} test.")

Loaded 81 training relations, 81 validation, 81 test.


In [3]:
def preprocess_data(data):
    texts, labels = [], []
    label_dict = {}
    label_index = 0

    for relation, samples in data.items():
        if relation not in label_dict:
            label_dict[relation] = label_index
            label_index += 1

        for sample in samples:
            # Convert token list to sentence
            texts.append(" ".join(sample["tokens"]))
            labels.append(label_dict[relation])

    return texts, np.array(labels), label_dict


train_texts, train_labels, label_dict = preprocess_data(train_data)
val_texts, val_labels, _ = preprocess_data(val_data)
test_texts, test_labels, _ = preprocess_data(test_data)

print(f"Sample text: {train_texts[0]}")
print(f"Label dictionary: {label_dict}")

Sample text: Employed by Australian National Airways ( ANA ) after leaving the Air Force , Lukis become airfield manager at Essendon , Melbourne .
Label dictionary: {'P931': 0, 'P4552': 1, 'P140': 2, 'P1923': 3, 'P150': 4, 'P6': 5, 'P27': 6, 'P449': 7, 'P1435': 8, 'P175': 9, 'P1344': 10, 'P39': 11, 'P527': 12, 'P740': 13, 'P706': 14, 'P84': 15, 'P495': 16, 'P123': 17, 'P57': 18, 'P22': 19, 'P178': 20, 'P241': 21, 'P403': 22, 'P1411': 23, 'P135': 24, 'P991': 25, 'P156': 26, 'P176': 27, 'P31': 28, 'P1877': 29, 'P102': 30, 'P1408': 31, 'P159': 32, 'P3373': 33, 'P1303': 34, 'P17': 35, 'P106': 36, 'P551': 37, 'P937': 38, 'P355': 39, 'P710': 40, 'P137': 41, 'P674': 42, 'P466': 43, 'P136': 44, 'P306': 45, 'P127': 46, 'P400': 47, 'P974': 48, 'P1346': 49, 'P460': 50, 'P86': 51, 'P118': 52, 'P264': 53, 'P750': 54, 'P58': 55, 'P3450': 56, 'P105': 57, 'P276': 58, 'P101': 59, 'P407': 60, 'P1001': 61, 'P800': 62, 'P131': 63, 'P177': 64, 'P364': 65, 'P2094': 66, 'P361': 67, 'P641': 68, 'P59': 69, 'P4

In [4]:
MAX_VOCAB_SIZE = 10000
MAX_SEQ_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = pad_sequences(tokenizer.texts_to_sequences(
    train_texts), maxlen=MAX_SEQ_LENGTH, padding="post")
val_sequences = pad_sequences(tokenizer.texts_to_sequences(
    val_texts), maxlen=MAX_SEQ_LENGTH, padding="post")
test_sequences = pad_sequences(tokenizer.texts_to_sequences(
    test_texts), maxlen=MAX_SEQ_LENGTH, padding="post")

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Sample tokenized sequence: {train_sequences[0]}")

Vocabulary size: 84148
Sample tokenized sequence: [3347   10  395   33 3146 5381   31 1916    2  191  356    1  680 3459
  776   16    1 1117    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [5]:
def build_cnn_model():
    model = keras.Sequential([
        Input(shape=(MAX_SEQ_LENGTH,)),
        Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                  input_length=MAX_SEQ_LENGTH),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # Multi-class classification
        Dense(len(label_dict), activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


cnn_model = build_cnn_model()
cnn_model.summary()



In [6]:
cnn_model.fit(
    train_sequences, train_labels,
    validation_data=(val_sequences, val_labels),
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.1274 - loss: 3.8693 - val_accuracy: 0.4165 - val_loss: 2.2787
Epoch 2/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - accuracy: 0.3841 - loss: 2.3308 - val_accuracy: 0.4724 - val_loss: 1.9837
Epoch 3/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.4689 - loss: 1.9287 - val_accuracy: 0.4950 - val_loss: 1.9053
Epoch 4/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - accuracy: 0.5249 - loss: 1.6706 - val_accuracy: 0.4847 - val_loss: 1.9103
Epoch 5/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.5595 - loss: 1.5047 - val_accuracy: 0.4857 - val_loss: 1.9412
Epoch 6/10
[1m1503/1503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.6053 - loss: 1.3232 - val_accuracy: 0.4844 - val_loss: 2.0325
Epoch 7

<keras.src.callbacks.history.History at 0x3593cf2d0>

In [7]:
test_loss, test_acc = cnn_model.evaluate(test_sequences, test_labels)
print(f"Test Accuracy: {test_acc:.4f}")

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5157 - loss: 2.5212
Test Accuracy: 0.4736


In [8]:
cnn_model.save("cnn_relation_extraction.h5")

with open("tokenizer.json", "w") as f:
    json.dump(tokenizer.word_index, f)

print("Model and tokenizer saved successfully.")



Model and tokenizer saved successfully.
