In [4]:
# Dependencies (uncomment to install into the kernel's environment):
# %pip install numpy tensorflow keras-tuner scikit-learn

In [7]:
import os
import json
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt  # For hyperparameter tuning

In [10]:
def load_data(data_dir):
    """Load the train, validation and test splits from JSON files in ``data_dir``.

    The directory must contain ``train.json``, ``val.json`` and ``test.json``.

    Args:
        data_dir: Path of the directory holding the three JSON files.

    Returns:
        A tuple ``(train_data, val_data, test_data)`` with the parsed content
        of each file, in that order.

    Raises:
        FileNotFoundError: If one of the three files is missing.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """

    def _read_split(split_name):
        # Read and parse one "<split_name>.json" file from data_dir.
        path = os.path.join(data_dir, f"{split_name}.json")
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which would break on non-ASCII sentence text.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    return tuple(_read_split(name) for name in ("train", "val", "test"))


# Relative location of the dataset folder: one level up from this notebook's
# directory ('models', per the original note) and into 'dataset'.
data_dir = "../dataset"
train_data, val_data, test_data = load_data(data_dir)

# len() of each split is its number of top-level entries (relations).
print(
    f"Loaded {len(train_data)} training relations, {len(val_data)} validation, {len(test_data)} test."
)

Loaded 81 training relations, 81 validation, 81 test.


In [11]:
def preprocess_data(data, label_dict=None):
    """Flatten a relation -> samples mapping into parallel texts and labels.

    Args:
        data: Mapping from relation name to an iterable of samples. Each
            sample must support ``sample["tokens"]``; that value is appended
            unchanged (in this dataset it is already a sentence string).
        label_dict: Optional existing ``relation -> integer id`` mapping.
            When given, its ids are reused so labels stay comparable across
            splits. When ``None`` (the default), ids ``0, 1, 2, ...`` are
            assigned in the iteration order of ``data``.

    Returns:
        A tuple ``(texts, labels, label_dict)``: the list of sample texts, an
        integer NumPy array with one label id per text, and the
        ``relation -> id`` mapping that was used (a copy when one was
        supplied).

    Raises:
        KeyError: If ``label_dict`` is supplied and ``data`` contains a
            relation that is not in it.
    """
    if label_dict is None:
        # One id per relation, numbered in the order the relations appear.
        mapping = {relation: index for index, relation in enumerate(data)}
    else:
        # Work on a copy so the caller's mapping is never modified.
        mapping = dict(label_dict)
        unknown = [relation for relation in data if relation not in mapping]
        if unknown:
            raise KeyError(f"Relations missing from label_dict: {unknown}")

    texts, labels = [], []
    for relation, samples in data.items():
        label = mapping[relation]
        for sample in samples:
            texts.append(sample["tokens"])
            labels.append(label)

    # dtype=int keeps an integer array even when there are no samples at all
    # (np.array([]) would otherwise default to float64).
    return texts, np.array(labels, dtype=int), mapping


train_texts, train_labels, label_dict = preprocess_data(train_data)
val_texts, val_labels, val_label_dict = preprocess_data(val_data)
test_texts, test_labels, test_label_dict = preprocess_data(test_data)

# Each call above numbers relations in the key order of its own split, so the
# integer ids are only comparable across splits if every relation received
# the same id as in training. Fail loudly instead of evaluating against
# silently misaligned labels.
for split_name, split_label_dict in (("validation", val_label_dict), ("test", test_label_dict)):
    misaligned = [
        relation
        for relation, index in split_label_dict.items()
        if label_dict.get(relation) != index
    ]
    if misaligned:
        raise ValueError(
            f"Label ids of the {split_name} split do not match the training "
            f"label dictionary for relations: {misaligned}"
        )

print(f"Sample text: {train_texts[0]}")
print(f"Label dictionary: {label_dict}")

Sample text: Employed by Australian National Airways (ANA) after leaving the Air Force, Lukis become airfield manager at [E1S] Essendon [E1E], [E2S] Melbourne [E2E].
Label dictionary: {'P931': 0, 'P4552': 1, 'P140': 2, 'P1923': 3, 'P150': 4, 'P6': 5, 'P27': 6, 'P449': 7, 'P1435': 8, 'P175': 9, 'P1344': 10, 'P39': 11, 'P527': 12, 'P740': 13, 'P706': 14, 'P84': 15, 'P495': 16, 'P123': 17, 'P57': 18, 'P22': 19, 'P178': 20, 'P241': 21, 'P403': 22, 'P1411': 23, 'P135': 24, 'P991': 25, 'P156': 26, 'P176': 27, 'P31': 28, 'P1877': 29, 'P102': 30, 'P1408': 31, 'P159': 32, 'P3373': 33, 'P1303': 34, 'P17': 35, 'P106': 36, 'P551': 37, 'P937': 38, 'P355': 39, 'P710': 40, 'P137': 41, 'P674': 42, 'P466': 43, 'P136': 44, 'P306': 45, 'P127': 46, 'P400': 47, 'P974': 48, 'P1346': 49, 'P460': 50, 'P86': 51, 'P118': 52, 'P264': 53, 'P750': 54, 'P58': 55, 'P3450': 56, 'P105': 57, 'P276': 58, 'P101': 59, 'P407': 60, 'P1001': 61, 'P800': 62, 'P131': 63, 'P177': 64, 'P364': 65, 'P2094': 66, 'P361': 67, 'P641':

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Random-guess baseline: the accuracy a model must beat to show it learned
# anything. Expected value is roughly 1 / num_classes.

# Fixed seed so the baseline numbers are reproducible across kernel restarts.
RANDOM_BASELINE_SEED = 42
baseline_rng = np.random.default_rng(RANDOM_BASELINE_SEED)

# Number of distinct relation classes seen in training
num_classes = len(label_dict)

# Draw one uniformly random class id in [0, num_classes) per test sample
random_predictions = baseline_rng.integers(0, num_classes, size=len(test_labels))

# Compute accuracy
random_acc = accuracy_score(test_labels, random_predictions)

# Per-class precision/recall/F1; zero_division=0 reports 0 (without warnings)
# for classes that were never predicted.
random_report = classification_report(test_labels, random_predictions, digits=4, zero_division=0)

print(f"Random Baseline Accuracy: {random_acc:.4f}")
print("Random Baseline Classification Report:")
print(random_report)

Random Baseline Accuracy: 0.0145
Random Baseline Classification Report:
              precision    recall  f1-score   support

           0     0.0220    0.0286    0.0248        70
           1     0.0120    0.0143    0.0131        70
           2     0.0119    0.0143    0.0130        70
           3     0.0112    0.0143    0.0126        70
           4     0.0000    0.0000    0.0000        70
           5     0.0260    0.0286    0.0272        70
           6     0.0154    0.0143    0.0148        70
           7     0.0111    0.0143    0.0125        70
           8     0.0294    0.0286    0.0290        70
           9     0.0000    0.0000    0.0000        70
          10     0.0779    0.0857    0.0816        70
          11     0.0149    0.0143    0.0146        70
          12     0.0116    0.0143    0.0128        70
          13     0.0143    0.0143    0.0143        70
          14     0.0303    0.0286    0.0294        70
          15     0.0000    0.0000    0.0000        70
         