## SKG Graph Completion with Multiple Models

### This script trains different graph embedding models for knowledge graph completion tasks
### on the SKG dataset and logs all metrics and hyperparameters to a CSV file.

In [1]:
import os
import torch
import pandas as pd
import numpy as np
import argparse
import time
import csv
from datetime import datetime
from torch.optim import Adam

from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.models import (
    RotatE, 
    ComplEx, 
    TransE, 
    DistMult, 
    CrossE, 
    ConvE,
    RESCAL
)
from pykeen.training import SLCWATrainingLoop

  from .autonotebook import tqdm as notebook_tqdm




 ## Set up Command Line Arguments

In [2]:
def parse_arguments():
    import sys
    
    # Handle running in Jupyter notebook
    if any('jupyter' in arg for arg in sys.argv):
        # Default arguments when running in notebook
        class Args:
            dataset = 'gyafc'
            output_file = 'results.csv'
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            seed = 42
            embedding_dims = [1024] # 128, 256, 
            learning_rates = [0.0005] # 0.001, 
            num_epochs = 1000
        return Args()
    
    # Normal argparse for command-line usage
    parser = argparse.ArgumentParser(description='Train knowledge graph embedding models on SKG data')
    parser.add_argument('--dataset', type=str, default='politeness',
                        help='Dataset folder name (e.g., politeness, olid, gyafc)')
    parser.add_argument('--output_file', type=str, default='results.csv',
                        help='Output CSV file to store results')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='Device to use for training (cuda or cpu)')
    parser.add_argument('--seed', type=int, default=42, 
                        help='Random seed for reproducibility')
    parser.add_argument('--embedding_dims', type=int, nargs='+', default=[128, 256, 512],
                        help='Embedding dimensions to try')
    parser.add_argument('--learning_rates', type=float, nargs='+', default=[0.001, 0.0005],
                        help='Learning rates to try')
    parser.add_argument('--num_epochs', type=int, default=50,
                        help='Number of training epochs')
    return parser.parse_args()



 ## Utility Functions

In [3]:
def setup_dataset(dataset_name, create_inverse=False):
    """
    Set up the dataset by loading the triples and splitting into train/valid/test sets.
    
    Args:
        dataset_name: Name of the dataset folder
        create_inverse: Whether to create inverse triples
        
    Returns:
        train, valid, test factories
    """
    # Get project root (works in both scripts and notebooks)
    try:
        # For regular Python scripts
        project_root = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # For Jupyter notebooks
        import pathlib
        project_root = str(pathlib.Path().absolute())
    
    # Define path to triples file
    triples_path = os.path.join(project_root, 'data', 'skg', dataset_name, 'triples.tsv')
    
    print(f"Loading triples from: {triples_path}")
    
    
    
    # Create triples factory
    training_factory = TriplesFactory.from_path(
        triples_path, 
        create_inverse_triples=create_inverse
    )
    
    # Split into train/valid/test
    train_factory, valid_factory, test_factory = training_factory.split([0.8, 0.1, 0.1])
    
    return train_factory, valid_factory, test_factory

In [4]:
def ensure_output_file(output_file):
    """
    Ensure the output CSV file exists with proper headers.
    """
    if not os.path.exists(output_file):
        with open(output_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([
                'timestamp', 'dataset', 'model', 'embedding_dim', 'inverse_triples',
                'learning_rate', 'num_epochs', 'batch_size', 'training_time',
                'hits@1', 'hits@3', 'hits@10', 'mrr', 'mr'
            ])
    return output_file

In [5]:
def log_results(result, config, output_file):
    """
    Log training results to CSV file.
    """
    with open(output_file, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            config['dataset'],
            config['model_name'],
            config['embedding_dim'],
            config['inverse_triples'],
            config['learning_rate'],
            config['num_epochs'],
            config['batch_size'],
            config['training_time'],
            result.metric_results.get_metric('hits_at_1'),
            result.metric_results.get_metric('hits_at_3'),
            result.metric_results.get_metric('hits_at_10'),
            result.metric_results.get_metric('mrr'),
            result.metric_results.get_metric('mr')
        ])
    
    # Also print results to console
    print("\n=== Results ===")
    print(f"Model: {config['model_name']}, Dim: {config['embedding_dim']}, LR: {config['learning_rate']}")
    print(f"Hits@1: {result.metric_results.get_metric('hits_at_1'):.4f}")
    print(f"Hits@3: {result.metric_results.get_metric('hits_at_3'):.4f}")
    print(f"Hits@10: {result.metric_results.get_metric('hits_at_10'):.4f}")
    print(f"MRR: {result.metric_results.get_metric('mrr'):.4f}")
    print(f"MR: {result.metric_results.get_metric('mr'):.4f}")
    print(f"Training time: {config['training_time']:.2f} seconds")
    print("==============\n")

## Model Training Functions

In [6]:
def get_model_class(model_name):
    """
    Get the model class by name.
    """
    models = {
        'RotatE': RotatE,
        'ComplEx': ComplEx,
        'TransE': TransE,
        'DistMult': DistMult,
        'CrossE': CrossE,
        'ConvE': ConvE,
        'RESCAL': RESCAL
    }
    return models.get(model_name)

In [7]:
def train_model(model_name, train_factory, valid_factory, test_factory, config):
    """
    Train a single model with the given configuration.
    
    Args:
        model_name: Name of the model to train
        train_factory: Training triples factory
        valid_factory: Validation triples factory
        test_factory: Test triples factory
        config: Dictionary with configuration parameters
        
    Returns:
        Pipeline result object
    """
    print(f"\nTraining {model_name} with embedding_dim={config['embedding_dim']}, lr={config['learning_rate']}")
    
    # Get model class
    model_class = get_model_class(model_name)
    
    # Create model
    model = model_class(
        triples_factory=train_factory,
        embedding_dim=config['embedding_dim']
    )
    
    # Create optimizer
    optimizer = Adam(
        params=model.get_grad_params(),
        lr=config['learning_rate']
    )
    
    # Create trainer
    trainer = SLCWATrainingLoop(
        model=model,
        triples_factory=train_factory,
        optimizer=optimizer,
    )
    
    # Start timer
    start_time = time.time()

    from pykeen.stoppers import EarlyStopper
    evaluator = RankBasedEvaluator()


    # stopper = EarlyStopper(
    #     model=model,
    #     evaluation_triples_factory=valid_factory,
    #     evaluator=evaluator,
    #     evaluation_batch_size=config['batch_size'],
    #     frequency=100,  
    #     patience=4,  
    #     relative_delta=0.01,
    #     metric='hits_at_1',
    #     training_triples_factory=train_factory,
    #     use_tdqm=True,
        
    # )
    
    # Run pipeline
    result = pipeline(
        training=train_factory,
        validation=valid_factory,
        testing=test_factory,
        model=model,
        training_loop=trainer,
        negative_sampler='basic',
        evaluator=evaluator,
        # stopper=stopper,
        training_kwargs=dict(
            num_epochs=config['num_epochs'],
            batch_size=config['batch_size'],
        ),
        evaluator_kwargs=dict(
            batch_size=config['batch_size'],
        ),
        device=config['device'],
        random_seed=config['seed'],
    )


    training_time = time.time() - start_time
    
    config['training_time'] = training_time
    
    return result

## Main Execution

In [8]:
def main():
    args = parse_arguments()
    
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    
    output_file = ensure_output_file(args.output_file)
    
    models = ['RotatE' ] #  'ComplEx', 'TransE', 'DistMult'
    
    inverse_triples_models = ['CrossE', 'TransE']
    
    # Loop over models
    for model_name in models:
        # Determine if this model should use inverse triples
        use_inverse = model_name in inverse_triples_models
        
        # Set up dataset
        train_factory, valid_factory, test_factory = setup_dataset(
            args.dataset, 
            create_inverse=use_inverse
        )
        
        for embedding_dim in args.embedding_dims:
            for lr in args.learning_rates:
                config = {
                    'model_name': model_name,
                    'dataset': args.dataset,
                    'embedding_dim': embedding_dim,
                    'learning_rate': lr,
                    'num_epochs': args.num_epochs,
                    'batch_size': 2048,
                    'device': args.device,
                    'inverse_triples': use_inverse,
                    'seed': args.seed,
                }
                
                result = train_model(
                    model_name,
                    train_factory,
                    valid_factory,
                    test_factory,
                    config
                )
                
                log_results(result, config, output_file)

In [9]:
main()

Loading triples from: /home/bosa/skg/data/skg/gyafc/triples.tsv


using automatically assigned random_state=275319731
No random seed is specified. This may lead to non-reproducible results.



Training RotatE with embedding_dim=1024, lr=0.0005


Training epochs on cuda:0: 100%|██████████| 1000/1000 [47:42<00:00,  2.86s/epoch, loss=0.00118, prev_loss=0.00118]
Evaluating on cuda:0: 100%|██████████| 19.2k/19.2k [21:00<00:00, 15.3triple/s] 
INFO:pykeen.evaluation.evaluator:Evaluation took 1272.80s seconds



=== Results ===
Model: RotatE, Dim: 1024, LR: 0.0005
Hits@1: 0.2567
Hits@3: 0.3849
Hits@10: 0.4909
MRR: 0.3398
MR: 3590.3472
Training time: 4136.65 seconds

