In [None]:
# Import required libraries for knowledge graph embeddings
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torch.optim import Adam
from pykeen import predict
from pykeen.models import TransE, ComplEx, DistMult, RotatE
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.evaluation import RankBasedEvaluator
from pykeen.training import SLCWATrainingLoop
from pykeen.sampling import BasicNegativeSampler
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import numpy as np



# Custom function to load triples from our dataset folders
def load_custom_dataset(dataset_name, data_root="/home/bosa/skg/data/skg"):
    """
    Load the triples of a custom dataset into a PyKEEN TriplesFactory.

    Reads ``<data_root>/<dataset_name>/triples.tsv`` as a tab-separated file whose
    first line is a header and whose rows are (head, relation, tail). Every label
    is kept as the exact string found in the file.

    Args:
        dataset_name: Name of the dataset folder (e.g., 'appdia', 'imdb', etc.)
        data_root: Folder that contains one sub-folder per dataset. Defaults to
            the location this notebook has always used.

    Returns:
        A TriplesFactory object with the dataset loaded. Entity and relation ids
        are assigned in order of first appearance (head before tail within a row).
    """
    # Path to the triples file
    triples_path = f"{data_root}/{dataset_name}/triples.tsv"

    # dtype=str + keep_default_na=False: with the pandas defaults, labels such as
    # "null", "NA" or "nan" are parsed as missing values, so distinct entities
    # would collapse into NaN instead of keeping their own ids.
    triples_df = pd.read_csv(triples_path, sep='\t', dtype=str, keep_default_na=False)

    entity_to_id = {}
    relation_to_id = {}
    mapped_triples = []

    # setdefault only stores len(mapping) when the label is new, which yields the
    # same first-seen numbering as a separate "build mappings" pass.
    for h, r, t in triples_df.values:
        head_id = entity_to_id.setdefault(h, len(entity_to_id))
        tail_id = entity_to_id.setdefault(t, len(entity_to_id))
        relation_id = relation_to_id.setdefault(r, len(relation_to_id))
        mapped_triples.append([head_id, relation_id, tail_id])

    # reshape(-1, 3) keeps the array two-dimensional even for a header-only file
    return TriplesFactory(
        mapped_triples=np.array(mapped_triples, dtype=np.int64).reshape(-1, 3),
        entity_to_id=entity_to_id,
        relation_to_id=relation_to_id
    )

# Function to run experiment and print results
def run_experiment(dataset_name, model_class, embedding_dim=128, num_epochs=200, batch_size=512, learning_rate=0.01, use_graph_factory=False, graph_factory=None, random_seed=None):
    """
    Run a knowledge graph embedding experiment on the given dataset

    The triples are split 80/20 into training and testing sets, the model is
    trained with an sLCWA training loop, evaluated with rank-based metrics, and
    Hits@1/3/10, MRR and MR are printed.

    Args:
        dataset_name: Name of the dataset folder
        model_class: PyKEEN model class to use
        embedding_dim: Dimension of embeddings
        num_epochs: Number of training epochs
        batch_size: Batch size for training (also used for evaluation)
        learning_rate: Learning rate for optimizer
        use_graph_factory: Whether to use the provided graph_factory instead of loading from file
        graph_factory: TriplesFactory created from the NetworkX graph
        random_seed: Optional seed. When given it is used for the train/test
            split, the model initialisation and the pipeline, so repeated runs
            (and different models) see the same split. When None (default) no
            seed is passed anywhere and the calls are the same as before.

    Returns:
        The result object returned by ``pipeline``.
    """
    print(f"Running experiment with {model_class.__name__} on {dataset_name} dataset")

    # Load dataset; the file is used whenever no usable factory was supplied
    if use_graph_factory and graph_factory is not None:
        print("Using graph created with create_nx_graph instead of loading from triples.tsv")
        triples_factory = graph_factory
    else:
        print("Loading triples from triples.tsv file")
        triples_factory = load_custom_dataset(dataset_name)

    # Seed arguments are only added when a seed was requested, so the unseeded
    # path issues exactly the same calls as before.
    seeded = random_seed is not None
    split_kwargs = dict(random_state=random_seed) if seeded else {}
    model_kwargs = dict(random_seed=random_seed) if seeded else {}
    pipeline_kwargs = dict(random_seed=random_seed) if seeded else {}

    # Create train/test split (80/20)
    training, testing = triples_factory.split([0.8, 0.2], **split_kwargs)

    # Create model
    model = model_class(triples_factory=training, embedding_dim=embedding_dim, **model_kwargs)

    # Create optimizer and training loop
    # NOTE(review): the model and training loop are handed to pipeline() as
    # ready-made instances — confirm that the installed PyKEEN version trains with
    # this loop's Adam optimizer (and therefore with learning_rate).
    optimizer = Adam(params=model.get_grad_params(), lr=learning_rate)
    trainer = SLCWATrainingLoop(
        model=model,
        triples_factory=training,
        optimizer=optimizer
    )

    # Run pipeline
    result = pipeline(
        training=training,
        testing=testing,
        model=model,
        evaluator=RankBasedEvaluator,
        negative_sampler=BasicNegativeSampler,
        training_loop=trainer,
        training_kwargs=dict(
            num_epochs=num_epochs,
            batch_size=batch_size
        ),
        evaluator_kwargs=dict(
            batch_size=batch_size
        ),
        device='cuda' if torch.cuda.is_available() else 'cpu',
        **pipeline_kwargs
    )

    # Print metrics
    hits10 = result.get_metric('hits_at_10')
    hits1 = result.get_metric('hits_at_1')
    hits3 = result.get_metric('hits_at_3')
    mrr = result.get_metric('mean_reciprocal_rank')
    mr = result.get_metric('mean_rank')

    print(f'Hits@1: {hits1:.4f}')
    print(f'Hits@3: {hits3:.4f}')
    print(f'Hits@10: {hits10:.4f}')
    print(f'MRR: {mrr:.4f}')
    print(f'MR: {mr:.4f}')

    return result

  from .autonotebook import tqdm as notebook_tqdm


# Knowledge Graph Embedding Experiments on Custom Datasets

This notebook performs knowledge graph embedding experiments on custom datasets stored in the `data/skg/` folder. It replicates the experiments from the `wn_completion_experiments.ipynb` notebook but uses custom knowledge graphs instead of WordNet.

The notebook includes experiments with four different knowledge graph embedding models:
1. RotatE
2. DistMult
3. ComplEx
4. TransE

The main steps are:
1. Loading custom triples from the dataset folders
2. Running experiments with each model
3. Comparing results across models
4. Experimenting with multiple datasets
5. Hyperparameter tuning

In [None]:
# Discover the available datasets: every sub-folder of the data root that
# contains a triples.tsv file
dataset_folders = [
    entry
    for root in ('/home/bosa/skg/data/skg/',)
    for entry in os.listdir(root)
    if os.path.isdir(os.path.join(root, entry))
    and os.path.exists(os.path.join(root, entry, 'triples.tsv'))
]
print(f"Available datasets: {dataset_folders}")

# Dataset used by the single-dataset experiments below; change to try another one
selected_dataset = 'imdb'

Available datasets: ['shakespeare', 'politeness', 'olid', 'sarcasm', 'appdia', 'paradetox', 'gyafc', 'yelp', 'imdb', 'wnc']


## Graph Creation Approach

This notebook offers two approaches to create knowledge graphs for experiments:

1. **Direct Graph Creation**: Using the `create_nx_graph` function from the `graph_creation` module to create NetworkX graph objects directly from the source data files (nodes, edges, etc.)

2. **Triples File Loading**: Loading pre-generated triples from `triples.tsv` files in each dataset folder

The notebook attempts direct graph creation first and falls back to loading the triples file if that fails. You can also select the approach explicitly through the `method` argument of `create_knowledge_graph` (`'direct'` or `'triples'`).

In [None]:
# Function to create triples from NetworkX graph for PyKEEN
def create_triples_from_graph(graph):
    """
    Turn the edges of a NetworkX graph into (head, relation, tail) triples for PyKEEN

    The relation of an edge is its 'edge_type' attribute; edges without that
    attribute get the relation 'default_edge'.

    Args:
        graph: NetworkX MultiGraph object

    Returns:
        List of triples in the format [head, relation, tail], one per edge, in
        the order the graph yields its edges
    """
    return [
        [source, attrs.get('edge_type', 'default_edge'), target]
        for source, target, attrs in graph.edges(data=True)
    ]


# Fix import issues with the graph_creation module
import sys
sys.path.append('/home/bosa/skg')

# Import networkx explicitly
import networkx as nx
import os
import numpy as np

# Create a wrapper function to run create_nx_graph with proper imports
def create_nx_graph_wrapper(dataset_name, style_1_name, style_2_name):
    """
    Build the NetworkX graph for a dataset through the project's create_nx_graph

    create_nx_graph is imported inside the call, so an import problem in the
    graph-creation code is raised here, at call time. The working directory
    that was current on entry is restored on exit, whether the call succeeds
    or raises.

    Args:
        dataset_name: Name of the dataset, forwarded unchanged
        style_1_name: First style name, forwarded unchanged
        style_2_name: Second style name, forwarded unchanged

    Returns:
        Whatever create_nx_graph returns for the three arguments
    """
    cwd_on_entry = os.getcwd()

    try:
        from util.graph_creation.create_graph import create_nx_graph

        graph = create_nx_graph(dataset_name, style_1_name, style_2_name)
    finally:
        # Undo any directory change made while the graph was being built
        os.chdir(cwd_on_entry)

    return graph

In [4]:
# Make sure networkx is imported
import networkx as nx

# Check if all necessary modules are available by printing their __name__.
# NOTE(review): create_graph, create_nodes and create_edges are not bound by any
# import visible in this notebook, so the first print raises NameError and only
# the except-branch messages are shown. Import the modules under these names
# before relying on this check.
try:
    print(f"Imported create_graph module: {create_graph.__name__}")
    print(f"Imported create_nodes module: {create_nodes.__name__}")
    print(f"Imported create_edges module: {create_edges.__name__}")
    print(f"All required modules are available")
except Exception as e:
    # Any failure above (including an undefined name) is reported, not raised
    print(f"Error checking modules: {e}")
    print("Please make sure all required modules are installed and properly imported")

Error checking modules: name 'create_graph' is not defined
Please make sure all required modules are installed and properly imported


In [None]:
# Function to use prepare_triples to generate triple files
def create_triples_file(dataset_name):
    """
    Generate the triples.tsv file of a dataset with the prepare_triples module

    The two style names passed to prepare_triples are derived from the names of
    the 'pmi_*' files in the dataset folder; afterwards the function reports
    whether triples.tsv exists and how many data lines it has.

    Args:
        dataset_name: Name of the dataset (e.g., 'appdia', 'imdb', etc.)

    Raises:
        ValueError: If fewer than two style names are found in the folder
    """
    dataset_dir = f'/home/bosa/skg/data/skg/{dataset_name}'

    # A style name is what remains of a pmi_* file name once 'pmi_' and
    # '.edges' are removed
    style_names = []
    for filename in os.listdir(dataset_dir):
        if filename.startswith('pmi_'):
            style_names.append(filename.replace('pmi_', '').replace('.edges', ''))
    if len(style_names) < 2:
        raise ValueError(f"Need at least 2 style names for dataset {dataset_name}, found {style_names}")

    # Imported here so the module is only required when the file is (re)built
    from graph_creation import prepare_triples

    print(f"Creating triples for {dataset_name} with styles: {style_names[0]} and {style_names[1]}")
    prepare_triples.prepare_triples(dataset_name, style_names[0], style_names[1])

    # Report the outcome
    triples_path = f"/home/bosa/skg/data/skg/{dataset_name}/triples.tsv"
    if not os.path.exists(triples_path):
        print(f"Failed to create triples file at {triples_path}")
        return

    print(f"Successfully created triples file at {triples_path}")
    with open(triples_path, 'r') as handle:
        num_lines = sum(1 for _ in handle) - 1  # the first line is the header
    print(f"Number of triples: {num_lines}")

# Uncomment to create triples file for a dataset if needed
# create_triples_file(selected_dataset)

In [None]:
# Function to choose the best approach for knowledge graph creation
def create_knowledge_graph(dataset_name, method='direct', recreate_triples=False):
    """
    Build the knowledge graph of a dataset, either directly from the source
    files or from its triples.tsv file

    With method='direct' the graph is built with create_and_prepare_graph; if
    that raises, the error is printed and the triples file is used instead.
    With method='triples' the triples file is loaded, after being generated
    first when it is missing or when recreate_triples is set.

    Args:
        dataset_name: Name of the dataset (e.g., 'appdia', 'imdb', etc.)
        method: Method to use for graph creation ('direct' or 'triples')
        recreate_triples: Whether to recreate the triples file if using 'triples' method

    Returns:
        A tuple (triples_factory, used_direct): the TriplesFactory with the graph
        data, and True when it came from direct graph creation, False when it was
        loaded from the triples file

    Raises:
        ValueError: If method is neither 'direct' nor 'triples', or if the
            triples file has to be generated and generating it fails
    """
    if method != 'direct' and method != 'triples':
        raise ValueError(f"Invalid method: {method}. Use 'direct' or 'triples'.")

    if method == 'direct':
        try:
            print(f"Attempting direct graph creation for {dataset_name}...")
            return create_and_prepare_graph(dataset_name), True
        except Exception as e:
            # Best effort: report and continue with the triples file below
            print(f"Direct graph creation failed: {e}")
            print("Falling back to triples file method...")

    # Triples-file path (requested explicitly or reached as the fallback)
    triples_path = f"/home/bosa/skg/data/skg/{dataset_name}/triples.tsv"
    must_generate = not os.path.exists(triples_path) or recreate_triples
    if must_generate:
        try:
            print(f"Creating triples file for {dataset_name}...")
            create_triples_file(dataset_name)
        except Exception as e:
            print(f"Failed to create triples file: {e}")
            raise ValueError(f"Cannot create knowledge graph for {dataset_name}: both methods failed")

    print(f"Loading from triples file for {dataset_name}...")
    return load_custom_dataset(dataset_name), False

# Example usage:
# graph_triples_factory, use_created_graph = create_knowledge_graph(selected_dataset, method='direct')
# Or to force using triples file:
# graph_triples_factory, use_created_graph = create_knowledge_graph(selected_dataset, method='triples', recreate_triples=False)

In [None]:
# Function to create graph and prepare it for PyKEEN
def create_and_prepare_graph(dataset_name):
    """
    Build the NetworkX graph of a dataset and convert it to a PyKEEN TriplesFactory

    The style names needed by the graph builder are derived from the 'pmi_*'
    files of the dataset folder. Graph statistics are printed, the edges are
    turned into triples, and entities and relations are numbered in order of
    first appearance.

    Args:
        dataset_name: Name of the dataset (e.g., 'appdia', 'imdb', etc.)

    Returns:
        TriplesFactory object with the graph data

    Raises:
        ValueError: If fewer than two style names are found in the folder
    """
    dataset_dir = f'/home/bosa/skg/data/skg/{dataset_name}'

    # A style name is what remains of a pmi_* file name once 'pmi_' and
    # '.edges' are removed
    style_names = [
        filename.replace('pmi_', '').replace('.edges', '')
        for filename in os.listdir(dataset_dir)
        if filename.startswith('pmi_')
    ]
    if len(style_names) < 2:
        raise ValueError(f"Need at least 2 style names for dataset {dataset_name}, found {style_names}")

    # Go through the wrapper rather than calling create_nx_graph directly
    print(f"Creating graph for {dataset_name} with styles: {style_names[0]} and {style_names[1]}")
    graph = create_nx_graph_wrapper(dataset_name, style_names[0], style_names[1])

    # Report some statistics
    print(f"Graph Statistics:")
    print(f"Number of nodes: {graph.number_of_nodes()}")
    print(f"Number of edges: {graph.number_of_edges()}")

    edge_types = set()
    for _, _, attrs in graph.edges(data=True):
        if 'edge_type' in attrs:
            edge_types.add(attrs['edge_type'])
    print(f"Edge types: {edge_types}")

    triples = create_triples_from_graph(graph)

    # Number the labels: setdefault stores len(mapping) only for unseen labels,
    # so ids follow first appearance (head before tail within a triple)
    entity_to_id = {}
    relation_to_id = {}
    for head, relation, tail in triples:
        entity_to_id.setdefault(head, len(entity_to_id))
        entity_to_id.setdefault(tail, len(entity_to_id))
        relation_to_id.setdefault(relation, len(relation_to_id))

    mapped_triples = [
        [entity_to_id[head], relation_to_id[relation], entity_to_id[tail]]
        for head, relation, tail in triples
    ]

    # Create TriplesFactory
    from pykeen.triples import TriplesFactory
    return TriplesFactory(
        mapped_triples=np.array(mapped_triples),
        entity_to_id=entity_to_id,
        relation_to_id=relation_to_id
    )

In [8]:
# Create a graph for the selected dataset.
# This cell defines three globals read by the experiment cells below:
#   graph_triples_factory - the TriplesFactory (None if creation failed)
#   use_created_graph     - True only when direct graph creation succeeded
#   graph_factory_available - whether any factory could be built here
try:
    print(f"Creating knowledge graph for {selected_dataset}...")
    # Use the unified approach - tries direct graph creation first, falls back to triples if needed
    graph_triples_factory, use_created_graph = create_knowledge_graph(selected_dataset, method='direct')
    
    # Print info about the created graph
    print(f"\nSuccessfully created graph and converted to PyKEEN TriplesFactory")
    print(f"Method used: {'Direct graph creation' if use_created_graph else 'Triples file loading'}")
    print(f"Number of unique entities: {len(graph_triples_factory.entity_to_id)}")
    print(f"Number of unique relations: {len(graph_triples_factory.relation_to_id)}")
    print(f"Number of triples: {graph_triples_factory.num_triples}")
    
    # Set flag for model experiments
    graph_factory_available = True
    
    # Optionally visualize a small sample of the graph
    # Uncomment to visualize
    # if use_created_graph:
    #     visualize_graph_sample(selected_dataset, sample_size=30)
    
except Exception as e:
    # Any failure above (creation or the summary prints) ends up here; the
    # experiments then reload the dataset from triples.tsv themselves
    print(f"Error creating graph: {e}")
    print("Will fall back to loading triples from triples.tsv file in each experiment")
    graph_factory_available = False
    graph_triples_factory = None
    use_created_graph = False

Creating knowledge graph for appdia...
Attempting direct graph creation for appdia...
Creating graph for appdia with styles: offensive and non-offensive
Direct graph creation failed: No module named 'nltk'
Falling back to triples file method...
Loading from triples file for appdia...

Successfully created graph and converted to PyKEEN TriplesFactory
Method used: Triples file loading
Number of unique entities: 14880
Number of unique relations: 6
Number of triples: 24482


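# Graph Visualization

No visualization is run in this section; an optional helper, `visualize_graph_sample`, is defined in a later cell.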

# RotatE Model Experiments

In [None]:
# Run experiment with RotatE model.
# The pre-built factory is forwarded only when direct graph creation succeeded
# (use_created_graph is True); otherwise run_experiment loads triples.tsv itself.
rotate_result = run_experiment(
    dataset_name=selected_dataset,
    model_class=RotatE,
    embedding_dim=128,  
    num_epochs=200,
    batch_size=512,
    learning_rate=0.01,
    use_graph_factory=use_created_graph,
    graph_factory=graph_triples_factory if use_created_graph else None
)

using automatically assigned random_state=1915716316
No random seed is specified. This may lead to non-reproducible results.


Running experiment with RotatE on appdia dataset
Loading triples from triples.tsv file


No random seed is specified. Setting to 1768047711.
Training epochs on cuda:0: 100%|██████████| 200/200 [00:53<00:00,  3.73epoch/s, loss=0.00266, prev_loss=0.00222]
Evaluating on cuda:0: 100%|██████████| 4.90k/4.90k [00:03<00:00, 1.39ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 3.56s seconds


Hits@1: 0.2614
Hits@3: 0.3317
Hits@10: 0.4015
MRR: 0.3109
MR: 1588.3792


# DistMult Model Experiments

In [10]:
# Run experiment with DistMult model (same settings as the other model runs).
# The pre-built factory is forwarded only when direct graph creation succeeded;
# otherwise run_experiment loads triples.tsv itself.
distmult_result = run_experiment(
    dataset_name=selected_dataset,
    model_class=DistMult,
    embedding_dim=128,
    num_epochs=200,
    batch_size=512,
    learning_rate=0.01,
    use_graph_factory=use_created_graph,
    graph_factory=graph_triples_factory if use_created_graph else None
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [5264, 4897]
INFO:pykeen.pipeline.api:Using device: cuda


Running experiment with DistMult on appdia dataset
Loading triples from triples.tsv file


Training epochs on cuda:0: 100%|██████████| 200/200 [00:55<00:00,  3.59epoch/s, loss=0.563, prev_loss=0.573]
Evaluating on cuda:0: 100%|██████████| 4.90k/4.90k [00:00<00:00, 5.37ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.96s seconds


Hits@1: 0.1095
Hits@3: 0.1507
Hits@10: 0.2116
MRR: 0.1443
MR: 3912.2021


# ComplEx Model Experiments

In [None]:
# Run experiment with ComplEx model (same settings as the other model runs).
# The pre-built factory is forwarded only when direct graph creation succeeded;
# otherwise run_experiment loads triples.tsv itself.
complex_result = run_experiment(
    dataset_name=selected_dataset,
    model_class=ComplEx,
    embedding_dim=128,
    num_epochs=200,
    batch_size=512,
    learning_rate=0.01,
    use_graph_factory=use_created_graph,
    graph_factory=graph_triples_factory if use_created_graph else None
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [5264, 4897]
INFO:pykeen.pipeline.api:Using device: cuda


Running experiment with ComplEx on appdia dataset
Loading triples from triples.tsv file


Training epochs on cuda:0:  15%|█▌        | 30/200 [00:10<01:01,  2.74epoch/s, loss=2.94, prev_loss=3.01]

# TransE Model Experiments

In [None]:
# Run experiment with TransE model (same settings as the other model runs).
# The pre-built factory is forwarded only when direct graph creation succeeded;
# otherwise run_experiment loads triples.tsv itself.
transe_result = run_experiment(
    dataset_name=selected_dataset,
    model_class=TransE,
    embedding_dim=128,
    num_epochs=200,
    batch_size=512,
    learning_rate=0.01,
    use_graph_factory=use_created_graph,
    graph_factory=graph_triples_factory if use_created_graph else None
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [5264, 4897]


INFO:pykeen.pipeline.api:Using device: cuda


Running experiment with TransE on appdia dataset
Loading triples from triples.tsv file


Training epochs on cuda:0: 100%|██████████| 100/100 [00:25<00:00,  3.90epoch/s, loss=0.0057, prev_loss=0.00575]
Evaluating on cuda:0: 100%|██████████| 4.90k/4.90k [00:00<00:00, 5.44ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.95s seconds


Hits@1: 0.0000
Hits@3: 0.0379
Hits@10: 0.0854
MRR: 0.0316
MR: 2320.5002


# Comparing Results Across Models

In [None]:
# Function to collect metrics from all models
def compare_models(results_dict):
    """
    Print, for every metric, the models ranked from best to worst

    Args:
        results_dict: Mapping of model name to a result object that provides
            get_metric(name)

    All metric values are collected before anything is printed. Models are
    listed in descending order of the value, except for 'mean_rank', where a
    lower value is better and the order is ascending.
    """
    metrics = ['hits_at_1', 'hits_at_3', 'hits_at_10', 'mean_reciprocal_rank', 'mean_rank']

    # metric -> [(model name, value), ...] in the order of results_dict
    comparison = {
        metric: [
            (model_name, result.get_metric(metric))
            for model_name, result in results_dict.items()
        ]
        for metric in metrics
    }

    # Print comparison
    for metric in metrics:
        print(f"\n{metric.upper()}:")
        higher_is_better = metric != 'mean_rank'
        ranked = sorted(comparison[metric], key=lambda x: x[1], reverse=higher_is_better)
        for model_name, value in ranked:
            print(f"{model_name}: {value:.4f}")

# Gather the pipeline result of every model under its display name
all_results = dict(
    RotatE=rotate_result,
    DistMult=distmult_result,
    ComplEx=complex_result,
    TransE=transe_result,
)

# Print the per-metric ranking of the four models
compare_models(all_results)


HITS_AT_1:
RotatE: 0.2493
DistMult: 0.0711
ComplEx: 0.0002
TransE: 0.0000

HITS_AT_3:
RotatE: 0.3182
DistMult: 0.0984
TransE: 0.0379
ComplEx: 0.0006

HITS_AT_10:
RotatE: 0.3841
DistMult: 0.1327
TransE: 0.0854
ComplEx: 0.0015

MEAN_RECIPROCAL_RANK:
RotatE: 0.2979
DistMult: 0.0918
TransE: 0.0316
ComplEx: 0.0011

MEAN_RANK:
RotatE: 1338.8655
TransE: 2320.5002
DistMult: 5336.5811
ComplEx: 7393.7578


# Experimenting with Multiple Datasets

In [None]:
# Function to run experiments on multiple datasets with the best performing model
def run_multi_dataset_experiments(datasets, model_class, use_created_graphs=True, **kwargs):
    """
    Run the same model on several datasets

    Args:
        datasets: Iterable of dataset names
        model_class: PyKEEN model class to use for every dataset
        use_created_graphs: When True, try to build each graph with
            create_and_prepare_graph first; if that raises, the error is
            printed and the dataset is loaded from its triples.tsv instead
        **kwargs: Forwarded unchanged to run_experiment

    Returns:
        Dict mapping each dataset name to the value returned by run_experiment
    """
    results = {}

    for dataset in datasets:
        print(f"\n{'='*50}\nRunning experiments on {dataset} dataset\n{'='*50}")

        # Start from the triples.tsv route; switch to the built graph only if
        # it was requested and could actually be created
        graph_factory, use_graph = None, False
        if use_created_graphs:
            try:
                graph_factory, use_graph = create_and_prepare_graph(dataset), True
            except Exception as e:
                print(f"Error creating graph for {dataset}: {e}")
                print("Will fall back to loading triples from triples.tsv file")

        results[dataset] = run_experiment(
            dataset_name=dataset,
            model_class=model_class,
            use_graph_factory=use_graph,
            graph_factory=graph_factory,
            **kwargs
        )

    return results

# Example: multi-dataset experiments. The code is kept inside a bare
# triple-quoted string so it is not executed; to run it, remove the surrounding
# quotes. As the last expression of the cell, the string itself is echoed as
# the cell output.
'''
# Select datasets to experiment with
selected_datasets = ['appdia', 'imdb', 'olid']  # Add more or change as needed

# Run experiments with the best performing model (based on previous comparison)
best_model = ComplEx  # Change this to the best model from your comparison

# Run multi-dataset experiments
dataset_results = run_multi_dataset_experiments(
    datasets=selected_datasets,
    model_class=best_model,
    use_created_graphs=True,  # Set to False to use triples.tsv files instead
    embedding_dim=128,
    num_epochs=200,
    batch_size=512,
    learning_rate=0.01
)
'''

"\n# Select datasets to experiment with\nselected_datasets = ['appdia', 'imdb', 'olid']  # Add more or change as needed\n\n# Run experiments with the best performing model (based on previous comparison)\nbest_model = ComplEx  # Change this to the best model from your comparison\n\n# Run multi-dataset experiments\ndataset_results = run_multi_dataset_experiments(\n    datasets=selected_datasets,\n    model_class=best_model,\n    use_created_graphs=True,  # Set to False to use triples.tsv files instead\n    embedding_dim=128,\n    num_epochs=100,\n    batch_size=512,\n    learning_rate=0.01\n)\n"

# Hyperparameter Tuning

In [None]:
# Function to run hyperparameter tuning experiments
def run_hyperparameter_tuning(dataset_name, model_class, embedding_dims, learning_rates, use_created_graph=True, graph_factory=None, num_epochs=200, batch_size=512):
    """
    Grid-search embedding dimension and learning rate for one model and dataset

    Args:
        dataset_name: Name of the dataset folder
        model_class: PyKEEN model class to use
        embedding_dims: Embedding dimensions to try
        learning_rates: Learning rates to try for every embedding dimension
        use_created_graph: Forwarded to run_experiment as use_graph_factory
        graph_factory: Forwarded to run_experiment
        num_epochs: Number of training epochs per run
        batch_size: Batch size per run

    Returns:
        DataFrame with one row per (embedding_dim, learning_rate) combination and
        the columns embedding_dim, learning_rate, hits@1, hits@3, hits@10, mrr, mr
    """
    # Output column -> metric name understood by the result object
    metric_columns = {
        'hits@1': 'hits_at_1',
        'hits@3': 'hits_at_3',
        'hits@10': 'hits_at_10',
        'mrr': 'mean_reciprocal_rank',
        'mr': 'mean_rank',
    }

    # Lazy grid: learning rates vary fastest, as with two nested loops
    grid = (
        (embedding_dim, lr)
        for embedding_dim in embedding_dims
        for lr in learning_rates
    )

    rows = []
    for embedding_dim, lr in grid:
        print(f"\n{'='*50}\nTesting with embedding_dim={embedding_dim}, lr={lr}\n{'='*50}")

        result = run_experiment(
            dataset_name=dataset_name,
            model_class=model_class,
            embedding_dim=embedding_dim,
            num_epochs=num_epochs,
            batch_size=batch_size,
            learning_rate=lr,
            use_graph_factory=use_created_graph,
            graph_factory=graph_factory
        )

        row = {'embedding_dim': embedding_dim, 'learning_rate': lr}
        row.update((column, result.get_metric(metric)) for column, metric in metric_columns.items())
        rows.append(row)

    # Convert to DataFrame for easy analysis
    return pd.DataFrame(rows)

# Example: hyperparameter tuning. The code is kept inside a bare triple-quoted
# string so it is not executed; to run it, remove the surrounding quotes. As the
# last expression of the cell, the string itself is echoed as the cell output.
'''
# Define hyperparameters to tune
embedding_dims = [100, 500, 1024]
learning_rates = [0.001, 0.01, 0.1]

# Select best model and dataset
best_model = ComplEx  # Change based on your earlier results
tuning_dataset = selected_dataset

# Run hyperparameter tuning
hyperparameter_results = run_hyperparameter_tuning(
    dataset_name=tuning_dataset,
    model_class=best_model,
    embedding_dims=embedding_dims,
    learning_rates=learning_rates,
    use_created_graph=use_created_graph,
    graph_factory=graph_triples_factory if use_created_graph else None
)

# Sort by MRR (or another metric of your choice) to find best hyperparameters
hyperparameter_results.sort_values('mrr', ascending=False)
'''

"\n# Define hyperparameters to tune\nembedding_dims = [100, 500, 1024]\nlearning_rates = [0.001, 0.01, 0.1]\n\n# Select best model and dataset\nbest_model = ComplEx  # Change based on your earlier results\ntuning_dataset = selected_dataset\n\n# Run hyperparameter tuning\nhyperparameter_results = run_hyperparameter_tuning(\n    dataset_name=tuning_dataset,\n    model_class=best_model,\n    embedding_dims=embedding_dims,\n    learning_rates=learning_rates,\n    use_created_graph=use_created_graph,\n    graph_factory=graph_triples_factory if use_created_graph else None\n)\n\n# Sort by MRR (or another metric of your choice) to find best hyperparameters\nhyperparameter_results.sort_values('mrr', ascending=False)\n"

In [None]:
# Optional: Visualize a subset of the graph
def visualize_graph_sample(dataset_name, sample_size=100):
    """
    Create and visualize a sample of the graph

    The graph is built with create_nx_graph_wrapper, reduced to at most
    sample_size randomly chosen nodes (and the edges among them), and drawn with
    a spring layout. Edges are coloured by their 'edge_type' attribute (edges
    without one count as 'default_edge'), and a legend maps colours to types.

    Args:
        dataset_name: Name of the dataset
        sample_size: Number of nodes to sample

    Raises:
        ValueError: If fewer than two style names are found in the dataset
            folder (same check as in the other graph-building helpers)
    """
    # Get style names
    data_dir = f'/home/bosa/skg/data/skg/{dataset_name}'
    pmi_files = [f for f in os.listdir(data_dir) if f.startswith('pmi_')]
    style_names = [f.replace('pmi_', '').replace('.edges', '') for f in pmi_files]
    if len(style_names) < 2:
        raise ValueError(f"Need at least 2 style names for dataset {dataset_name}, found {style_names}")

    # Create graph using our wrapper function
    graph = create_nx_graph_wrapper(dataset_name, style_names[0], style_names[1])

    # Sample nodes
    nodes = list(graph.nodes())
    if len(nodes) > sample_size:
        # Sample positions, not the labels: passing the node list itself to
        # np.random.choice converts it to a NumPy array first, which coerces
        # mixed-type labels to one type and rejects tuple labels.
        picked = np.random.choice(len(nodes), sample_size, replace=False)
        subgraph = graph.subgraph([nodes[i] for i in picked])
    else:
        subgraph = graph

    # Set up the plot
    plt.figure(figsize=(12, 10))

    # Edge type of every edge, in the order the subgraph yields its edges
    edge_type_per_edge = [
        data.get('edge_type', 'default_edge')
        for _, _, data in subgraph.edges(data=True)
    ]

    # Create a color map; sorting makes the type -> colour assignment the same
    # on every run (plain set iteration order is not stable across runs)
    colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink']
    edge_type_to_color = {
        edge_type: colors[i % len(colors)]
        for i, edge_type in enumerate(sorted(set(edge_type_per_edge), key=str))
    }

    # Get edge colors
    edge_colors = [edge_type_to_color[edge_type] for edge_type in edge_type_per_edge]

    # Draw the graph
    pos = nx.spring_layout(subgraph, seed=42)  # Position nodes using Fruchterman-Reingold
    nx.draw(subgraph, pos, with_labels=True, node_size=300, node_color='skyblue',
            font_size=8, font_weight='bold', edge_color=edge_colors, width=1.5, alpha=0.7)

    # Create legend
    legend_elements = [
        plt.Line2D([0], [0], color=color, lw=2, label=edge_type)
        for edge_type, color in edge_type_to_color.items()
    ]
    plt.legend(handles=legend_elements, title="Edge Types")

    plt.title(f"Sample of {dataset_name} Knowledge Graph")
    plt.tight_layout()
    plt.show()

# Uncomment to visualize a sample of the graph
# visualize_graph_sample(selected_dataset, sample_size=50)