In [None]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
!pip install pykeen

Collecting pykeen
  Downloading pykeen-1.11.1-py3-none-any.whl.metadata (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click_default_group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting more_click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.7.0-py3-none-any.whl.metadata (17 kB)
Collecting docdata>=0.0.5 (from pykeen)
  Downloading docdata-0.0.5-py3-none-any.whl.metadata (13 kB)
Collecting class_resolver>=0.6.0 (from 

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder # Not strictly needed for LIAR labels but good practice
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time # To measure training times
import torch # PyKEEN uses PyTorch
import os # For creating directories

# PyKEEN imports
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
# Importing specific KGE models from PyKEEN
from pykeen.models import TransE, RotatE, DistMult, ComplEx

# --- 0. Create directory for saving models ---
MODEL_SAVE_DIR = "saved_pykeen_models"
if not os.path.exists(MODEL_SAVE_DIR):
    # exist_ok=True keeps this from failing if the directory appears between the
    # existence check and the call.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
    print(f"Created directory: {MODEL_SAVE_DIR}")

# --- 1. Load LIAR Dataset ---
print("Step 1: Loading LIAR Dataset from Hugging Face...")
try:
    # force_redownload bypasses the local cache, so every run fetches the data again.
    liar_dataset = load_dataset('liar', download_mode="force_redownload")
    train_hf = liar_dataset['train']
    valid_hf = liar_dataset['validation'] # Typically for hyperparameter tuning
    test_hf = liar_dataset['test']

    train_df = pd.DataFrame(train_hf)
    valid_df = pd.DataFrame(valid_hf)
    test_df = pd.DataFrame(test_hf)
    print(f"Loaded {len(train_df)} training, {len(valid_df)} validation, and {len(test_df)} test statements.")
except Exception as e:
    print(f"Error loading dataset from Hugging Face: {e}")
    print("Please ensure you have an internet connection and the 'datasets' library is correctly installed.")
    print("If the error persists, the cache might be an issue or there could be network restrictions.")
    # Nothing below can run without the data. Raising (instead of exit()) stops the cell
    # and any "Run All" while keeping the original exception and traceback attached.
    raise RuntimeError("Failed to load the LIAR dataset; see the error above.") from e

# --- 2. Construct a Single Knowledge Graph Representation (Triplets) ---
print("\nStep 2: Constructing Knowledge Graph triplets from LIAR dataset...")
# All three splits are combined so every statement (train, validation and test) becomes
# an entity in the graph. Labels are not used when building triplets.
combined_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)
kg_triplets_list = [] # (head, relation, tail) string tuples; de-duplicated later

# Character rewrites applied by normalize_entity: separators become "_", the listed
# punctuation marks are dropped.
_ENTITY_CHAR_MAP = str.maketrans({
    " ": "_", "-": "_", ",": "_",
    "'": None, ".": None, ":": None, ";": None,
})


def normalize_entity(entity_str, prefix=""):
    """Normalizes entity strings for KG consistency and adds a prefix.

    Args:
        entity_str: Raw field value. Anything that is not a string (None, NaN,
            numbers, ...) is treated as missing.
        prefix: Text prepended to the result, e.g. "spkr_".

    Returns:
        ``prefix + normalized`` where ``normalized`` is the input with surrounding
        whitespace removed, lower-cased, spaces/hyphens/commas turned into "_" and
        apostrophes, periods, colons and semicolons removed. Returns
        ``prefix + "unknown"`` when the input is missing, blank, or has nothing left
        after normalization.
    """
    if not isinstance(entity_str, str):
        return f"{prefix}unknown"
    # Strip BEFORE translating: once spaces have become underscores, strip() can no
    # longer remove them and " Obama " would become a separate entity "_obama_".
    # The second strip() removes whitespace exposed by dropped punctuation.
    normalized_str = entity_str.strip().lower().translate(_ENTITY_CHAR_MAP).strip()
    if not normalized_str:
        return f"{prefix}unknown"
    return prefix + normalized_str

# Create triplets
# Each row yields statement-level triples (speaker, subjects, context) and, when the
# speaker is known, speaker-level triples (job title, state, party). Speaker-level triples
# repeat for every statement by the same speaker; duplicates are removed afterwards.
for _, row in combined_df.iterrows():
    stmt_id_entity = normalize_entity(str(row['id']), "stmt_")

    speaker_entity = normalize_entity(row['speaker'], "spkr_")
    if speaker_entity != "spkr_unknown":
        kg_triplets_list.append((stmt_id_entity, 'has_speaker', speaker_entity))

    # Check for a missing value on the raw cell: str(NaN) is the non-empty string "nan",
    # so converting first would turn a missing subject into a "subj_nan" entity.
    raw_subject = row['subject']
    subject_entity_str = "" if pd.isna(raw_subject) else str(raw_subject)
    if subject_entity_str.strip() != "":
        # The subject field is a comma-separated list; each item becomes its own entity.
        subjects = subject_entity_str.split(',')
        for sub in subjects:
            norm_sub = normalize_entity(sub.strip(), "subj_")
            if norm_sub != "subj_unknown" and norm_sub != "subj_":
                kg_triplets_list.append((stmt_id_entity, 'has_subject', norm_sub))

    context_entity = normalize_entity(row['context'], "ctx_")
    if context_entity != "ctx_unknown":
        kg_triplets_list.append((stmt_id_entity, 'stated_in_context', context_entity))

    if speaker_entity != "spkr_unknown":
        job_entity = normalize_entity(row['job_title'], "job_")
        if job_entity != "job_unknown":
            kg_triplets_list.append((speaker_entity, 'has_job_title', job_entity))

        state_entity = normalize_entity(row['state_info'], "state_")
        if state_entity != "state_unknown":
            kg_triplets_list.append((speaker_entity, 'from_state', state_entity))

        party_entity = normalize_entity(row['party_affiliation'], "party_")
        if party_entity != "party_unknown":
            kg_triplets_list.append((speaker_entity, 'affiliated_with_party', party_entity))

# Convert to NumPy array of unique triplets
# sorted() fixes the row order. Iterating a plain set of strings can give a different
# order in every interpreter session (string hashing is randomized per process), which
# would change the order of the training triples between runs despite the fixed seed.
unique_triplets = sorted(set(kg_triplets_list))
if not unique_triplets:
    print("No triplets generated. Check data and normalization. Exiting.")
    # Raise instead of exit(): this stops the cell and "Run All" with a visible error.
    raise RuntimeError("No triplets generated. Check data and normalization.")
kg_triplets_np = np.array(unique_triplets, dtype=str)
print(f"Generated {len(kg_triplets_np)} unique triplets for the Knowledge Graph.")

# Create TriplesFactory for PyKEEN
# This maps entities and relations to integer IDs
tf = TriplesFactory.from_labeled_triples(kg_triplets_np)

# --- Prepare Labels (Common for all models) ---
y_train_labels = train_df['label'].values
y_valid_labels = valid_df['label'].values
y_test_labels = test_df['label'].values

# Label id -> class name, used only to title the rows of the classification report.
# The names are read from the dataset's own label feature so the ids cannot drift from the
# names; a wrong order silently mislabels every row of the report.
_label_names = getattr(train_hf.features['label'], 'names', None)
if _label_names:
    label_mapping = dict(enumerate(_label_names))
else:
    # Fallback if the label column carries no names.
    # NOTE(review): order taken from the LIAR dataset card — confirm if this branch is ever hit.
    label_mapping = {0: 'false', 1: 'half-true', 2: 'mostly-true', 3: 'true', 4: 'barely-true', 5: 'pants-fire'}
unique_labels_all = np.unique(np.concatenate((y_train_labels, y_valid_labels, y_test_labels)))
# np.unique returns sorted ids, so class_names lines up with the label order sklearn reports.
class_names = [label_mapping.get(int(i), f'unknown_{i}') for i in sorted(unique_labels_all)]


# --- 3. Loop Through KGE Models, Train, Extract Embeddings, and Run Logistic Regression ---
# Display name -> PyKEEN model class, in the order the models are trained.
# (PyKEEN ships many more models than the four compared here.)
kge_models_to_test_pykeen = dict(
    TransE=TransE,
    RotatE=RotatE,
    DistMult=DistMult,
    ComplEx=ComplEx,
)

# Filled by the training loop: model name -> metrics, timings or error details.
results_comparison = dict()

# Embedding size handed to every PyKEEN model. Models with complex-valued entity
# embeddings end up with twice as many classifier features, because the real and
# imaginary parts are concatenated before Logistic Regression.
embedding_dim_pykeen = 100
# Kept small so the demo finishes quickly.
kge_epochs = 30

# Logistic Regression settings shared by all models.
log_reg_params = dict(
    solver='saga',
    multi_class='ovr',
    C=1.0,
    max_iter=1500,
    random_state=42,
    penalty='l2',
    n_jobs=-1,
)

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"PyKEEN will use device: {device}")

# For each KGE model: train it on the full graph, save it, read out one embedding per
# statement entity, fit Logistic Regression on the training statements and score it on
# the test statements. A failure in either stage is recorded and the next model is tried.
for model_name, KGEModelClass_pykeen in kge_models_to_test_pykeen.items():
    print(f"\n--- Processing KGE Model: {model_name} (using PyKEEN) ---")
    start_time_kge = time.time()

    print(f"Training {model_name} with k={embedding_dim_pykeen}, epochs={kge_epochs}...")

    # PyKEEN pipeline parameters
    pipeline_kwargs = dict(
        model=KGEModelClass_pykeen,
        model_kwargs=dict(embedding_dim=embedding_dim_pykeen), # This is the 'k' for PyKEEN models
        training_kwargs=dict(num_epochs=kge_epochs, batch_size=1024, use_tqdm_batch=False),
        random_seed=42,
        device=device,
    )
    # Model-specific pipeline adjustments
    if model_name == "RotatE":
        pipeline_kwargs['loss'] = 'MarginRankingLoss'
        pipeline_kwargs['loss_kwargs'] = {'margin': 1.0}
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.0001}
    elif model_name == "ComplEx":
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.0001}
        pipeline_kwargs['loss'] = 'NegativeLogLikelihood'
    elif model_name == "TransE":
        pipeline_kwargs['loss'] = 'MarginRankingLoss'
        pipeline_kwargs['loss_kwargs'] = {'margin': 1.0}
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.001}
    elif model_name == "DistMult":
        pipeline_kwargs['loss'] = 'NegativeLogLikelihood'
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.001}

    try:
        pykeen_result = pipeline(
            training=tf,
            # The training triples are passed again as the test set only to satisfy the
            # pipeline; its link-prediction evaluation is therefore not a held-out score.
            testing=tf,
            **pipeline_kwargs
        )
        trained_kge_model = pykeen_result.model

        model_save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_pykeen_model.pt")
        # Saves the whole model object (not just a state dict).
        torch.save(trained_kge_model, model_save_path)
        print(f"Saved {model_name} model to {model_save_path}")

    except Exception as e:
        print(f"Error training or saving {model_name} with PyKEEN: {e}. Skipping this model.")
        results_comparison[model_name] = {'accuracy': 'KGE Error', 'f1_macro': 'KGE Error', 'report': str(e), 'training_time_kge': 0, 'training_time_lr': 0}
        continue

    # Wall-clock time of the whole pipeline() call plus saving, so it also includes the
    # pipeline's own evaluation pass.
    kge_training_time = time.time() - start_time_kge
    print(f"{model_name} (PyKEEN) training complete. Time: {kge_training_time:.2f} seconds.")

    # Extract Statement Embeddings using PyKEEN
    # The primary embedding layer for entities; indices=None returns all rows.
    entity_embedding_layer = trained_kge_model.entity_representations[0]
    raw_embeddings_tensor = entity_embedding_layer(indices=None).detach().cpu()

    if raw_embeddings_tensor.is_complex():
        print(f"  {model_name} produces complex embeddings. Concatenating real and imaginary parts.")
        real_parts = raw_embeddings_tensor.real.numpy()
        imag_parts = raw_embeddings_tensor.imag.numpy()
        # Logistic Regression needs real features: place real and imaginary parts side by side.
        entity_representations_np = np.concatenate((real_parts, imag_parts), axis=1)
    else:
        entity_representations_np = raw_embeddings_tensor.numpy()
    # Take the feature width from the matrix itself instead of assuming k or 2*k, so the
    # zero-vector fallback below always matches the real embedding width.
    actual_feature_dim_for_lr = entity_representations_np.shape[1]

    # Entity label -> embedding row. entity_id is the TriplesFactory integer id and
    # indexes the rows of the embedding matrix.
    entity_to_embedding_pykeen = {
        entity_label: entity_representations_np[entity_id]
        for entity_id, entity_label in tf.entity_id_to_label.items()
    }

    default_embedding = np.zeros(actual_feature_dim_for_lr)

    def get_statement_embeddings_for_split_pykeen(df, embeddings_dict):
        """Return an (n_statements, n_features) matrix of statement embeddings for df.

        Statement ids are normalized the same way as when the graph was built. A
        statement without an entry in embeddings_dict gets a zero vector and is counted
        in a printed warning. An empty df yields an array with zero rows.
        """
        stmt_entities = [normalize_entity(str(stmt_id_orig), "stmt_") for stmt_id_orig in df['id']]
        n_missing = sum(1 for stmt_entity in stmt_entities if stmt_entity not in embeddings_dict)
        if n_missing:
            print(f"  Warning: {n_missing} of {len(stmt_entities)} statements have no KG embedding; using zero vectors for them.")
        stmt_embeddings = [embeddings_dict.get(stmt_entity, default_embedding) for stmt_entity in stmt_entities]
        # reshape keeps the result 2-D even for an empty split, so the shape[1] lookup and
        # the empty-split guards further down work as intended.
        return np.array(stmt_embeddings).reshape(-1, default_embedding.shape[0])

    X_train_embed = get_statement_embeddings_for_split_pykeen(train_df, entity_to_embedding_pykeen)
    X_test_embed = get_statement_embeddings_for_split_pykeen(test_df, entity_to_embedding_pykeen)

    # Train Logistic Regression
    print(f"Training Logistic Regression using {model_name} (PyKEEN) embeddings (features: {X_train_embed.shape[1]})...")
    start_time_lr = time.time()
    log_reg = LogisticRegression(**log_reg_params)
    try:
        if X_train_embed.shape[0] == 0:
            raise ValueError("No training samples available after embedding extraction.")
        log_reg.fit(X_train_embed, y_train_labels)
    except Exception as e:
        print(f"Error training Logistic Regression for {model_name} (PyKEEN): {e}. Skipping.")
        results_comparison[model_name] = {'accuracy': 'LR Error', 'f1_macro': 'LR Error', 'report': str(e), 'training_time_kge': kge_training_time, 'training_time_lr': 0}
        continue

    lr_training_time = time.time() - start_time_lr
    print(f"Logistic Regression training with {model_name} (PyKEEN) embeddings complete. Time: {lr_training_time:.2f} seconds.")

    # Evaluate Logistic Regression
    if X_test_embed.shape[0] > 0:
        y_pred_test = log_reg.predict(X_test_embed)
        accuracy = accuracy_score(y_test_labels, y_pred_test)
        f1_macro = f1_score(y_test_labels, y_pred_test, average='macro', zero_division=0)
        report = classification_report(y_test_labels, y_pred_test, target_names=class_names, zero_division=0)
    else:
        accuracy = 0.0
        f1_macro = 0.0
        report = "No test samples to evaluate."
        print("Warning: No test samples found after embedding extraction for Logistic Regression.")

    results_comparison[model_name] = {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'report': report,
        'training_time_kge': kge_training_time,
        'training_time_lr': lr_training_time
    }
    print(f"Results for {model_name} (PyKEEN) embeddings: Accuracy={accuracy:.4f}, F1-Macro={f1_macro:.4f}")


# --- 4. Compare Results ---
def _fmt_score(value):
    """Format a float metric to four decimals; pass anything else through unchanged."""
    if isinstance(value, float):
        return f"{value:.4f}"
    return value


# Detailed per-model section: either the recorded error or timings, scores and report.
print("\n\n--- Overall Results Comparison (with PyKEEN) ---")
for model_name, metrics in results_comparison.items():
    print(f"\n--- {model_name} ---")
    run_failed = isinstance(metrics['accuracy'], str) and "Error" in metrics['accuracy']
    if run_failed:
        # Failed runs keep the exception text in the 'report' slot.
        print(f"  Error during processing: {metrics['report']}")
        continue
    print(f"  KGE Training Time: {metrics['training_time_kge']:.2f} seconds")
    print(f"  LR Training Time: {metrics['training_time_lr']:.2f} seconds")
    print(f"  Test Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Test F1-Score (Macro): {metrics['f1_macro']:.4f}")
    print("  Classification Report on Test Set:\n", metrics['report'])

# One summary row per model; every cell is pre-formatted as text.
summary_data = []
for model_name, metrics in results_comparison.items():
    acc = metrics.get('accuracy', 'N/A')
    f1 = metrics.get('f1_macro', 'N/A')
    kge_seconds = metrics.get('training_time_kge', 0)
    lr_seconds = metrics.get('training_time_lr', 0)
    summary_row = {'KGE Model': model_name}
    summary_row['Accuracy'] = _fmt_score(acc)
    summary_row['F1 (Macro)'] = _fmt_score(f1)
    summary_row['KGE Time (s)'] = f"{kge_seconds:.2f}"
    summary_row['LR Time (s)'] = f"{lr_seconds:.2f}"
    summary_data.append(summary_row)

summary_df = pd.DataFrame(summary_data)
print("\n--- Summary Table (with PyKEEN) ---")
try:
    summary_text = summary_df.to_string(index=False)
    print(summary_text)
except AttributeError: # fall back to the default frame printout
    print(summary_df)



Step 1: Loading LIAR Dataset from Hugging Face...


liar.py:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

Loaded 10269 training, 1284 validation, and 1283 test statements.

Step 2: Constructing Knowledge Graph triplets from LIAR dataset...
Generated 61066 unique triplets for the Knowledge Graph.


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


PyKEEN will use device: cuda

--- Processing KGE Model: TransE (using PyKEEN) ---
Training TransE with k=100, epochs=30...


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/61.1k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 18.76s seconds


Saved TransE model to saved_pykeen_models/TransE_pykeen_model.pt
TransE (PyKEEN) training complete. Time: 33.55 seconds.
Training Logistic Regression using TransE (PyKEEN) embeddings (features: 100)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Logistic Regression training with TransE (PyKEEN) embeddings complete. Time: 2.36 seconds.
Results for TransE (PyKEEN) embeddings: Accuracy=0.1995, F1-Macro=0.1565

--- Processing KGE Model: RotatE (using PyKEEN) ---
Training RotatE with k=100, epochs=30...


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/61.1k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 31.59s seconds


Saved RotatE model to saved_pykeen_models/RotatE_pykeen_model.pt
RotatE (PyKEEN) training complete. Time: 48.92 seconds.
  RotatE produces complex embeddings. Concatenating real and imaginary parts.
Training Logistic Regression using RotatE (PyKEEN) embeddings (features: 200)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)


Logistic Regression training with RotatE (PyKEEN) embeddings complete. Time: 5.29 seconds.
Results for RotatE (PyKEEN) embeddings: Accuracy=0.1988, F1-Macro=0.1562

--- Processing KGE Model: DistMult (using PyKEEN) ---
Training DistMult with k=100, epochs=30...


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/61.1k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 19.32s seconds


Saved DistMult model to saved_pykeen_models/DistMult_pykeen_model.pt
DistMult (PyKEEN) training complete. Time: 35.88 seconds.
Training Logistic Regression using DistMult (PyKEEN) embeddings (features: 100)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)


Logistic Regression training with DistMult (PyKEEN) embeddings complete. Time: 2.48 seconds.
Results for DistMult (PyKEEN) embeddings: Accuracy=0.1933, F1-Macro=0.1472

--- Processing KGE Model: ComplEx (using PyKEEN) ---
Training ComplEx with k=100, epochs=30...


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/61.1k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 17.36s seconds


Saved ComplEx model to saved_pykeen_models/ComplEx_pykeen_model.pt
ComplEx (PyKEEN) training complete. Time: 40.74 seconds.
  ComplEx produces complex embeddings. Concatenating real and imaginary parts.
Training Logistic Regression using ComplEx (PyKEEN) embeddings (features: 200)...
Logistic Regression training with ComplEx (PyKEEN) embeddings complete. Time: 5.55 seconds.
Results for ComplEx (PyKEEN) embeddings: Accuracy=0.1910, F1-Macro=0.1564


--- Overall Results Comparison (with PyKEEN) ---

--- TransE ---
  KGE Training Time: 33.55 seconds
  LR Training Time: 2.36 seconds
  Test Accuracy: 0.1995
  Test F1-Score (Macro): 0.1565
  Classification Report on Test Set:
               precision    recall  f1-score   support

  pants-fire       0.19      0.28      0.23       250
       false       0.21      0.36      0.27       267
 barely-true       0.19      0.22      0.20       249
   half-true       0.21      0.10      0.14       211
 mostly-true       0.17      0.07      0.10      

In [None]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder # Not strictly needed for LIAR labels but good practice
from sklearn.metrics import classification_report, accuracy_score, f1_score
import time # To measure training times
import torch # PyKEEN uses PyTorch
import os # For creating directories

# PyKEEN imports
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
# Importing specific KGE models from PyKEEN
from pykeen.models import TransE, RotatE, DistMult, ComplEx

# --- 0. Create directory for saving models ---
MODEL_SAVE_DIR = "saved_pykeen_models"
if not os.path.exists(MODEL_SAVE_DIR):
    # exist_ok=True keeps this from failing if the directory appears between the
    # existence check and the call.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
    print(f"Created directory: {MODEL_SAVE_DIR}")

# --- 1. Load LIAR Dataset ---
print("Step 1: Loading LIAR Dataset from Hugging Face...")
try:
    # force_redownload bypasses the local cache, so every run fetches the data again.
    liar_dataset = load_dataset('liar', download_mode="force_redownload")
    train_hf = liar_dataset['train']
    valid_hf = liar_dataset['validation'] # Typically for hyperparameter tuning
    test_hf = liar_dataset['test']

    train_df = pd.DataFrame(train_hf)
    valid_df = pd.DataFrame(valid_hf)
    test_df = pd.DataFrame(test_hf)
    print(f"Loaded {len(train_df)} training, {len(valid_df)} validation, and {len(test_df)} test statements.")
except Exception as e:
    print(f"Error loading dataset from Hugging Face: {e}")
    print("Please ensure you have an internet connection and the 'datasets' library is correctly installed.")
    print("If the error persists, the cache might be an issue or there could be network restrictions.")
    # Nothing below can run without the data. Raising (instead of exit()) stops the cell
    # and any "Run All" while keeping the original exception and traceback attached.
    raise RuntimeError("Failed to load the LIAR dataset; see the error above.") from e

# --- 2. Construct a Single Knowledge Graph Representation (Triplets) ---
print("\nStep 2: Constructing Knowledge Graph triplets from LIAR dataset...")
# All three splits are combined so every statement (train, validation and test) becomes
# an entity in the graph.
combined_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)
kg_triplets_list = [] # (head, relation, tail) string tuples; de-duplicated later

# Character rewrites applied by normalize_entity: separators become "_", the listed
# punctuation marks are dropped.
_ENTITY_CHAR_MAP = str.maketrans({
    " ": "_", "-": "_", ",": "_",
    "'": None, ".": None, ":": None, ";": None,
})


def normalize_entity(entity_str, prefix=""):
    """Normalizes entity strings for KG consistency and adds a prefix.

    Args:
        entity_str: Raw field value. Anything that is not a string (None, NaN,
            numbers, ...) is treated as missing.
        prefix: Text prepended to the result, e.g. "spkr_".

    Returns:
        ``prefix + normalized`` where ``normalized`` is the input with surrounding
        whitespace removed, lower-cased, spaces/hyphens/commas turned into "_" and
        apostrophes, periods, colons and semicolons removed. Returns
        ``prefix + "unknown"`` when the input is missing, blank, or has nothing left
        after normalization.
    """
    if not isinstance(entity_str, str):
        return f"{prefix}unknown"
    # Strip BEFORE translating: once spaces have become underscores, strip() can no
    # longer remove them and " Obama " would become a separate entity "_obama_".
    # The second strip() removes whitespace exposed by dropped punctuation.
    normalized_str = entity_str.strip().lower().translate(_ENTITY_CHAR_MAP).strip()
    if not normalized_str:
        return f"{prefix}unknown"
    return prefix + normalized_str

def discretize_count(count, low_max=2, medium_max=10):
    """Discretizes numerical counts into categories.

    Args:
        count: Numeric count; may be missing (None/NaN).
        low_max: Largest value still labelled "low". Defaults to 2.
        medium_max: Largest value still labelled "medium". Defaults to 10.

    Returns:
        "none" for a missing or zero count, "low" for values up to ``low_max``,
        "medium" for values up to ``medium_max``, otherwise "high". Negative values
        are not treated specially and fall into "low".
    """
    if pd.isna(count) or count == 0:
        return "none"
    if count <= low_max:
        return "low"
    if count <= medium_max:
        return "medium"
    return "high"

# Create triplets
# Speaker credit-history column -> relation used for the discretized profile triple.
# Built once here because the mapping is the same for every row.
credit_history_cols = {
    'barely_true_counts': 'has_barely_true_profile',
    'false_counts': 'has_false_profile',
    'half_true_counts': 'has_half_true_profile',
    'mostly_true_counts': 'has_mostly_true_profile',
    'pants_on_fire_counts': 'has_pants_fire_profile'
}

# Each row yields statement-level triples (speaker, subjects, context) and, when the
# speaker is known, speaker-level triples (job title, state, party, credit-history
# profile). Speaker-level triples repeat across a speaker's statements; duplicates are
# removed afterwards.
for _, row in combined_df.iterrows():
    stmt_id_entity = normalize_entity(str(row['id']), "stmt_")

    speaker_entity = normalize_entity(row['speaker'], "spkr_")
    if speaker_entity != "spkr_unknown":
        kg_triplets_list.append((stmt_id_entity, 'has_speaker', speaker_entity))

    # Check for a missing value on the raw cell: str(NaN) is the non-empty string "nan",
    # so converting first would turn a missing subject into a "subj_nan" entity.
    raw_subject = row['subject']
    subject_entity_str = "" if pd.isna(raw_subject) else str(raw_subject)
    if subject_entity_str.strip() != "":
        # The subject field is a comma-separated list; each item becomes its own entity.
        subjects = subject_entity_str.split(',')
        for sub in subjects:
            norm_sub = normalize_entity(sub.strip(), "subj_")
            if norm_sub != "subj_unknown" and norm_sub != "subj_":
                kg_triplets_list.append((stmt_id_entity, 'has_subject', norm_sub))

    context_entity = normalize_entity(row['context'], "ctx_")
    if context_entity != "ctx_unknown":
        kg_triplets_list.append((stmt_id_entity, 'stated_in_context', context_entity))

    if speaker_entity != "spkr_unknown":
        job_entity = normalize_entity(row['job_title'], "job_")
        if job_entity != "job_unknown":
            kg_triplets_list.append((speaker_entity, 'has_job_title', job_entity))

        state_entity = normalize_entity(row['state_info'], "state_")
        if state_entity != "state_unknown":
            kg_triplets_list.append((speaker_entity, 'from_state', state_entity))

        party_entity = normalize_entity(row['party_affiliation'], "party_")
        if party_entity != "party_unknown":
            kg_triplets_list.append((speaker_entity, 'affiliated_with_party', party_entity))

        # UPDATED: Add discretized credit history for speakers
        for col_name, relation_name in credit_history_cols.items():
            if col_name in row and pd.notna(row[col_name]):
                count_category = discretize_count(row[col_name])
                profile_entity = normalize_entity(f"profile_{count_category}", "ch_") # ch for credit history
                kg_triplets_list.append((speaker_entity, relation_name, profile_entity))


# Convert to NumPy array of unique triplets
# sorted() fixes the row order. Iterating a plain set of strings can give a different
# order in every interpreter session (string hashing is randomized per process), which
# would change the order of the training triples between runs despite the fixed seed.
unique_triplets = sorted(set(kg_triplets_list))
if not unique_triplets:
    print("No triplets generated. Check data and normalization. Exiting.")
    # Raise instead of exit(): this stops the cell and "Run All" with a visible error.
    raise RuntimeError("No triplets generated. Check data and normalization.")
kg_triplets_np = np.array(unique_triplets, dtype=str)
print(f"Generated {len(kg_triplets_np)} unique triplets for the Knowledge Graph (including credit history).")

# Create TriplesFactory for PyKEEN
# This maps entities and relations to integer IDs
tf = TriplesFactory.from_labeled_triples(kg_triplets_np)

# --- Prepare Labels (Common for all models) ---
y_train_labels = train_df['label'].values
y_valid_labels = valid_df['label'].values
y_test_labels = test_df['label'].values

# Label id -> class name, used only to title the rows of the classification report.
# The names are read from the dataset's own label feature so the ids cannot drift from the
# names; a wrong order silently mislabels every row of the report.
_label_names = getattr(train_hf.features['label'], 'names', None)
if _label_names:
    label_mapping = dict(enumerate(_label_names))
else:
    # Fallback if the label column carries no names.
    # NOTE(review): order taken from the LIAR dataset card — confirm if this branch is ever hit.
    label_mapping = {0: 'false', 1: 'half-true', 2: 'mostly-true', 3: 'true', 4: 'barely-true', 5: 'pants-fire'}
unique_labels_all = np.unique(np.concatenate((y_train_labels, y_valid_labels, y_test_labels)))
# np.unique returns sorted ids, so class_names lines up with the label order sklearn reports.
class_names = [label_mapping.get(int(i), f'unknown_{i}') for i in sorted(unique_labels_all)]


# --- 3. Loop Through KGE Models, Train, Extract Embeddings, and Run Logistic Regression ---
# Display name -> PyKEEN model class, in the order the models are trained.
kge_models_to_test_pykeen = dict(
    TransE=TransE,
    RotatE=RotatE,
    DistMult=DistMult,
    ComplEx=ComplEx,
)

# Filled by the training loop: model name -> metrics, timings or error details.
results_comparison = dict()

# Hyperparameters shared by every KGE model in this run.
embedding_dim_pykeen = 200
kge_epochs = 100

# Logistic Regression settings shared by all models.
log_reg_params = dict(
    solver='saga',
    multi_class='ovr',
    C=1.0,
    max_iter=1500,
    random_state=42,
    penalty='l2',
    n_jobs=-1,
)

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"PyKEEN will use device: {device}")

# For each KGE model: train it on the full graph, save it, read out one embedding per
# statement entity, fit Logistic Regression on the training statements and score it on
# the test statements. A failure in either stage is recorded and the next model is tried.
for model_name, KGEModelClass_pykeen in kge_models_to_test_pykeen.items():
    print(f"\n--- Processing KGE Model: {model_name} (using PyKEEN) ---")
    start_time_kge = time.time()

    print(f"Training {model_name} with k={embedding_dim_pykeen}, epochs={kge_epochs}...")

    # PyKEEN pipeline parameters
    pipeline_kwargs = dict(
        model=KGEModelClass_pykeen,
        model_kwargs=dict(embedding_dim=embedding_dim_pykeen),
        training_kwargs=dict(num_epochs=kge_epochs, batch_size=1024, use_tqdm_batch=False),
        random_seed=42,
        device=device,
    )
    # Model-specific pipeline adjustments
    if model_name == "RotatE":
        pipeline_kwargs['loss'] = 'MarginRankingLoss'
        pipeline_kwargs['loss_kwargs'] = {'margin': 1.0}
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.00005} # UPDATED: Slightly lower LR for RotatE
    elif model_name == "ComplEx":
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.0001}
        pipeline_kwargs['loss'] = 'NegativeLogLikelihood'
    elif model_name == "TransE":
        pipeline_kwargs['loss'] = 'MarginRankingLoss'
        pipeline_kwargs['loss_kwargs'] = {'margin': 1.0}
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.001}
        # Example of adding model-specific regularizer if desired (PyKEEN pipeline also has a top-level regularizer arg)
        # pipeline_kwargs['model_kwargs']['regularizer'] = 'LpRegularizer'
        # pipeline_kwargs['model_kwargs']['regularizer_kwargs'] = {'weight': 0.001, 'p': 2}
    elif model_name == "DistMult":
        pipeline_kwargs['loss'] = 'NegativeLogLikelihood'
        pipeline_kwargs['optimizer_kwargs'] = {'lr': 0.001}

    try:
        pykeen_result = pipeline(
            training=tf,
            # The training triples are passed again as the test set only to satisfy the
            # pipeline; its link-prediction evaluation is therefore not a held-out score.
            testing=tf,
            **pipeline_kwargs
        )
        trained_kge_model = pykeen_result.model

        model_save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_pykeen_model.pt")
        # Saves the whole model object (not just a state dict).
        torch.save(trained_kge_model, model_save_path)
        print(f"Saved {model_name} model to {model_save_path}")

    except Exception as e:
        print(f"Error training or saving {model_name} with PyKEEN: {e}. Skipping this model.")
        results_comparison[model_name] = {'accuracy': 'KGE Error', 'f1_macro': 'KGE Error', 'report': str(e), 'training_time_kge': 0, 'training_time_lr': 0}
        continue

    # Wall-clock time of the whole pipeline() call plus saving, so it also includes the
    # pipeline's own evaluation pass.
    kge_training_time = time.time() - start_time_kge
    print(f"{model_name} (PyKEEN) training complete. Time: {kge_training_time:.2f} seconds.")

    # Extract Statement Embeddings using PyKEEN
    # The primary embedding layer for entities; indices=None returns all rows.
    entity_embedding_layer = trained_kge_model.entity_representations[0]
    raw_embeddings_tensor = entity_embedding_layer(indices=None).detach().cpu()

    if raw_embeddings_tensor.is_complex():
        print(f"  {model_name} produces complex embeddings. Concatenating real and imaginary parts.")
        real_parts = raw_embeddings_tensor.real.numpy()
        imag_parts = raw_embeddings_tensor.imag.numpy()
        # Logistic Regression needs real features: place real and imaginary parts side by side.
        entity_representations_np = np.concatenate((real_parts, imag_parts), axis=1)
    else:
        entity_representations_np = raw_embeddings_tensor.numpy()
    # Take the feature width from the matrix itself instead of assuming k or 2*k, so the
    # zero-vector fallback below always matches the real embedding width.
    actual_feature_dim_for_lr = entity_representations_np.shape[1]

    # Entity label -> embedding row. entity_id is the TriplesFactory integer id and
    # indexes the rows of the embedding matrix.
    entity_to_embedding_pykeen = {
        entity_label: entity_representations_np[entity_id]
        for entity_id, entity_label in tf.entity_id_to_label.items()
    }

    default_embedding = np.zeros(actual_feature_dim_for_lr)

    def get_statement_embeddings_for_split_pykeen(df, embeddings_dict):
        """Return an (n_statements, n_features) matrix of statement embeddings for df.

        Statement ids are normalized the same way as when the graph was built. A
        statement without an entry in embeddings_dict gets a zero vector and is counted
        in a printed warning. An empty df yields an array with zero rows.
        """
        stmt_entities = [normalize_entity(str(stmt_id_orig), "stmt_") for stmt_id_orig in df['id']]
        n_missing = sum(1 for stmt_entity in stmt_entities if stmt_entity not in embeddings_dict)
        if n_missing:
            print(f"  Warning: {n_missing} of {len(stmt_entities)} statements have no KG embedding; using zero vectors for them.")
        stmt_embeddings = [embeddings_dict.get(stmt_entity, default_embedding) for stmt_entity in stmt_entities]
        # reshape keeps the result 2-D even for an empty split, so the shape[1] lookup and
        # the empty-split guards further down work as intended.
        return np.array(stmt_embeddings).reshape(-1, default_embedding.shape[0])

    X_train_embed = get_statement_embeddings_for_split_pykeen(train_df, entity_to_embedding_pykeen)
    X_test_embed = get_statement_embeddings_for_split_pykeen(test_df, entity_to_embedding_pykeen)

    # Train Logistic Regression
    print(f"Training Logistic Regression using {model_name} (PyKEEN) embeddings (features: {X_train_embed.shape[1]})...")
    start_time_lr = time.time()
    log_reg = LogisticRegression(**log_reg_params)
    try:
        if X_train_embed.shape[0] == 0:
            raise ValueError("No training samples available after embedding extraction.")
        log_reg.fit(X_train_embed, y_train_labels)
    except Exception as e:
        print(f"Error training Logistic Regression for {model_name} (PyKEEN): {e}. Skipping.")
        results_comparison[model_name] = {'accuracy': 'LR Error', 'f1_macro': 'LR Error', 'report': str(e), 'training_time_kge': kge_training_time, 'training_time_lr': 0}
        continue

    lr_training_time = time.time() - start_time_lr
    print(f"Logistic Regression training with {model_name} (PyKEEN) embeddings complete. Time: {lr_training_time:.2f} seconds.")

    # Evaluate Logistic Regression
    if X_test_embed.shape[0] > 0:
        y_pred_test = log_reg.predict(X_test_embed)
        accuracy = accuracy_score(y_test_labels, y_pred_test)
        f1_macro = f1_score(y_test_labels, y_pred_test, average='macro', zero_division=0)
        report = classification_report(y_test_labels, y_pred_test, target_names=class_names, zero_division=0)
    else:
        accuracy = 0.0
        f1_macro = 0.0
        report = "No test samples to evaluate."
        print("Warning: No test samples found after embedding extraction for Logistic Regression.")

    results_comparison[model_name] = {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'report': report,
        'training_time_kge': kge_training_time,
        'training_time_lr': lr_training_time
    }
    print(f"Results for {model_name} (PyKEEN) embeddings: Accuracy={accuracy:.4f}, F1-Macro={f1_macro:.4f}")


# --- 4. Compare Results ---
def _format_metric(value, spec):
    """Format a numeric ``value`` with ``spec``; return anything else unchanged.

    Entries for failed models store placeholder strings (e.g. 'LR Error')
    instead of numbers; those must pass through rather than hit a numeric
    format spec, which would raise ValueError.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return format(value, spec)
    return value


# Detailed per-model section: error text for failed models, full metrics otherwise.
print("\n\n--- Overall Results Comparison (with PyKEEN) ---")
for model_name, metrics in results_comparison.items():
    print(f"\n--- {model_name} ---")
    if isinstance(metrics['accuracy'], str) and "Error" in metrics['accuracy']:
        # For failed models the 'report' field holds the error message.
        print(f"  Error during processing: {metrics['report']}")
    else:
        print(f"  KGE Training Time: {metrics['training_time_kge']:.2f} seconds")
        print(f"  LR Training Time: {metrics['training_time_lr']:.2f} seconds")
        print(f"  Test Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Test F1-Score (Macro): {metrics['f1_macro']:.4f}")
        print("  Classification Report on Test Set:\n", metrics['report'])

# Compact one-row-per-model table; values are pre-formatted as strings so the
# printed columns have a fixed number of decimals.
summary_data = []
for model_name, metrics in results_comparison.items():
    acc = metrics.get('accuracy', 'N/A')
    f1 = metrics.get('f1_macro', 'N/A')
    summary_data.append({
        'KGE Model': model_name,
        'Accuracy': _format_metric(acc, ".4f"),
        'F1 (Macro)': _format_metric(f1, ".4f"),
        'KGE Time (s)': _format_metric(metrics.get('training_time_kge', 0), ".2f"),
        'LR Time (s)': _format_metric(metrics.get('training_time_lr', 0), ".2f")
    })
summary_df = pd.DataFrame(summary_data)
print("\n--- Summary Table (with PyKEEN) ---")
try:
    print(summary_df.to_string(index=False))
except (AttributeError, TypeError):
    # Fallback for a pandas that lacks to_string or its `index` argument; an
    # unsupported keyword argument surfaces as TypeError, not AttributeError.
    print(summary_df)



Step 1: Loading LIAR Dataset from Hugging Face...


liar.py:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

Loaded 10269 training, 1284 validation, and 1283 test statements.

Step 2: Constructing Knowledge Graph triplets from LIAR dataset...
Generated 77666 unique triplets for the Knowledge Graph (including credit history).


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


PyKEEN will use device: cuda

--- Processing KGE Model: TransE (using PyKEEN) ---
Training TransE with k=200, epochs=100...


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/77.7k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 38.71s seconds


Saved TransE model to saved_pykeen_models/TransE_pykeen_model.pt
TransE (PyKEEN) training complete. Time: 100.83 seconds.
Training Logistic Regression using TransE (PyKEEN) embeddings (features: 200)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Logistic Regression training with TransE (PyKEEN) embeddings complete. Time: 4.82 seconds.
Results for TransE (PyKEEN) embeddings: Accuracy=0.2182, F1-Macro=0.1928

--- Processing KGE Model: RotatE (using PyKEEN) ---
Training RotatE with k=200, epochs=100...


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/77.7k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 73.56s seconds


Saved RotatE model to saved_pykeen_models/RotatE_pykeen_model.pt
RotatE (PyKEEN) training complete. Time: 156.34 seconds.
  RotatE produces complex embeddings. Concatenating real and imaginary parts.
Training Logistic Regression using RotatE (PyKEEN) embeddings (features: 400)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)


Logistic Regression training with RotatE (PyKEEN) embeddings complete. Time: 9.03 seconds.
Results for RotatE (PyKEEN) embeddings: Accuracy=0.1995, F1-Macro=0.1720

--- Processing KGE Model: DistMult (using PyKEEN) ---
Training DistMult with k=200, epochs=100...


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/77.7k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 38.52s seconds


Saved DistMult model to saved_pykeen_models/DistMult_pykeen_model.pt
DistMult (PyKEEN) training complete. Time: 106.63 seconds.
Training Logistic Regression using DistMult (PyKEEN) embeddings (features: 200)...


INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)
INFO:pykeen.nn.representation:Inferred unique=False for Embedding(
  (regularizer): LpRegularizer()
)


Logistic Regression training with DistMult (PyKEEN) embeddings complete. Time: 4.14 seconds.
Results for DistMult (PyKEEN) embeddings: Accuracy=0.2104, F1-Macro=0.2005

--- Processing KGE Model: ComplEx (using PyKEEN) ---
Training ComplEx with k=200, epochs=100...


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/77.7k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 41.41s seconds


Saved ComplEx model to saved_pykeen_models/ComplEx_pykeen_model.pt
ComplEx (PyKEEN) training complete. Time: 146.75 seconds.
  ComplEx produces complex embeddings. Concatenating real and imaginary parts.
Training Logistic Regression using ComplEx (PyKEEN) embeddings (features: 400)...
Logistic Regression training with ComplEx (PyKEEN) embeddings complete. Time: 10.29 seconds.
Results for ComplEx (PyKEEN) embeddings: Accuracy=0.2065, F1-Macro=0.1874


--- Overall Results Comparison (with PyKEEN) ---

--- TransE ---
  KGE Training Time: 100.83 seconds
  LR Training Time: 4.82 seconds
  Test Accuracy: 0.2182
  Test F1-Score (Macro): 0.1928
  Classification Report on Test Set:
               precision    recall  f1-score   support

  pants-fire       0.21      0.27      0.24       250
       false       0.22      0.33      0.26       267
 barely-true       0.24      0.22      0.23       249
   half-true       0.20      0.17      0.18       211
 mostly-true       0.21      0.14      0.17   