In [None]:
from IPython.display import clear_output
# Reinstall NumPy from scratch, pinned to 1.26.4 (`!` lines are shell commands run by IPython).
!pip uninstall -y numpy
!pip cache purge
!pip install numpy==1.26.4
clear_output()  # wipe the pip log from the cell output
print("Numpy install successful!")

import os
import IPython
# Terminate the kernel process immediately. Presumably this forces a runtime restart so
# the freshly installed NumPy is the one that gets imported -- TODO confirm.
# Nothing placed after this line in the cell will run.
os._exit(0)

In [None]:
from IPython.display import clear_output

# Install the deep-learning stack (`!` lines are shell commands run by IPython).
# torch/torchvision/torchaudio/torchmetrics/transformers are version-pinned;
# dgl, torcheval, scikit-learn and deep-translator are installed unpinned.
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0
!pip install dgl -f https://data.dgl.ai/wheels/torch-2.2/repo.html
!pip install torchmetrics==1.2.1 transformers==4.38.0
!pip install torcheval
!pip install scikit-learn
!pip install deep-translator
clear_output()  # wipe the pip log from the cell output

import os
import dgl
import torch
import torchmetrics
import transformers
import torcheval

# Expose the torch version and select the PyTorch backend for DGL via environment variables.
os.environ['TORCH'] = torch.__version__
os.environ['DGLBACKEND'] = "pytorch"
device = torch.device("cpu")

# Probe that DGL and its graphbolt sub-package import cleanly.
# NOTE(review): `import dgl` above would already have raised if DGL were missing,
# so this guard effectively only checks `dgl.graphbolt`.
try:
    import dgl
    import dgl.graphbolt as gb
    installed = True
except ImportError as error:
    installed = False
    print(error)

# Report the versions of the installed libraries.
print("DGL installed!" if installed else "DGL not found!")
print("PyTorch Version: ", torch.__version__)
print("TorchMetrics Version: ", torchmetrics.__version__)
print("Transformers Version: ", transformers.__version__)
print("DGL Version: ", dgl.__version__)
print("TorchEval Is: ", torcheval.__version__)

### 1- Create PersianILP

**Extract Zip File**

In [None]:
import pandas as pd
import zipfile
import os

# Unpack the SPARQL export archive into a working folder.
zip_path = "/content/SPARQL.zip"
extract_path = "/content/extracted_excels"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List Excel files found directly in the extraction folder (sub-folders are not searched).
# NOTE(review): neither `excel_files` nor this empty `merged_df` is used again in this cell.
excel_files = [os.path.join(extract_path, f) for f in os.listdir(extract_path) if f.endswith('.xlsx') or f.endswith('.xls')]
merged_df = pd.DataFrame()

**Save Incomplete Triple**

In [None]:
import os
import pandas as pd

# Initial settings
extract_path = "/content/extracted_excels"
output_path = "/content/FarsiBase"
os.makedirs(output_path, exist_ok=True)  # create the output folder if it does not exist

# Find all CSV files directly inside the extraction folder (case-insensitive extension match)
csv_files = [os.path.join(extract_path, f) for f in os.listdir(extract_path) if f.lower().endswith('.csv')]
print(f"🔎 تعداد فایل‌های CSV یافت‌شده: {len(csv_files)}")

# Read every file and merge them into one dataframe, then save the merged copy.
# NOTE(review): pd.concat raises if `csv_files` is empty.
merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
merged_df.to_csv('/content/mergeData', index=False, encoding='utf-8-sig')
print("✅ تمام فایل‌ها با موفقیت ادغام شدند.\n")

# Full report on the merged dataframe: row count, column count, nulls per column
print("📊 گزارش کامل دیتافریم ادغام شده:")
print("=================================")
print(f"➡ تعداد کل ردیف‌ها: {len(merged_df):,}")
print(f"➡ تعداد کل ستون‌ها: {len(merged_df.columns)}")
print("\n🔹 تعداد مقادیر null در هر ستون:")
print(merged_df.isnull().sum())

# تابع ساده‌سازی URI
def simplify_uri(uri):
    """Reduce a URI string to its final path segment / fragment.

    The value is stripped of surrounding whitespace, everything up to the last
    ``/`` is discarded, and then everything up to the last ``#`` is discarded.
    Non-string values (e.g. NaN, None, numbers) are returned unchanged.
    """
    if not isinstance(uri, str):
        return uri
    last_segment = uri.strip().rsplit("/", 1)[-1]
    return last_segment.rsplit("#", 1)[-1]

# Apply URI simplification to the three label columns (on a copy of the merged frame)
cols_to_simplify = ["subjectLabel", "predicateLabel", "objectLabel"]
simplified_df = merged_df.copy()
for col in cols_to_simplify:
    simplified_df[col] = simplified_df[col].apply(simplify_uri)

# Remove duplicate rows and rows in which every column is empty
simplified_df.drop_duplicates(inplace=True)
simplified_df = simplified_df.dropna(how='all')
print(f"\n♻ تعداد ردیف‌ها پس از حذف موارد تکراری: {len(simplified_df):,}")

# Count how many of the three label fields are non-null in each row
simplified_df["filled_count"] = simplified_df[cols_to_simplify].notna().sum(axis=1)

# Save separate files according to how complete each row is.
# Rows with all three labels present:
complete_df = simplified_df[simplified_df["filled_count"] == 3].drop(columns=["filled_count"])
complete_path = os.path.join(output_path, "complete_triples.csv")
complete_df.to_csv(complete_path, index=False, encoding='utf-8-sig')
print(f"\n💾 فایل triples کامل ({len(complete_df):,} ردیف) در {complete_path} ذخیره شد.")

# Rows with exactly two labels present:
two_filled_df = simplified_df[simplified_df["filled_count"] == 2].drop(columns=["filled_count"])
two_path = os.path.join(output_path, "triples_with_two_values.csv")
two_filled_df.to_csv(two_path, index=False, encoding='utf-8-sig')
print(f"💾 فایل triples با دو مقدار ({len(two_filled_df):,} ردیف) در {two_path} ذخیره شد.")

# Rows with exactly one label present:
one_filled_df = simplified_df[simplified_df["filled_count"] == 1].drop(columns=["filled_count"])
one_path = os.path.join(output_path, "triples_with_one_value.csv")
one_filled_df.to_csv(one_path, index=False, encoding='utf-8-sig')
print(f"💾 فایل triples با یک مقدار ({len(one_filled_df):,} ردیف) در {one_path} ذخیره شد.")

# Final report: non-null count per column (includes the helper `filled_count` column)
print("\n🎉 پردازش با موفقیت به پایان رسید!")
print(f"\n📝 گزارش نهایی:\n{simplified_df.count()}")

**FarsiBase Data Cleaning**

In [None]:
import pandas as pd

def convert_persian_to_english(number):
    """Return ``str(number)`` with Persian digits (۰-۹) replaced by ASCII digits.

    Any input is first converted with ``str``, so non-string values come back as
    their string form (for example a float NaN becomes ``'nan'``).
    """
    digit_table = dict(zip(map(ord, '۰۱۲۳۴۵۶۷۸۹'), '0123456789'))
    text = str(number)
    return text.translate(digit_table)

# Load the complete triples and normalise Persian digits to ASCII digits in every label.
# Note: convert_persian_to_english stringifies its input, so all three columns hold str.
df = pd.read_csv("/content/FarsiBase/complete_triples.csv")
for column in ['subjectLabel', 'predicateLabel', 'objectLabel']:
    df[column] = df[column].apply(convert_persian_to_english)

# Clean Relation
# Ordered (regex, replacement) rules that rewrite English predicate prefixes to Persian.
# They are applied top to bottom, exactly one pass each.
PREDICATE_REPLACEMENTS = [
    (r'^dcterms#subject', 'موضوع/محتوا'),
    (r'^subject', 'موضوع'),
    (r'^birth place', 'محل تولد'),
    (r'^birth_place', 'محل تولد'),
    (r'^birthPlace', 'محل تولد'),
    (r'^instanceOf', 'نمونه‌ای از'),
    (r'^deathPlace', 'محل مرگ'),
    (r'^death place', 'محل مرگ'),
    (r'^field', 'موضوع'),
    (r'^genre', 'ژانر'),
    (r'^nationality', 'ملیت'),
    (r'^occupation', 'شغل'),
    (r'^picture', 'تصویر'),
    (r'^ActiveYears', 'سال‌های فعالیت'),
    (r'^activeYears', 'سال‌های فعالیت'),
    (r'^timezone1 dst', 'ناحیه زمانی ۱'),
    (r'^confed_cup', 'جام کنفدراسیون'),
    # The parentheses must be escaped: unescaped they form a regex group, so the
    # pattern would look for "distance to London μ" and never match the real label.
    (r'^distance to London \(μ\)', 'فاصله تا لندن (میانگین)'),
    (r'^fs_date', 'تاریخ سیستم فایل'),
    (r'^państwo', 'کشور'),
    (r'^sp_date', 'تاریخ طرح'),
]
for pattern, replacement in PREDICATE_REPLACEMENTS:
    df['predicateLabel'] = df['predicateLabel'].str.replace(pattern, replacement, regex=True)

# Drop rows whose predicate is the raw RDF type marker or whose object mentions "relation".
df = df.drop(df[df['predicateLabel'] == '22-rdf-syntax-ns#instanceOf'].index)
df = df[~df['objectLabel'].str.contains('relation', case=False, na=False)]

# Drop rows whose object is an image file name or one of two known junk values.
UNWANTED_OBJECT_SUFFIXES = [
    '.JPG',
    '.jpg',
    '.png',
    '.svg',
    'Pages_using_infobox3cols_with_multidatastyle',
    ':hy:Սյուզան_Գարագաշ',
]
for suffix in UNWANTED_OBJECT_SUFFIXES:
    df = df[~df['objectLabel'].str.endswith(suffix)]

# Ordered (regex, replacement) rules that rewrite English object prefixes to Persian.
OBJECT_REPLACEMENTS = [
    (r'^Actor', 'بازیگر'),
    (r'^Ali_Daei', 'علی دایی'),
    (r'^Person', 'شخص'),
    (r'^SoccerPlayer', 'بازیکن سوکر'),
    (r'^Writer', 'نویسنده'),
]
for pattern, replacement in OBJECT_REPLACEMENTS:
    df['objectLabel'] = df['objectLabel'].str.replace(pattern, replacement, regex=True)

# Remove duplicate row
df.drop_duplicates(inplace=True)
# Overwrites the input file with the cleaned triples.
df.to_csv("/content/FarsiBase/complete_triples.csv", index=False, encoding='utf-8-sig')

**Data Shuffling And Aggregation(FarsiBase + Deepseek)**

In [None]:
import pandas as pd
import numpy as np

# Aggregate FarsiBase Data And DeepSeek Data
# The two frames are stacked row-wise.
# NOTE(review): assumes the Excel sheet uses the same column names as the FarsiBase CSV;
# any mismatched column would be filled with NaN by concat -- TODO confirm.
DeepSeek_df = pd.read_excel('/content/DeepSeek_Triple.xlsx')
input_file = '/content/FarsiBase/complete_triples.csv'
FarsiBase_df = pd.read_csv(input_file)
df = pd.concat([DeepSeek_df, FarsiBase_df], axis=0)

# Shuffled Data
# frac=1 with a fixed random_state yields a reproducible permutation of all rows.
shuffled_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
output_file = '/content/FarsiBase/shuffled_triple.csv'
shuffled_df.to_csv(output_file ,index=False , encoding='utf-8-sig')
print(f"فایل اکسل به صورت تصادفی به هم ریخته و در '{output_file}' ذخیره شد.")

**PersianILP Normalizing**

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator

# Translate Google
def translate_relation(relation, target_lang="fa"):
    """Translate ``relation`` into ``target_lang`` using GoogleTranslator.

    The source language is auto-detected. This is best-effort: if anything goes
    wrong during translation, the error is printed and the original ``relation``
    is returned unchanged.
    """
    try:
        return GoogleTranslator(source='auto', target=target_lang).translate(relation)
    except Exception as err:
        print(f"خطا در ترجمه '{relation}': {err}")
        return relation

# Translate Data
def normalize_excel(input_path, output_path, use_translation=False):
    """Normalise the ``predicateLabel`` column of a triple CSV and save the result.

    Parameters
    ----------
    input_path : str
        CSV file to read; it must contain a ``predicateLabel`` column.
    output_path : str
        Destination CSV (written as UTF-8 with BOM, without the index).
    use_translation : bool, default False
        When True, every non-null relation made only of ASCII characters is passed
        through ``translate_relation``; other relations are kept as they are.

    Non-null relations are converted to ``str``; null relations stay null.
    After normalisation, duplicate rows are dropped and the rows are shuffled
    with a fixed seed.
    """
    df = pd.read_csv(input_path)

    # A relation label typically repeats on many rows. Translating each distinct
    # label once avoids issuing one translation request per row.
    translation_cache = {}

    def _normalize(relation):
        """Return the normalised form of a single relation value."""
        if pd.isna(relation):
            return relation
        relation = str(relation)
        if not (use_translation and relation.isascii()):
            return relation
        if relation not in translation_cache:
            translation_cache[relation] = translate_relation(relation)
        return translation_cache[relation]

    df['predicateLabel'] = [_normalize(relation) for relation in df['predicateLabel']]

    # Delete Duplicate Row And Shuffling Data
    df.drop_duplicates(inplace=True)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"داده‌های نرمال‌سازی شده در '{output_path}' ذخیره شد.")

# Normalise the shuffled triples (both paths are CSV files despite the variable names),
# translating ASCII-only relation labels.
input_excel = "/content/FarsiBase/shuffled_triple.csv"
output_excel = "/content/FarsiBase/triple.csv"
normalize_excel(input_path=input_excel,
                output_path=output_excel,
                use_translation=True)

In [None]:
import pandas as pd
import networkx as nx
from collections import Counter

def analyze_dataset_for_link_prediction(file_path):
    """Print a structural report on a triple file and a link-prediction suitability verdict.

    Parameters
    ----------
    file_path : str
        CSV file whose first three columns are subject, predicate, object. A leading
        ``subjectLabel,predicateLabel,objectLabel`` header row is detected and skipped;
        header-less files are read as before.

    Returns
    -------
    None
        Everything is reported via ``print``. The function returns early (after
        printing a message) when the file cannot be read or contains no triples.

    The report covers entity/relation counts, the node-degree distribution,
    density/sparsity, weak connectivity and a 0-5 suitability score.
    """
    header_labels = ['subjectLabel', 'predicateLabel', 'objectLabel']

    # 1. Read the data
    try:
        # utf-8-sig also reads plain UTF-8; it strips the BOM that the files written
        # elsewhere in this notebook start with.
        df = pd.read_csv(file_path, names=['subject', 'predicate', 'object'],
                         encoding='utf-8-sig')
        # Because explicit `names` are passed, a header line in the file is read as a
        # data row. Drop it so it is not counted as a triple, entity or relation.
        if len(df) > 0 and df.iloc[0].astype(str).tolist() == header_labels:
            df = df.iloc[1:].reset_index(drop=True)
        print(f"✅ دیتاست با موفقیت خوانده شد. تعداد سه‌تایی‌ها: {len(df):,}")
    except Exception as e:
        print(f"❌ خطا در خواندن فایل: {e}")
        return

    # Without triples the graph has no nodes and every ratio below divides by zero.
    if df.empty:
        print("❌ دیتاست خالی است؛ تحلیلی انجام نشد.")
        return

    # 2. Basic statistics
    num_entities = len(set(df['subject']).union(set(df['object'])))
    num_relations = len(set(df['predicate']))
    print(f"\n📊 آماره‌های پایه:")
    print(f"تعداد موجودیت‌های منحصر به فرد: {num_entities:,}")
    print(f"تعداد روابط منحصر به فرد: {num_relations:,}")

    # 3. Build the graph: directed, allowing several edges between the same nodes,
    #    with the predicate used as the edge key.
    G = nx.MultiDiGraph()
    for subj, pred, obj in zip(df['subject'], df['predicate'], df['object']):
        G.add_edge(subj, obj, key=pred)

    # 4. Node-degree analysis
    num_nodes = G.number_of_nodes()
    degrees = dict(G.degree())
    degree_counts = Counter(degrees.values())

    print("\n📈 توزیع درجه گره‌ها:")
    print(f"• گره‌های با درجه ۱: {degree_counts.get(1, 0):,} ({degree_counts.get(1, 0)/num_nodes:.1%})")
    print(f"• گره‌های با درجه ۲: {degree_counts.get(2, 0):,} ({degree_counts.get(2, 0)/num_nodes:.1%})")
    print(f"• گره‌های با درجه ۳: {degree_counts.get(3, 0):,} ({degree_counts.get(3, 0)/num_nodes:.1%})")

    # 5. Graph-level metrics
    density = nx.density(G)
    sparsity = 1 - density
    avg_degree = sum(degrees.values()) / num_nodes
    print("\n🔍 معیارهای ساختاری گراف:")
    print(f"تعداد گره‌ها: {num_nodes:,}")
    print(f"تعداد یال‌ها: {G.number_of_edges():,}")
    print(f"چگالی گراف: {density:.6f}")
    print(f"اسپارس بودن: {sparsity:.4f}")
    print(f"میانگین درجه گره‌ها: {avg_degree:.2f}")

    # 6. Connectivity check
    if nx.is_weakly_connected(G):
        print("\n🔄 گراف به صورت ضعیف متصل است")
    else:
        components = nx.number_weakly_connected_components(G)
        print(f"\n🔗 گراف دارای {components} جزء ناهمبند است")

    # 7. Final assessment for link prediction
    print("\n🧪 ارزیابی مناسب بودن برای پیش‌بینی پیوند:")

    suitability_score = 0

    # Criterion 1: relation diversity (>50 -> 2 points, >10 -> 1 point)
    if num_relations > 50:
        print(f"✓ تنوع روابط عالی ({num_relations} نوع رابطه)")
        suitability_score += 2
    elif num_relations > 10:
        print(f"✓ تنوع روابط قابل قبول ({num_relations} نوع رابطه)")
        suitability_score += 1
    else:
        print(f"✗ تنوع روابط ناکافی ({num_relations} نوع رابطه)")

    # Criterion 2: sparsity (>0.99 -> 2 points, >0.95 -> 1 point)
    if sparsity > 0.99:
        print("✓ اسپارس بودن ایده‌آل (بسیار مناسب برای پیش‌بینی پیوند)")
        suitability_score += 2
    elif sparsity > 0.95:
        print("✓ اسپارس بودن قابل قبول")
        suitability_score += 1
    else:
        print("✗ اسپارس بودن ناکافی")

    # Criterion 3: degree distribution (fewer than 40% degree-1 nodes -> 1 point)
    if degree_counts.get(1, 0) < num_nodes * 0.4:
        print("✓ توزیع درجه متعادل")
        suitability_score += 1
    else:
        print(f"✗ توزیع درجه نامتعادل ({degree_counts.get(1, 0)/num_nodes:.1%} گره‌ها درجه ۱ دارند)")

    # Final verdict
    print("\n🎯 نتیجه‌گیری نهایی:")
    if suitability_score >= 4:
        print("✅ این دیتاست برای پیش‌بینی پیوند بسیار مناسب است")
    elif suitability_score >= 2:
        print("⚠️ این دیتاست برای پیش‌بینی پیوند نیاز به بهبودهایی دارد")
    else:
        print("❌ این دیتاست برای پیش‌بینی پیوند مناسب نیست")

# Example usage: report on the normalised triple file
analyze_dataset_for_link_prediction('/content/FarsiBase/triple.csv')

**Extracting three variants of the main dataset**

In [None]:
import numpy as np
import pandas as pd
import os

# Slice the triple file, in row order, into three consecutive variants:
# V1 = first 25% of rows, V2 = next 35% (25%-60%), V3 = remaining 40%.
os.makedirs('/content/PersianILP', exist_ok=True)
df = pd.read_csv('/content/FarsiBase/triple.csv')
n = len(df)
idx1 = int(0.25 * n)
idx2 = int(0.60 * n)

# NOTE: to_csv returns None when given a path, so part1/part2/part3 are all None;
# the useful effect of these lines is the files they write.
part1 = df.iloc[:idx1].to_csv('/content/PersianILP/PersianILP_V1.csv', index=False, encoding='utf-8-sig')
part2 = df.iloc[idx1:idx2].to_csv('/content/PersianILP/PersianILP_V2.csv', index=False, encoding='utf-8-sig')
part3 = df.iloc[idx2:].to_csv('/content/PersianILP/PersianILP_V3.csv', index=False, encoding='utf-8-sig')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# تفکیک مجموعه‌ی آموزش و تست
def split_train_test_for_file(file_path, test_size=0.2):
    """Split a triple CSV into entity-disjoint train and test sets.

    The entities (non-null values of the first and third columns) are partitioned
    into a train group and a test group. A triple goes to the train set only if
    both of its endpoints are train entities, and to the test set only if both
    are test entities; triples that straddle the two groups are dropped.

    Parameters
    ----------
    file_path : str
        CSV file with a header row; column 0 is the subject, column 2 the object.
    test_size : float, default 0.2
        Share of entities assigned to the test group (passed to ``train_test_split``).

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(train_df, test_df)``, both row subsets of the input frame.
    """
    df = pd.read_csv(file_path)
    heads = df.iloc[:, 0]
    tails = df.iloc[:, 2]
    all_entities = set(heads.dropna().unique()) | set(tails.dropna().unique())

    # Set iteration order for strings varies between interpreter sessions (hash
    # randomisation), which would make the split differ from run to run even with a
    # fixed random_state. Sorting gives train_test_split a stable input order;
    # key=str keeps the sort well-defined if the two columns hold different types.
    ordered_entities = sorted(all_entities, key=str)

    train_entities, test_entities = train_test_split(
        ordered_entities,
        test_size=test_size,
        random_state=42)

    train_entities = set(train_entities)
    test_entities = set(test_entities)

    train_mask = heads.isin(train_entities) & tails.isin(train_entities)
    test_mask = heads.isin(test_entities) & tails.isin(test_entities)
    return df[train_mask], df[test_mask]

# Build the inductive Persian and English datasets
# NOTE(review): this cell uses `os` and `zipfile` but imports neither itself; it relies
# on imports executed by earlier cells.
file_paths = ['/content/PersianILP/PersianILP_V1.csv',
              '/content/PersianILP/PersianILP_V2.csv',
              '/content/PersianILP/PersianILP_V3.csv']

output_dir = '/content/PersianILP-trainTest'
os.makedirs(output_dir, exist_ok=True)
for file_path in file_paths:
    # One sub-folder per variant, named after the file without its extension.
    base_name = os.path.basename(file_path)
    file_name = os.path.splitext(base_name)[0]

    version_dir = os.path.join(output_dir, file_name)
    os.makedirs(version_dir, exist_ok=True)
    # 30% of the entities go to the test side (overrides the function default of 0.2).
    train_data, test_data = split_train_test_for_file(file_path, test_size=0.3)

    train_output = os.path.join(version_dir, f'train.csv')
    test_output = os.path.join(version_dir, f'test.csv')

    train_data.to_csv(train_output, index=False, encoding='utf-8-sig')
    test_data.to_csv(test_output, index=False, encoding='utf-8-sig')

# Create the zip file
output_dir = '/content/PersianILP-trainTest'
zip_path = os.path.join(output_dir, 'PersianILP-data.zip')
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(output_dir):
        for file in files:
            # The archive itself lives inside output_dir, so .zip files are skipped.
            if not file.endswith('.zip'):
                file_path = os.path.join(root, file)
                # Store paths relative to output_dir inside the archive.
                arcname = os.path.relpath(file_path, output_dir)
                zipf.write(file_path, arcname)

print(f"فایل zip در مسیر زیر ایجاد شد: {zip_path}")

**Import Dataset**

In [None]:
import os
from google.colab import drive
# Mount Google Drive so the dataset archives stored there become readable.
drive.mount('/content/drive')

ILP_Date_Zip_File = '/content/drive/MyDrive/DataSet/Data_InductiveLinkPrediction.zip'
TLP_Data_Zip_File = '/content/drive/MyDrive/DataSet/Data_TransductiveLinkPrediciton.zip'

# Extract both archives into /content (`!` lines are shell commands; {...} is
# IPython's Python-value interpolation).
!unzip -q {ILP_Date_Zip_File} -d {'/content'}
!unzip -q {TLP_Data_Zip_File} -d {'/content'}

# Sorted names of the folders directly under /content.
# NOTE(review): `datasets` is not used again in this cell.
datasets = sorted([folder for folder in os.listdir('/content') if os.path.isdir(os.path.join('/content', folder))])
def create_dataset_dict(base_dir:str='/content'):
    """Map every dataset folder under ``base_dir`` to its split file paths.

    Each sub-directory of ``base_dir`` becomes one key. Its value is a dict with the
    keys ``train``, ``valid`` and ``test`` pointing at ``train.txt``, ``valid.txt``
    and ``test.txt`` inside that folder. The paths are built unconditionally; the
    files are not checked for existence. Plain files in ``base_dir`` are ignored.
    """
    split_files = {"train": "train.txt", "valid": "valid.txt", "test": "test.txt"}
    found = {}
    for entry in os.listdir(base_dir):
        entry_path = os.path.join(base_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        found[entry] = {
            split: os.path.join(entry_path, file_name)
            for split, file_name in split_files.items()
        }
    return found

# Save the path dictionary for the inductive datasets, ordered by dataset name
ILP_dataset_paths = create_dataset_dict('/content/Data_InductiveLinkPrediction')
ILP_dataset_paths = dict(sorted(ILP_dataset_paths.items()))

**Analysis of PersianILP Against English Benchmark Datasets**

In [None]:
import os
import pandas as pd
import networkx as nx
from collections import Counter
from tabulate import tabulate

def load_data(file_path):
    """Load a triple file into a DataFrame with columns ``head``, ``relation``, ``tail``.

    ``.csv`` files are parsed as comma-separated, every other file as tab-separated.
    Files are read without a header; if the first row turns out to be the
    ``subjectLabel,predicateLabel,objectLabel`` header used by the PersianILP CSVs,
    that row is dropped so it is not treated as a triple. Header-less benchmark
    files are returned exactly as read.
    """
    sep = "," if file_path.endswith('.csv') else "\t"
    # utf-8-sig also reads plain UTF-8; it strips the BOM written by the
    # `encoding='utf-8-sig'` exports elsewhere in this notebook.
    df = pd.read_csv(file_path, sep=sep, header=None,
                     names=["head", "relation", "tail"], encoding="utf-8-sig")
    header_row = ["subjectLabel", "predicateLabel", "objectLabel"]
    if len(df) > 0 and df.iloc[0].astype(str).tolist() == header_row:
        df = df.iloc[1:].reset_index(drop=True)
    return df

def analyze_graph_metrics(file_path):
    """Compute degree and density statistics for the graph stored in ``file_path``.

    The triples are loaded with ``load_data`` and inserted into a directed
    multigraph as ``(head, tail, relation)`` edge tuples.

    Returns a dict with the number of nodes of degree 1, 2 and 3 (``Deg_1`` ..
    ``Deg_3``), the average degree rounded to 2 decimals (0 for a graph without
    nodes), and density / sparsity (``1 - density``) rounded to 6 decimals.
    """
    triples = load_data(file_path)
    graph = nx.MultiDiGraph()
    graph.add_edges_from(zip(triples["head"], triples["tail"], triples["relation"]))

    node_degrees = dict(graph.degree())
    degree_hist = Counter(node_degrees.values())

    node_count = graph.number_of_nodes()
    if node_count:
        avg_deg = sum(node_degrees.values()) / node_count
    else:
        avg_deg = 0

    density = nx.density(graph)
    metrics = {
        "Deg_1": degree_hist.get(1, 0),
        "Deg_2": degree_hist.get(2, 0),
        "Deg_3": degree_hist.get(3, 0),
        "Avg_Degree": round(avg_deg, 2),
        "Density": round(density, 6),
        "Sparsity": round(1 - density, 6),
    }
    return metrics

def process_file(file_path, label):
    """Return the graph metrics of one dataset file, tagged with ``label``.

    Only existing regular files ending in ``.csv`` or ``.txt`` are analysed; for
    anything else ``None`` is returned. The metrics dict produced by
    ``analyze_graph_metrics`` gets an extra ``Dataset`` key holding ``label``.
    """
    is_supported = os.path.isfile(file_path) and file_path.endswith(('.csv', '.txt'))
    if not is_supported:
        return None

    metrics = analyze_graph_metrics(file_path)
    if not metrics:
        return None

    metrics['Dataset'] = label
    return metrics

def analyze_all_datasets(all_dirs):
    """Collect graph metrics for every ``.csv``/``.txt`` file under the given folders.

    Each directory in ``all_dirs`` is walked recursively. Every file is handed to
    ``process_file`` with the label ``<containing folder name>_<file name without
    extension>``; files that ``process_file`` rejects are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per analysed file with the columns ``Dataset``, ``Deg_1``, ``Deg_2``,
        ``Deg_3``, ``Avg_Degree``, ``Density`` and ``Sparsity``. When nothing was
        analysed, an empty frame with these columns is returned.
    """
    columns = ["Dataset", "Deg_1", "Deg_2", "Deg_3", "Avg_Degree", "Density", "Sparsity"]
    results = []
    for base_dir in all_dirs:
        for root, _, files in os.walk(base_dir):
            dataset_name = os.path.basename(root)
            for file in files:
                path = os.path.join(root, file)
                label = f"{dataset_name}_{os.path.splitext(file)[0]}"
                result = process_file(path, label)
                if result:
                    results.append(result)

    # Passing `columns` selects and orders the fields, and unlike indexing the frame
    # afterwards it also works when `results` is empty.
    return pd.DataFrame(results, columns=columns)

# Compare the structure of the Persian and English datasets:
# analyse every file under both folders and print one table sorted by dataset label.
all_dirs = [
    "/content/Data_InductiveLinkPrediction",
    "/content/PersianILP-trainTest"]
df_result = analyze_all_datasets(all_dirs).sort_values("Dataset")
print(tabulate(df_result, headers="keys", tablefmt="grid", showindex=False))

### 2- Inductive Link Prediction(CoraGraphDataset)

In [None]:
import itertools
import os

os.environ["DGLBACKEND"] = "pytorch"

import dgl
import dgl.data
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Load the Cora dataset and take its first graph.
dataset = dgl.data.CoraGraphDataset()
g = dataset[0]

In [None]:
# Split edge set for training and testing
u, v = g.edges()

# Randomly permute the edge ids; the first 10% become test edges, the rest train edges.
# NOTE(review): no random seed is set in this cell, so the split differs between runs.
eids = np.arange(g.num_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.num_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing
# A dense node-by-node matrix is built here: entries are non-zero where there is
# neither an edge nor a self-pair.
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())))
adj_neg = 1 - adj.todense() - np.eye(g.num_nodes())
neg_u, neg_v = np.where(adj_neg != 0)

# Draw as many negative pairs as there are edges (np.random.choice samples with
# replacement by default), then split them with the same test_size as the positives.
neg_eids = np.random.choice(len(neg_u), g.num_edges())
test_neg_u, test_neg_v = (
    neg_u[neg_eids[:test_size]],
    neg_v[neg_eids[:test_size]],)

train_neg_u, train_neg_v = (
    neg_u[neg_eids[test_size:]],
    neg_v[neg_eids[test_size:]],)

In [None]:
# Training graph: the full graph with the held-out test edges removed.
train_g = dgl.remove_edges(g, eids[:test_size])

In [None]:
from dgl.nn import SAGEConv


# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder using mean aggregation.

    Parameters
    ----------
    in_feats : int
        Size of the input node features.
    h_feats : int
        Size of the hidden and output node representations.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, h_feats, "mean")

    def forward(self, g, in_feat):
        """Return node representations for graph ``g`` given input features ``in_feat``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [None]:
# Wrap the positive and negative node pairs as graphs over the full node set, so that
# edge scores can be computed with per-edge operations on each graph.
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.num_nodes())
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.num_nodes())

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.num_nodes())
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.num_nodes())

In [None]:
import dgl.function as fn


class DotPredictor(nn.Module):
    """Edge scorer: dot product of the source and destination node representations."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node representations ``h``."""
        # local_scope keeps the temporary node/edge features from leaking into `g`.
        with g.local_scope():
            g.ndata["h"] = h
            # Store the dot product of the two endpoint features as edge feature 'score'.
            g.apply_edges(fn.u_dot_v("h", "h", "score"))
            edge_scores = g.edata["score"]
            # u_dot_v yields a 1-element vector per edge; keep only that element.
            return edge_scores[:, 0]

In [None]:
class MLPPredictor(nn.Module):
    """Edge scorer: a two-layer MLP over the concatenated endpoint representations.

    Parameters
    ----------
    h_feats : int
        Size of each node representation; the MLP input is twice this size.
    """

    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """Compute a scalar score for every edge in the batch.

        Parameters
        ----------
        edges :
            Object with ``src`` and ``dst`` members, each a dictionary of node
            features; the feature ``"h"`` is read from both.

        Returns
        -------
        dict
            ``{"score": tensor}`` with one value per edge.
        """
        endpoint_feats = torch.cat([edges.src["h"], edges.dst["h"]], 1)
        hidden = F.relu(self.W1(endpoint_feats))
        edge_scores = self.W2(hidden).squeeze(1)
        return {"score": edge_scores}

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node representations ``h``."""
        # local_scope keeps the temporary node/edge features from leaking into `g`.
        with g.local_scope():
            g.ndata["h"] = h
            g.apply_edges(self.apply_edges)
            return g.edata["score"]

In [None]:
# Encoder with 16-dimensional hidden/output features, and a dot-product edge scorer.
model = GraphSAGE(train_g.ndata["feat"].shape[1], 16)
pred = DotPredictor()


def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) for positive vs. negative edge scores.

    Positive scores are labelled 1 and negative scores 0; the mean loss over all
    scores is returned.
    """
    num_pos = pos_score.shape[0]
    num_neg = neg_score.shape[0]
    logits = torch.cat([pos_score, neg_score])
    targets = torch.cat([torch.ones(num_pos), torch.zeros(num_neg)])
    return F.binary_cross_entropy_with_logits(logits, targets)


def compute_auc(pos_score, neg_score):
    """ROC AUC of positive (label 1) versus negative (label 0) edge scores.

    Both score tensors are converted with ``.numpy()``, so they must be tensors
    for which that conversion is allowed.
    """
    num_pos, num_neg = pos_score.shape[0], neg_score.shape[0]
    y_score = torch.cat([pos_score, neg_score]).numpy()
    y_true = torch.cat([torch.ones(num_pos), torch.zeros(num_neg)]).numpy()
    return roc_auc_score(y_true, y_score)

def compute_auc_pr(pos_score, neg_score):
    """Average precision of positive (label 1) versus negative (label 0) edge scores.

    Both score tensors are converted with ``.numpy()``, so they must be tensors
    for which that conversion is allowed.
    """
    num_pos, num_neg = pos_score.shape[0], neg_score.shape[0]
    y_score = torch.cat([pos_score, neg_score]).numpy()
    y_true = torch.cat([torch.ones(num_pos), torch.zeros(num_neg)]).numpy()
    return average_precision_score(y_true, y_score)

In [None]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss is computed inside the training loop; one Adam optimizer updates the
# parameters of both the encoder and the edge predictor.
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
# NOTE(review): `all_logits` is never appended to in this cell.
all_logits = []
for e in range(100):
    # forward: encode nodes on the training graph, then score positive and negative edges
    h = model(train_g, train_g.ndata["feat"])
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # report the loss every 5 epochs
    if e % 5 == 0:
        print("In epoch {}, loss: {}".format(e, loss))

# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score, roc_auc_score
from torchmetrics.retrieval import RetrievalMRR, RetrievalHitRate

clear_output()
with torch.no_grad():
    # `h` holds the node representations from the last training iteration.
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)

    print("AUC", compute_auc(pos_score, neg_score))
    print("AUC_PR", compute_auc_pr(pos_score, neg_score))

    # set pos_scores and neg_scores
    pos_scores = pos_score.squeeze().detach().cpu().tolist()
    neg_scores = neg_score.squeeze().detach().cpu().tolist()

    # set Labels
    targets_pos = torch.ones(len(pos_scores))
    targets_neg = torch.zeros(len(neg_scores))
    pos_labels = targets_pos.detach().cpu().tolist()
    neg_labels = targets_neg.detach().cpu().tolist()

    # convert to tensors for further processing
    pos_tensor = torch.tensor(pos_scores, dtype=torch.float32).view(-1, 1)
    neg_tensor = torch.tensor(neg_scores, dtype=torch.float32).view(-1, 1)

    # Pair the i-th positive edge with the i-th negative edge (both sets have
    # `test_size` entries) and normalise each pair with a softmax.
    scores = torch.cat([pos_tensor, neg_tensor], dim=1)  # shape: [batch_size, 2]
    scores = torch.softmax(scores, dim=1)
    scores = scores.detach().cpu().numpy()

    # Rank (1-based) of the positive edge, i.e. column 0, within its pair.
    # With a single negative per positive the rank is 1 or 2, so hit@3 and hit@10
    # are always 1.0 in this setup.
    rank = np.argwhere(np.argsort(scores, axis=1)[:, ::-1] == 0)[:, 1] + 1
    # `ranks` used to be extended with `+=` without ever being created, which raised
    # NameError. Assigning also keeps the cell idempotent when it is re-run.
    ranks = rank.tolist()

    hit1 = [1 if item <= 1 else 0 for item in rank]
    hit3 = [1 if item <= 3 else 0 for item in rank]
    hit10 = [1 if item <= 10 else 0 for item in rank]

    mrr = np.mean(1.0 / np.array(ranks)).item()
    hit1 = np.mean(hit1)
    hit3 = np.mean(hit3)
    hit10 = np.mean(hit10)
    print(f'mrr: {mrr}')
    print(f'hit1: {hit1}')
    print(f'hit3:{hit3}')
    print(f'hit10:{hit10}')

### 3- Inductive Link Prediction(PersianILP)

In [None]:
import os
from google.colab import drive
# Mount Google Drive so the dataset archive stored there becomes readable.
drive.mount('/content/drive')

ILP_Date_Zip_File = '/content/drive/MyDrive/DataSet/ILPDataSet.zip'
# Extract the archive into /content (`!` runs a shell command; {...} is IPython's
# Python-value interpolation).
!unzip -q {ILP_Date_Zip_File} -d {'/content'}

# Sorted names of the folders directly under /content.
# NOTE(review): `datasets` is not used again in this cell.
datasets = sorted([folder for folder in os.listdir('/content') if os.path.isdir(os.path.join('/content', folder))])
def create_dataset_dict(base_dir: str = '/content'):
    """Map every dataset folder under ``base_dir`` to its existing split files.

    Each sub-directory of ``base_dir`` becomes one key. Its value is a dict with the
    keys ``train``, ``valid`` and ``test``. For each split the path of
    ``<split>.txt`` is used if that file exists, otherwise ``<split>.csv`` if that
    exists, otherwise ``None``. Plain files in ``base_dir`` are ignored.
    """

    def _find_split_file(dataset_path, split):
        """Return the first existing candidate file for ``split`` (.txt preferred)."""
        candidates = (
            os.path.join(dataset_path, f"{split}.txt"),
            os.path.join(dataset_path, f"{split}.csv"),
        )
        return next((path for path in candidates if os.path.exists(path)), None)

    found = {}
    for entry in os.listdir(base_dir):
        entry_path = os.path.join(base_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        found[entry] = {
            split: _find_split_file(entry_path, split)
            for split in ("train", "valid", "test")
        }
    return found


# Save the path dictionary for the ILP datasets, ordered by dataset name
ILP_dataset_paths = create_dataset_dict('/content/ILPDataSet')
ILP_dataset_paths = dict(sorted(ILP_dataset_paths.items()))

**Link Prediction By DGL**

In [None]:
import dgl
import torch
import pandas as pd
from dgl.data import DGLDataset

class PersianDGLDataset(DGLDataset):
    """Train/test pair of knowledge graphs built from triple files.

    Both files are parsed into ``(head, relation, tail)`` id triples with one shared
    entity vocabulary and one shared relation vocabulary (ids are assigned in order
    of first appearance, training file first). Each split becomes a DGL graph over
    the full entity set, retrievable as ``dataset["train"]`` / ``dataset["test"]``.

    Parameters
    ----------
    train_file, test_file : str
        ``.csv`` files (with ``subjectLabel``/``predicateLabel``/``objectLabel``
        header) or header-less tab-separated ``.txt`` files.
    seed : int, default 42
        Seed of the random generator used for the initial node features, so the
        features are reproducible across runs.
    """

    def __init__(self, train_file, test_file, seed=42):
        self.train_file = train_file
        self.test_file = test_file
        self.seed = seed
        # The DGLDataset constructor runs the loading pipeline, which calls
        # process() when there is no cached copy. Calling process() here as well
        # would parse both files and build both graphs twice.
        super().__init__(name="PersianLinkPrediction")
        # Safety net in case the base class did not trigger processing.
        if not hasattr(self, "graphs"):
            self.process()

    def process(self):
        """Parse both files, build the id mappings and the two graphs."""
        # Initialize mappings
        self.entity2id = {}
        self.relation2id = {}
        # One generator for both graphs: reproducible, yet train and test graphs
        # still receive different random draws.
        self._feature_rng = torch.Generator().manual_seed(self.seed)

        # Process training data
        train_triples = self._load_and_process_file(self.train_file, 0, 0)

        # Process test data (continuing the same mappings)
        test_triples = self._load_and_process_file(
            self.test_file, len(self.entity2id), len(self.relation2id))

        # Build graphs only after both files are parsed, so that both graphs are
        # created over the complete entity set.
        self.graphs = {
            "train": self._build_graph(train_triples),
            "test": self._build_graph(test_triples)
        }

    def _load_file(self, file_path):
        """Load file based on its extension"""
        if file_path.endswith('.csv'):
            return pd.read_csv(file_path)
        elif file_path.endswith('.txt'):
            return pd.read_csv(file_path, sep='\t', header=None,
                             names=['subjectLabel', 'predicateLabel', 'objectLabel'])
        else:
            raise ValueError("Unsupported file format. Only .csv and .txt files are supported.")

    def _load_and_process_file(self, file_path, ent_id_start, rel_id_start):
        """Load one file and return its triples as ids, extending the mappings.

        ``ent_id_start`` / ``rel_id_start`` are the first ids handed out to entities
        and relations not seen before.
        """
        triples = []
        df = self._load_file(file_path)

        # Iterating the three columns in parallel avoids the per-row Series
        # construction of DataFrame.iterrows.
        rows = zip(df['subjectLabel'], df['predicateLabel'], df['objectLabel'])
        for h, r, t in rows:
            # Update entity mappings
            for ent in (h, t):
                if ent not in self.entity2id:
                    self.entity2id[ent] = ent_id_start
                    ent_id_start += 1

            # Update relation mappings
            if r not in self.relation2id:
                self.relation2id[r] = rel_id_start
                rel_id_start += 1

            triples.append((
                self.entity2id[h],
                self.relation2id[r],
                self.entity2id[t]))

        return triples

    def _build_graph(self, triples):
        """Build a DGL graph from id triples.

        Edge data: ``e_type`` (relation id) and ``edge_mask`` (all True).
        Node data: ``ntype`` (all zeros) and ``feat`` (random normal, 64 columns).
        """
        # Explicit integer dtype keeps this working for an empty triple list.
        src = torch.tensor([h for h, _, _ in triples], dtype=torch.int64)
        rel = torch.tensor([r for _, r, _ in triples], dtype=torch.int64)
        dst = torch.tensor([t for _, _, t in triples], dtype=torch.int64)

        g = dgl.graph((src, dst), num_nodes=len(self.entity2id))
        g.edata["e_type"] = rel
        g.edata["edge_mask"] = torch.ones(g.num_edges(), dtype=torch.bool)
        g.ndata["ntype"] = torch.zeros(g.num_nodes(), dtype=torch.int)
        g.ndata["feat"] = torch.randn(g.num_nodes(), 64, generator=self._feature_rng)
        return g

    def __getitem__(self, split):
        """Return the graph for ``split`` (``"train"`` or ``"test"``)."""
        return self.graphs[split]

    def __len__(self):
        """Number of stored graphs."""
        return len(self.graphs)

class GraphBatchDataset(torch.utils.data.Dataset):
    """Index-aligned bundle of graphs with their positive and negative graphs.

    Item ``i`` is a dict with the keys ``graph``, ``pos_graph`` and ``neg_graph``
    holding the ``i``-th element of the three sequences. The length is that of
    ``graphs``.
    """

    def __init__(self, graphs, pos_graphs, neg_graphs):
        self.graphs = graphs
        self.pos_graphs = pos_graphs
        self.neg_graphs = neg_graphs

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        sample = {}
        sample["graph"] = self.graphs[idx]
        sample["pos_graph"] = self.pos_graphs[idx]
        sample["neg_graph"] = self.neg_graphs[idx]
        return sample


# Build the train and test graphs for the PersianILP_V1 variant.
dataset = PersianDGLDataset(train_file = ILP_dataset_paths['PersianILP_V1']['train'],
                            test_file = ILP_dataset_paths['PersianILP_V1']['test'])
train_g = dataset["train"]
test_g = dataset["test"]

**Generate Positive Graph And Negative Graph**

In [None]:
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import dgl
import scipy.sparse as sp
from tabulate import tabulate
import torch

class GraphNegativeSampler:
    """Build positive and sampled-negative edge graphs for a train and a test graph.

    For each input graph the constructor creates

    * a *positive* graph with the same edges, the same ``edata["e_type"]`` and
      the ``feat`` / ``ntype`` node data of the input graph, and
    * a *negative* graph with ``int(num_edges * ratio)`` node pairs that are
      not connected in the input graph, random edge types, the input graph's
      ``feat`` and ``ntype`` set to one.

    Args:
        train_graph: Graph used for training; must expose ``edata["e_type"]``,
            ``ndata["feat"]`` and ``ndata["ntype"]``.
        test_graph: Graph used for evaluation, same requirements.
        train_neg_ratio: Negative edges drawn per training edge.
        test_neg_ratio: Negative edges drawn per test edge.

    Sampling uses the global NumPy and torch random states, so results are
    only reproducible if both are seeded before construction.
    """

    def __init__(self, train_graph, test_graph, train_neg_ratio=1.0, test_neg_ratio=1.0):
        self.train_graph = train_graph
        self.test_graph = test_graph
        self.train_neg_ratio = train_neg_ratio
        self.test_neg_ratio = test_neg_ratio
        self.train_pos_g, self.train_neg_g = self._prepare_graphs(train_graph, train_neg_ratio)
        self.test_pos_g, self.test_neg_g = self._prepare_graphs(test_graph, test_neg_ratio)

    def _generate_negative_samples(self, graph):
        """Return ``(src, dst)`` index arrays of every ordered pair that is not an edge.

        Pairs ``(i, i)`` are never returned. The result is in row-major order.

        A boolean occupancy mask is used instead of the arithmetic
        ``1 - adj - eye != 0``: with that expression a repeated edge
        (adjacency count 2) or a self-loop (1 - 1 - 1 = -1) evaluated to a
        non-zero value and was therefore reported as a *negative* pair.
        """
        num_nodes = graph.num_nodes()
        u, v = graph.edges()
        # Dense num_nodes x num_nodes mask: memory grows quadratically with the node count.
        occupied = np.eye(num_nodes, dtype=bool)
        occupied[u.numpy(), v.numpy()] = True
        return np.nonzero(~occupied)

    def _prepare_graphs(self, graph, ratio):
        """Return ``(positive_graph, negative_graph)`` for ``graph``."""
        return (self._create_positive_graph(graph),
                self._create_negative_graph(graph, ratio))

    def _create_positive_graph(self, graph):
        """Copy the edges, edge types and ``feat``/``ntype`` node data into a new graph."""
        g = dgl.graph(graph.edges(), num_nodes=graph.num_nodes())
        g.edata["e_type"] = graph.edata["e_type"]
        g.ndata.update({k: graph.ndata[k] for k in ["feat", "ntype"]})
        return g

    def _create_negative_graph(self, graph, ratio):
        """Sample ``int(num_edges * ratio)`` non-edges and wrap them in a graph.

        Sampling is without replacement unless more samples are requested
        than candidate pairs exist. Edge types are drawn uniformly from
        ``0 .. max(e_type)`` of ``graph``.
        """
        neg_u, neg_v = self._generate_negative_samples(graph)
        num_samples = int(graph.num_edges() * ratio)
        # Fall back to sampling with replacement only when there are too few candidates.
        replace = len(neg_u) < num_samples
        sample_ids = np.random.choice(len(neg_u), num_samples, replace=replace)

        g = dgl.graph((neg_u[sample_ids], neg_v[sample_ids]), num_nodes=graph.num_nodes())
        g.edata["e_type"] = torch.randint(0, graph.edata["e_type"].max().item()+1, (g.num_edges(),))
        g.ndata.update({
            "feat": graph.ndata["feat"],
            "ntype": torch.ones(graph.num_nodes(), dtype=torch.int)})
        return g

    @property
    def training_graphs(self):
        """``(positive_graph, negative_graph)`` built from the training graph."""
        return self.train_pos_g, self.train_neg_g

    @property
    def test_graphs(self):
        """``(positive_graph, negative_graph)`` built from the test graph."""
        return self.test_pos_g, self.test_neg_g

# Sample negative edges from the knowledge graph (one negative per positive edge).
sampler = GraphNegativeSampler(
    train_graph=dataset['train'],
    test_graph=dataset['test'],
    train_neg_ratio=1,
    test_neg_ratio=1,
)

(train_pos, train_neg), (test_pos, test_neg) = sampler.training_graphs, sampler.test_graphs

In [None]:
from dgl.nn import SAGEConv
import torch.nn as nn

class ImprovedGraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder with mean aggregation.

    Layer 1 maps ``in_feats -> h_feats`` and is followed by ReLU and dropout;
    layer 2 maps ``h_feats -> out_feats`` and its output is returned unchanged.
    """

    def __init__(self, in_feats, h_feats, out_feats, dropout=0.5):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, "mean")
        self.conv2 = SAGEConv(h_feats, out_feats, "mean")
        self.dropout = nn.Dropout(dropout)

    def forward(self, g, in_feat):
        """Encode the nodes of ``g`` from the input features ``in_feat``."""
        hidden = nn.functional.relu(self.conv1(g, in_feat))
        hidden = self.dropout(hidden)
        return self.conv2(g, hidden)


import dgl.function as fn
class DotPredictor(nn.Module):
    """Scores every edge of a graph by the dot product of its endpoint embeddings."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node embeddings ``h``.

        ``h`` is attached as temporary node data inside a local scope, so the
        node/edge data written here does not persist on ``g``.
        """
        with g.local_scope():
            g.ndata["h"] = h
            g.apply_edges(fn.u_dot_v("h", "h", "score"))
            edge_scores = g.edata["score"]
            # Keep only column 0 of the per-edge score tensor.
            return edge_scores[:, 0]

In [None]:
from torch.utils.data import DataLoader
import itertools
from tqdm import tqdm
import dgl

def train_model(model,
                pred,
                dataloader,
                epochs,
                lr=0.01):
    """Jointly optimise an encoder and an edge scorer with Adam.

    Args:
        model: Encoder called as ``model(graph, graph.ndata["feat"])``.
        pred: Edge scorer called as ``pred(edge_graph, h)``.
        dataloader: Iterable of dicts with the keys ``"graph"``,
            ``"pos_graph"`` and ``"neg_graph"``.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        tuple: ``(h, all_losses)``. ``h`` is the embedding tensor produced by
        the last forward pass, or ``None`` when no batch was processed.
        ``all_losses`` holds, per epoch, the sum of the batch losses.

    The module modes are left as the caller set them; ``h`` therefore comes
    from whatever mode ``model`` was in during the last step.
    """
    optimizer = torch.optim.Adam(itertools.chain(model.parameters(),
                                                 pred.parameters()),
                                 lr=lr)

    # Stays None when epochs == 0 or the dataloader yields nothing, instead of
    # leaving the name unbound at the return statement.
    h = None
    all_losses = []
    for epoch in tqdm(range(epochs)):
        epoch_loss = 0.0

        for batch in dataloader:
            batch_graph = batch["graph"]    # full graph used for message passing
            pos_graph = batch["pos_graph"]  # graph of positive edges
            neg_graph = batch["neg_graph"]  # graph of negative edges

            # Forward pass
            h = model(batch_graph, batch_graph.ndata["feat"])
            pos_score = pred(pos_graph, h)
            neg_score = pred(neg_graph, h)
            loss = compute_loss(pos_score, neg_score)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        all_losses.append(epoch_loss)

    # Report the final epoch only if at least one epoch actually ran.
    if all_losses:
        print(f"\nEpoch: {epochs - 1}, Loss: {all_losses[-1]:.4f}")
    return h, all_losses

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) of positive vs. negative edge scores.

    Positive scores are labelled 1 and negative scores 0. The labels are
    created with ``ones_like`` / ``zeros_like`` so they share the device and
    dtype of the scores instead of always being CPU float32 tensors.

    Returns:
        Scalar tensor with the mean loss over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

In [None]:
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import average_precision_score
from prettytable import PrettyTable

class GraphEvaluator:
    """Evaluate an edge scorer on positive/negative test graphs and tabulate the result.

    Constructing the object runs the evaluation immediately: the metrics are
    stored in ``metrics[dataset_name]`` (the dict passed in is mutated) and a
    table with every entry of ``metrics`` is printed.

    Args:
        metrics: Dict mapping dataset name -> metric dict; updated in place.
        pred: Edge scorer called as ``pred(graph, h)``.
        test_pos_g: Graph holding the positive test edges.
        test_neg_g: Graph holding the negative test edges.
        h: Node embeddings handed to ``pred``.
        dataset_name: Key under which the new metrics are stored.
        k: Cut-off for the Hit@k metric.
    """

    def __init__(self, metrics, pred, test_pos_g, test_neg_g, h, dataset_name, k=10):
        self.metrics = metrics
        self.dataset_name = dataset_name
        self.k = k
        self.__evaluate(pred, test_pos_g, test_neg_g, h)

    def compute_metrics(self, pos_score, neg_score):
        """Compute AUC-PR, MRR, Hit@k and BCE loss from positive/negative scores.

        Every positive score is ranked against *all* negative scores
        (``rank = 1 + number of negatives scoring at least as high``), so a
        tie counts against the positive. Ranking each positive against a
        single paired negative, as a plain ``zip`` would do, limits the rank
        to 1 or 2 and makes Hit@k trivially 1 for any ``k >= 2``.

        Returns:
            dict with the keys ``"AUC-PR"``, ``"MRR"``, ``f"Hit@{k}"`` and ``"Loss"``.
        """
        pos_score = pos_score.detach().reshape(-1)
        neg_score = neg_score.detach().reshape(-1)

        pos_array = pos_score.cpu().numpy()
        neg_array = neg_score.cpu().numpy()
        labels_pr = np.concatenate([np.ones_like(pos_array), np.zeros_like(neg_array)])
        scores_pr = np.concatenate([pos_array, neg_array])
        auc_pr = average_precision_score(labels_pr, scores_pr)

        # (num_pos, num_neg) comparison; row i counts the negatives that beat or tie positive i.
        beaten_by = (neg_score.unsqueeze(0) >= pos_score.unsqueeze(1)).sum(dim=1)
        ranks = (beaten_by + 1).cpu().numpy()

        mrr = float(np.mean(1.0 / ranks))
        hit_at_k = float(np.mean(ranks <= self.k))

        # Labels follow the scores' device/dtype so the loss also works off-CPU.
        labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
        loss = F.binary_cross_entropy_with_logits(torch.cat([pos_score, neg_score]), labels)

        return {"AUC-PR": auc_pr, "MRR": mrr, f"Hit@{self.k}": hit_at_k, "Loss": loss.item()}

    def display_metrics(self):
        """Print the ``metrics`` dict as a table, one row per dataset.

        The column names are taken from the first entry of ``self.metrics``.
        """
        table = PrettyTable()
        table.field_names = ["Dataset"] + list(next(iter(self.metrics.values())).keys())

        for dataset, metrics in self.metrics.items():
            row = [dataset] + list(metrics.values())
            table.add_row(row)

        print(table)

    def __evaluate(self, pred, test_pos_g, test_neg_g, h):
        """Score the test graphs, store the metrics and print the table (private)."""
        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            new_metrics = self.compute_metrics(pos_score, neg_score)
            self.metrics[self.dataset_name] = new_metrics

        # Show every stored result as a table.
        self.display_metrics()

**Link Prediction With GraphSAGE + Dot Predictor**

In [None]:
# ======== Imports =======
import torch
import torch.nn.functional as F
import numpy as np
import itertools
from tqdm import tqdm
from dgl.dataloading import GraphDataLoader
from IPython.display import clear_output
from sklearn.metrics import average_precision_score, roc_auc_score
from tabulate import tabulate
from torchmetrics.retrieval import RetrievalMRR, RetrievalHitRate
from sklearn import metrics

# ==== Hyperparameters ====
# These are notebook globals; later cells that do not redefine them will see
# whatever values were assigned last.
h_feats = 16          # hidden width of the first GraphSAGE layer
out_feats = 10        # embedding width produced by the second layer
dropout = 0.5
epochs = 2000
lr = 0.01
k = 10                # not read anywhere else in this cell (Hit@1/3/10 are hard-coded below)
train_neg_ratio = 2   # negatives sampled per positive training edge
test_neg_ratio = 1    # negatives sampled per positive test edge
result = {'DataSet': 'Persian_LP'}  # replaced by a fresh dict after evaluation further down

# ==== Step 1: Dataset Preparation ====
graphs = PersianDGLDataset(train_file=ILP_dataset_paths['PersianILP_V1']['train'],
                           test_file =ILP_dataset_paths['PersianILP_V1']['test'])

# Build the positive/negative edge graphs for both splits.
sampler = GraphNegativeSampler(
    graphs['train'], graphs['test'],
    train_neg_ratio=train_neg_ratio,
    test_neg_ratio=test_neg_ratio
)
train_pos_g, train_neg_g = sampler.training_graphs
test_pos_g,  test_neg_g  = sampler.test_graphs

# Each dataset holds exactly one sample (the whole graph); the collate_fn
# unwraps the one-element batch so the loader yields that sample dict directly.
train_dataset = GraphBatchDataset([graphs['train']], [train_pos_g], [train_neg_g])
train_loader  = GraphDataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x[0])

test_dataset = GraphBatchDataset([graphs['test']], [test_pos_g], [test_neg_g])
test_loader  = GraphDataLoader(test_dataset, batch_size=1, collate_fn=lambda x: x[0])

# ==== Step 2: Training ====
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) of positive vs. negative edge scores.

    Positive scores are labelled 1 and negative scores 0. The labels are
    created with ``ones_like`` / ``zeros_like`` so they share the device and
    dtype of the scores instead of always being CPU float32 tensors.

    Returns:
        Scalar tensor with the mean loss over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([
        torch.ones_like(pos_score),
        torch.zeros_like(neg_score)
    ])
    return F.binary_cross_entropy_with_logits(scores, labels)

# Input width of the encoder = width of the node feature matrix of the train graph.
feats = graphs["train"].ndata["feat"].shape[1]
model = ImprovedGraphSAGE(
    in_feats=feats,
    h_feats=h_feats,
    out_feats=out_feats,
    dropout=dropout
)
pred = DotPredictor()
# One optimizer over the parameters of both the encoder and the predictor.
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()),
    lr=lr
)

# all_losses[i] is the sum of the batch losses of epoch i.
all_losses = []
for epoch in tqdm(range(epochs)):
    epoch_loss = 0.0
    for batch in train_loader:
        batch_graph = batch["graph"]
        pos_graph   = batch["pos_graph"]
        neg_graph   = batch["neg_graph"]

        # Forward: embeddings come from the full graph, scores from the
        # positive and negative edge graphs using those embeddings.
        h          = model(batch_graph, batch_graph.ndata["feat"])
        pos_score  = pred(pos_graph, h)
        neg_score  = pred(neg_graph, h)
        loss       = compute_loss(pos_score, neg_score)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    all_losses.append(epoch_loss)

# ==== Step3: Testing =====
pos_scores = []
pos_labels = []
neg_scores = []
neg_labels = []

hit1_list = []
hit3_list = []
hit10_list = []

# Switch dropout off for evaluation; without this the scores are computed with
# randomly dropped activations and change from run to run.
model.eval()
with torch.no_grad():
    ranks = []
    for b_idx, batch in enumerate(test_loader):

        batch_graph = batch['graph']
        pos_graph   = batch['pos_graph']
        neg_graph   = batch['neg_graph']

        # Score the positive and negative edges; flatten to 1-D so a graph
        # with a single edge still yields a list below.
        h = model(batch_graph, batch_graph.ndata["feat"])
        score_pos = pred(pos_graph, h).reshape(-1)
        score_neg = pred(neg_graph, h).reshape(-1)

        # Collect raw scores and their labels for AUC / AUC-PR.
        pos_scores += score_pos.cpu().tolist()
        neg_scores += score_neg.cpu().tolist()
        pos_labels += [1.0] * score_pos.shape[0]
        neg_labels += [0.0] * score_neg.shape[0]

        # Rank every positive against ALL negatives of the batch:
        # rank = 1 + number of negatives scoring at least as high (ties count
        # against the positive). Pairing positive i with negative i only would
        # cap the rank at 2 and make Hit@3 / Hit@10 always equal to 1.
        beaten_by = (score_neg.unsqueeze(0) >= score_pos.unsqueeze(1)).sum(dim=1)
        rank = (beaten_by + 1).cpu().numpy()
        ranks += rank.tolist()

        # Hit@K flags per positive edge.
        hit1_list += (rank <= 1).astype(int).tolist()
        hit3_list += (rank <= 3).astype(int).tolist()
        hit10_list += (rank <= 10).astype(int).tolist()

result = {}
result["Dataset"] = "Persian_LP"
result["AUC"] = metrics.roc_auc_score(pos_labels + neg_labels, pos_scores + neg_scores)
result["AUC_PR"] = metrics.average_precision_score(pos_labels + neg_labels, pos_scores + neg_scores)
result["MRR"] = np.mean(1.0 / np.array(ranks)).item()
result["Hit1"] = np.mean(hit1_list)
result["Hit3"] = np.mean(hit3_list)
result["Hit10"] = np.mean(hit10_list)
print(result)

headers = ['Dataset', 'AUC' ,'AUC_PR', 'MRR', 'Hit1', 'Hit3', 'Hit10']
values  = [[result[key] for key in headers]]
print("\n" + tabulate(
    values,
    headers=headers,
    tablefmt="fancy_grid",
    floatfmt=".4f",
    stralign="center",
    numalign="left"))

In [None]:
# ======== Imports =======
import torch
import torch.nn.functional as F
import numpy as np
import itertools
from tqdm import tqdm
from dgl.dataloading import GraphDataLoader
from IPython.display import clear_output
from sklearn.metrics import average_precision_score, roc_auc_score
from tabulate import tabulate
from torchmetrics.retrieval import RetrievalMRR, RetrievalHitRate

# ======== Hyperparameters =======
# Notebook globals: this cell overwrites the values set by earlier experiment cells.
h_feats = 32          # hidden width of the first GraphSAGE layer
out_feats = 16        # embedding width produced by the second layer
dropout = 0.5
epochs = 2000
lr = 0.01
k = 10                # defined here, but the hit-rate metric below is built with a literal top_k=10
train_neg_ratio = 1   # negatives sampled per positive training edge
test_neg_ratio = 1    # negatives sampled per positive test edge

# ======== Dataset Preparation =======
graphs = PersianDGLDataset(
    train_file=ILP_dataset_paths['WN18RR_v1_ind']['train'],
    test_file=ILP_dataset_paths['WN18RR_v1_ind']['test'])

# Build the positive/negative edge graphs for both splits.
sampler = GraphNegativeSampler(
    graphs['train'], graphs['test'],
    train_neg_ratio=train_neg_ratio,
    test_neg_ratio=test_neg_ratio)

train_pos_g, train_neg_g = sampler.training_graphs
test_pos_g, test_neg_g = sampler.test_graphs


# Each dataset holds exactly one sample (the whole graph); the collate_fn
# unwraps the one-element batch so the loader yields that sample dict directly.
train_dataset = GraphBatchDataset([graphs['train']], [train_pos_g], [train_neg_g])
train_loader = GraphDataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x[0])

test_dataset = GraphBatchDataset([graphs['test']], [test_pos_g], [test_neg_g])
test_loader = GraphDataLoader(test_dataset, batch_size=1, collate_fn=lambda x: x[0])

# ======== Model Initialization =======
# Input width of the encoder = width of the node feature matrix of the train graph.
feats = graphs["train"].ndata["feat"].shape[1]
model = ImprovedGraphSAGE(
    in_feats=feats,
    h_feats=h_feats,
    out_feats=out_feats,
    dropout=dropout)

pred = DotPredictor()
# One optimizer over the parameters of both the encoder and the predictor.
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()),
    lr=lr
)

# ======== Training Function =======
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) of positive vs. negative edge scores.

    Positive scores are labelled 1 and negative scores 0. The labels are
    created with ``ones_like`` / ``zeros_like`` so they share the device and
    dtype of the scores instead of always being CPU float32 tensors.

    Returns:
        Scalar tensor with the mean loss over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([
        torch.ones_like(pos_score),
        torch.zeros_like(neg_score)
    ])
    return F.binary_cross_entropy_with_logits(scores, labels)

# ======== Training Loop =======
# all_losses[i] is the mean batch loss of epoch i.
all_losses = []
for epoch in tqdm(range(epochs), desc="Training"):
    model.train()  # make sure dropout is active while training
    epoch_loss = 0.0

    for batch in train_loader:
        batch_graph = batch["graph"]
        pos_graph = batch["pos_graph"]
        neg_graph = batch["neg_graph"]

        # Forward pass: embeddings come from the full graph, scores from the
        # positive and negative edge graphs using those embeddings.
        h = model(batch_graph, batch_graph.ndata["feat"])
        pos_score = pred(pos_graph, h)
        neg_score = pred(neg_graph, h)
        loss = compute_loss(pos_score, neg_score)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    all_losses.append(epoch_loss / len(train_loader))

# ======== Evaluation =======
@torch.no_grad()
def prediction_model(model, test_loader):
    """Score the test batches and lay the scores out as retrieval queries.

    Every positive edge becomes one query consisting of its own score
    followed by the scores of all negative edges of the same batch. The edge
    scorer is the notebook-level ``pred`` object, not a parameter.

    Returns:
        tuple: ``(logits, labels, indexes)`` as flat tensors of equal length.
        ``labels`` is True exactly at the positive of each query and
        ``indexes`` holds the running query id of each entry.
    """
    model.eval()
    logit_chunks, label_chunks, index_chunks = [], [], []

    next_query = 0
    for batch in tqdm(test_loader, desc="Evaluating"):
        graph = batch['graph']
        h = model(graph, graph.ndata["feat"])
        pos_scores = pred(batch['pos_graph'], h)
        neg_scores = pred(batch['neg_graph'], h)

        # Every query of this batch has the same layout: 1 positive + all negatives.
        query_size = neg_scores.shape[0] + 1
        query_labels = torch.zeros(query_size, dtype=torch.bool)
        query_labels[0] = True

        for positive in pos_scores:
            logit_chunks.append(torch.cat([positive.unsqueeze(0), neg_scores]))
            label_chunks.append(query_labels)
            index_chunks.append(torch.full((query_size,), next_query, dtype=torch.long))
            next_query += 1

    return (torch.cat(logit_chunks, dim=0),
            torch.cat(label_chunks, dim=0),
            torch.cat(index_chunks, dim=0))

# Get predictions: one query per positive edge, each holding that positive
# followed by every negative score of the batch.
logits, labels, indexes = prediction_model(model, test_loader)

# Compute metrics
# NOTE(review): `logits` repeats every negative score once per positive query,
# so AUC_PR (and LOSS below) are computed over that expanded set rather than
# over each test edge once - confirm this is the intended definition.
auc = roc_auc_score(labels.numpy(), logits.numpy())
auc_pr = average_precision_score(labels.numpy(), logits.numpy())

# Initialize metrics (top_k is the literal 10 here, not the `k` hyperparameter)
mrr_metric = RetrievalMRR()
hit_rate_metric = RetrievalHitRate(top_k=10)

# Update metrics with all predictions; `indexes` groups the entries by query
mrr_metric.update(logits, labels, indexes)
hit_rate_metric.update(logits, labels, indexes)

# Compute final metrics
mrr = mrr_metric.compute().item()
hit_at_k = hit_rate_metric.compute().item()

# Compute loss
loss = F.binary_cross_entropy_with_logits(logits, labels.float()).item()

# Store results
dataset_name = 'WN18RR_v1_ind'
result_dict = {
    dataset_name: {
        "AUC": auc,
        "AUC_PR": auc_pr,
        "MRR": mrr,
        "HIT_at_K": hit_at_k,
        "LOSS": loss
    }
}

# ======== Display Results =======
headers = ['DataSet', 'AUC', 'AUC_PR', 'MRR', 'HIT_at_K', 'LOSS']
table_data = [[
    dataset_name,
    result_dict[dataset_name]['AUC'],
    result_dict[dataset_name]['AUC_PR'],
    result_dict[dataset_name]['MRR'],
    result_dict[dataset_name]['HIT_at_K'],
    result_dict[dataset_name]['LOSS']
]]

# The printed heading is Persian for "Final experiment results:".
print("\nنتایج نهایی آزمایش:")
print(tabulate(
    table_data,
    headers=headers,
    tablefmt="fancy_grid",
    floatfmt=".4f",
    stralign="center",
    numalign="left"))

# ======== Save Model =======
# Only the encoder weights are written, to the current working directory.
torch.save(model.state_dict(), f'model_{dataset_name}.pth')

In [None]:
# ======== Imports =======
import torch
import torch.nn.functional as F
import numpy as np
import itertools
from tqdm import tqdm
from dgl.dataloading import GraphDataLoader
from IPython.display import clear_output
from sklearn.metrics import average_precision_score, roc_auc_score
from tabulate import tabulate
from torchmetrics.retrieval import RetrievalMRR, RetrievalHitRate

# ======== Hyperparameters =======
# Notebook globals: this cell overwrites the values set by earlier experiment cells.
h_feats = 64          # hidden width of the first GraphSAGE layer
out_feats = 16        # embedding width produced by the second layer
dropout = 0.5
epochs = 2000
lr = 0.001
k = 10                # cut-off passed to calculate_hit_at_k further down
train_neg_ratio = 1   # negatives sampled per positive training edge
test_neg_ratio = 50   # negatives sampled per positive test edge

# ======== Dataset Preparation =======
graphs = PersianDGLDataset(
    train_file=ILP_dataset_paths['WN18RR_v1_ind']['train'],
    test_file=ILP_dataset_paths['WN18RR_v1_ind']['test']
)

# Build the positive/negative edge graphs for both splits.
sampler = GraphNegativeSampler(
    graphs['train'], graphs['test'],
    train_neg_ratio=train_neg_ratio,
    test_neg_ratio=test_neg_ratio)

train_pos_g, train_neg_g = sampler.training_graphs
test_pos_g, test_neg_g = sampler.test_graphs


# Each dataset holds exactly one sample (the whole graph); the collate_fn
# unwraps the one-element batch so the loader yields that sample dict directly.
train_dataset = GraphBatchDataset([graphs['train']], [train_pos_g], [train_neg_g])
train_loader = GraphDataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x[0])

test_dataset = GraphBatchDataset([graphs['test']], [test_pos_g], [test_neg_g])
test_loader = GraphDataLoader(test_dataset, batch_size=1, collate_fn=lambda x: x[0])

# ======== Model Initialization =======
# Input width of the encoder = width of the node feature matrix of the train graph.
feats = graphs["train"].ndata["feat"].shape[1]
model = ImprovedGraphSAGE(
    in_feats=feats,
    h_feats=h_feats,
    out_feats=out_feats,
    dropout=dropout
)
pred = DotPredictor()
# One optimizer over the parameters of both the encoder and the predictor.
optimizer = torch.optim.Adam(
    itertools.chain(model.parameters(), pred.parameters()),
    lr=lr
)

# ======== Training Function =======
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) of positive vs. negative edge scores.

    Positive scores are labelled 1 and negative scores 0. The labels are
    created with ``ones_like`` / ``zeros_like`` so they share the device and
    dtype of the scores instead of always being CPU float32 tensors.

    Returns:
        Scalar tensor with the mean loss over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([
        torch.ones_like(pos_score),
        torch.zeros_like(neg_score)
    ])
    return F.binary_cross_entropy_with_logits(scores, labels)

# ======== Training Loop =======
# all_losses[i] is the mean batch loss of epoch i.
all_losses = []
for epoch in tqdm(range(epochs), desc="Training"):
    model.train()  # make sure dropout is active while training
    epoch_loss = 0.0

    for batch in train_loader:
        batch_graph = batch["graph"]
        pos_graph = batch["pos_graph"]
        neg_graph = batch["neg_graph"]

        # Forward pass: embeddings come from the full graph, scores from the
        # positive and negative edge graphs using those embeddings.
        h = model(batch_graph, batch_graph.ndata["feat"])
        pos_score = pred(pos_graph, h)
        neg_score = pred(neg_graph, h)
        loss = compute_loss(pos_score, neg_score)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    all_losses.append(epoch_loss / len(train_loader))

# ======== Evaluation =======
@torch.no_grad()
def prediction_model(model, test_loader):
    """Score the positive and negative edges of every test batch.

    The edge scorer is the notebook-level ``pred`` object, not a parameter.

    Returns:
        tuple: ``(all_pos_scores, all_neg_scores)``, two lists with one CPU
        score tensor per batch.
    """
    model.eval()
    pos_per_batch, neg_per_batch = [], []

    for batch in tqdm(test_loader, desc="Evaluating"):
        graph = batch['graph']
        h = model(graph, graph.ndata["feat"])
        batch_pos = pred(batch['pos_graph'], h)
        batch_neg = pred(batch['neg_graph'], h)

        pos_per_batch.append(batch_pos.cpu())
        neg_per_batch.append(batch_neg.cpu())

    return pos_per_batch, neg_per_batch

def calculate_mrr(pos_scores, neg_scores):
    """Mean reciprocal rank of every positive score among its batch's negatives.

    Args:
        pos_scores: List of score tensors, one per batch, for positive edges.
        neg_scores: List of score tensors, one per batch, for negative edges.

    Returns:
        Mean of ``1 / rank`` over all positives, or ``0.0`` if there are none.

    The rank of a positive is ``1 + number of negatives scoring at least as
    high``, so ties count against the positive. Counting replaces one full
    sort per positive and makes tie handling independent of sort stability.
    """
    reciprocal_ranks = []

    for pos_batch, neg_batch in zip(pos_scores, neg_scores):
        pos_batch = pos_batch.reshape(-1)
        neg_batch = neg_batch.reshape(-1)
        if pos_batch.numel() == 0:
            continue

        # (num_pos, num_neg) comparison; row i counts the negatives that beat or tie positive i.
        beaten_by = (neg_batch.unsqueeze(0) >= pos_batch.unsqueeze(1)).sum(dim=1)
        ranks = (beaten_by + 1).double()
        reciprocal_ranks.extend((1.0 / ranks).tolist())

    return np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0

def calculate_hit_at_k(pos_scores, neg_scores, k=10):
    """Fraction of positives ranked within the top ``k`` among their batch's negatives.

    Args:
        pos_scores: List of score tensors, one per batch, for positive edges.
        neg_scores: List of score tensors, one per batch, for negative edges.
        k: Rank cut-off.

    Returns:
        ``hits / total`` as a float, or ``0.0`` if there are no positives.

    The rank of a positive is ``1 + number of negatives scoring at least as
    high``, so ties count against the positive. Counting replaces one full
    sort per positive and makes tie handling independent of sort stability.
    """
    hits = 0
    total = 0

    for pos_batch, neg_batch in zip(pos_scores, neg_scores):
        pos_batch = pos_batch.reshape(-1)
        neg_batch = neg_batch.reshape(-1)

        # (num_pos, num_neg) comparison; row i counts the negatives that beat or tie positive i.
        beaten_by = (neg_batch.unsqueeze(0) >= pos_batch.unsqueeze(1)).sum(dim=1)
        ranks = beaten_by + 1

        hits += int((ranks <= k).sum().item())
        total += pos_batch.numel()

    return hits / total if total else 0.0

# Compute the ranking metrics (each positive is ranked against the negatives of its batch)
all_pos_scores, all_neg_scores = prediction_model(model, test_loader)
mrr = calculate_mrr(all_pos_scores, all_neg_scores)
hit_at_k = calculate_hit_at_k(all_pos_scores, all_neg_scores, k=k)

# Compute the remaining metrics: labels are 1 for every positive score
# followed by 0 for every negative score, in the same order as all_scores.
all_labels = np.concatenate([
    np.ones(sum(len(x) for x in all_pos_scores)),
    np.zeros(sum(len(x) for x in all_neg_scores))
])
all_scores = np.concatenate([
    torch.cat(all_pos_scores).numpy(),
    torch.cat(all_neg_scores).numpy()
])

auc = roc_auc_score(all_labels, all_scores)
auc_pr = average_precision_score(all_labels, all_scores)
# List concatenation keeps the positives-then-negatives order used for all_labels.
loss = F.binary_cross_entropy_with_logits(torch.cat(all_pos_scores + all_neg_scores),
                                         torch.Tensor(all_labels)).item()



# Show the results
results = {
    'AUC': auc,
    'AUC_PR': auc_pr,
    'MRR': mrr,
    'HIT_at_K': hit_at_k,
    'LOSS': loss}
print('\n', results)

In [None]:
from torch import tensor
from torchmetrics.retrieval import RetrievalMRR
# Scratch check of RetrievalMRR on `logits`, `labels` and `indexes`.
# These three names are not defined in this cell: they must already exist in
# the kernel, so the cell fails on a fresh kernel unless the cell that
# produces them has been run first.

# Toy example of the expected input layout (kept for reference):
# indexes = tensor([0, 0, 0, 1, 1, 1, 1])
# preds = tensor([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2])
# target = tensor([False, False, True, False, True, False, True])
# Rebinds the global name `mrr` to a metric object.
mrr = RetrievalMRR()
print(f'logits: {logits.shape} , {logits}')
print(f'labels: {labels.shape} , {labels}')
print(f'indexes:{indexes.shape} , {indexes}')
# Last expression of the cell: its value is shown as the cell output.
mrr(logits, labels, indexes=indexes)


In [None]:
@torch.no_grad()
def compute_mrr(args, model, node_emb, seeds, labels, indexes):
    """Compute the Mean Reciprocal Rank (MRR) for given source and destination
    nodes.

    The node pairs are scored in batches to handle potentially large graphs.

    Args:
        args: Object providing ``eval_batch_size`` and ``device``.
        model: Object providing a ``predictor`` callable.
        node_emb: Node embeddings, indexed by node id.
        seeds: Node pairs; transposed into source and destination id vectors.
        labels: Targets passed to ``RetrievalMRR``.
        indexes: Query ids passed to ``RetrievalMRR``; also decides the device
            of the prediction buffer.

    Returns:
        The value returned by ``RetrievalMRR`` for all predictions.
    """
    # Everywhere in this notebook `tqdm` is bound to the progress-bar class
    # (`from tqdm import tqdm`), which has no `trange` attribute; import the
    # helper explicitly instead of going through that name.
    from tqdm import trange

    preds = torch.empty(seeds.shape[0], device=indexes.device)
    mrr = RetrievalMRR()
    seeds_src, seeds_dst = seeds.T
    # The constant number is 1001, due to negtive ratio in the `ogbl-citation2`
    # dataset is 1000.
    eval_size = args.eval_batch_size * 1001
    # Loop over node pairs in batches.
    for start in trange(0, seeds_src.shape[0], eval_size, desc="Evaluate"):
        end = min(start + eval_size, seeds_src.shape[0])

        # Fetch embeddings for current batch of source and destination nodes.
        h_src = node_emb[seeds_src[start:end]].to(args.device)
        h_dst = node_emb[seeds_dst[start:end]].to(args.device)

        # Compute prediction scores using the model.
        pred = model.predictor(h_src * h_dst).squeeze()
        preds[start:end] = pred
    return mrr(preds, labels, indexes=indexes)

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import itertools
from tqdm import tqdm
from dgl.dataloading import GraphDataLoader
from IPython.display import clear_output
from sklearn.metrics import average_precision_score
from tabulate import tabulate
from torchmetrics.retrieval import RetrievalMRR, RetrievalHitRate

def train_and_evaluate_link_prediction(
    train_file,
    test_file,
    h_feats=16,
    out_feats=10,
    dropout=0.5,
    epochs=2000,
    lr=0.01,
    k=10,
    train_neg_ratio=1,
    test_neg_ratio=10,
    dataset_name='PersianILP_V1',
    save_model=True):
    """Train GraphSAGE + dot-product scoring on one dataset and report test metrics.

    Args:
        train_file: Path of the training triples file.
        test_file: Path of the test triples file.
        h_feats: Hidden width of the first GraphSAGE layer.
        out_feats: Embedding width produced by the second layer.
        dropout: Dropout probability of the encoder.
        epochs: Number of training epochs.
        lr: Adam learning rate.
        k: Cut-off of the Hit@k metric.
        train_neg_ratio: Negatives sampled per positive training edge.
        test_neg_ratio: Negatives sampled per positive test edge.
        dataset_name: Label used in the result dict, the table and the file name.
        save_model: Whether to write the encoder weights to
            ``model_<dataset_name>.pth`` in the working directory.

    Returns:
        dict: ``{dataset_name: {"LOSS", "MRR", "HIT_at_K", "AUC_PR"}}``.

    MRR and Hit@k rank every positive test edge against *all* sampled
    negative test edges of its batch (``rank = 1 + number of negatives
    scoring at least as high``). Grouping scores by
    ``cat(arange(num_pos), arange(num_neg))`` instead would pair positive
    ``i`` with negative ``i`` only and create groups without any positive.
    """
    # ======== Data preparation ========
    graphs = PersianDGLDataset(train_file=train_file, test_file=test_file)
    sampler = GraphNegativeSampler(
        graphs['train'], graphs['test'],
        train_neg_ratio=train_neg_ratio,
        test_neg_ratio=test_neg_ratio)

    train_pos_g, train_neg_g = sampler.training_graphs
    test_pos_g, test_neg_g = sampler.test_graphs

    # Each dataset holds one sample (the whole graph); the collate_fn unwraps
    # the one-element batch so the loader yields the sample dict directly.
    train_dataset = GraphBatchDataset([graphs['train']], [train_pos_g], [train_neg_g])
    train_loader = GraphDataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x[0])

    test_dataset = GraphBatchDataset([graphs['test']], [test_pos_g], [test_neg_g])
    test_loader = GraphDataLoader(test_dataset, batch_size=1, collate_fn=lambda x: x[0])

    # ======== Model and optimizer ========
    feats = graphs["train"].ndata["feat"].shape[1]
    model = ImprovedGraphSAGE(
        in_feats=feats,
        h_feats=h_feats,
        out_feats=out_feats,
        dropout=dropout
    )
    pred = DotPredictor()
    optimizer = torch.optim.Adam(
        itertools.chain(model.parameters(), pred.parameters()),
        lr=lr
    )

    # ======== Training ========
    def compute_loss(pos_score, neg_score):
        """BCE-with-logits loss; labels share the scores' device and dtype."""
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([
            torch.ones_like(pos_score),
            torch.zeros_like(neg_score)
        ])
        return F.binary_cross_entropy_with_logits(scores, labels)

    all_losses = []
    for epoch in tqdm(range(epochs), desc="Training"):
        model.train()
        epoch_loss = 0.0

        for batch in train_loader:
            batch_graph = batch["graph"]
            pos_graph = batch["pos_graph"]
            neg_graph = batch["neg_graph"]

            h = model(batch_graph, batch_graph.ndata["feat"])
            pos_score = pred(pos_graph, h)
            neg_score = pred(neg_graph, h)
            loss = compute_loss(pos_score, neg_score)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        all_losses.append(epoch_loss / len(train_loader))

    # ======== Evaluation ========
    model.eval()
    result_dict = {dataset_name: {}}

    all_pos_scores = []
    all_neg_scores = []
    all_ranks = []

    for batch in tqdm(test_loader, desc="Evaluating"):
        batch_graph = batch['graph']
        pos_graph = batch['pos_graph']
        neg_graph = batch['neg_graph']

        with torch.no_grad():
            h = model(batch_graph, batch_graph.ndata["feat"])
            pos_scores = pred(pos_graph, h).reshape(-1)
            neg_scores = pred(neg_graph, h).reshape(-1)

        # (num_pos, num_neg) comparison; row i counts the negatives that beat
        # or tie positive i, which gives its rank without any sorting.
        beaten_by = (neg_scores.unsqueeze(0) >= pos_scores.unsqueeze(1)).sum(dim=1)
        all_ranks.append((beaten_by + 1).cpu())

        all_pos_scores.append(pos_scores.cpu())
        all_neg_scores.append(neg_scores.cpu())

    pos_all = torch.cat(all_pos_scores)
    neg_all = torch.cat(all_neg_scores)
    ranks = torch.cat(all_ranks).double()

    # Final metrics. The loss covers every test batch instead of only the last one.
    result_dict[dataset_name]['LOSS'] = F.binary_cross_entropy_with_logits(
        torch.cat([pos_all, neg_all]),
        torch.cat([torch.ones_like(pos_all), torch.zeros_like(neg_all)])).item()
    has_positives = ranks.numel() > 0
    result_dict[dataset_name]['MRR'] = (1.0 / ranks).mean().item() if has_positives else 0.0
    result_dict[dataset_name]['HIT_at_K'] = (ranks <= k).double().mean().item() if has_positives else 0.0

    labels_pr = np.concatenate([
        np.ones(len(pos_all)),
        np.zeros(len(neg_all))
    ])
    scores_pr = torch.cat([pos_all, neg_all]).numpy()
    result_dict[dataset_name]['AUC_PR'] = average_precision_score(labels_pr, scores_pr)

    # ======== Display results ========
    clear_output()
    headers = ['DataSet', 'AUC_PR', 'MRR', 'HIT_at_K', 'LOSS']
    table_data = [[
        dataset_name,
        result_dict[dataset_name]['AUC_PR'],
        result_dict[dataset_name]['MRR'],
        result_dict[dataset_name]['HIT_at_K'],
        result_dict[dataset_name]['LOSS']
    ]]

    # The printed heading is Persian for "Final experiment results:".
    print("\nنتایج نهایی آزمایش:")
    print(tabulate(
        table_data,
        headers=headers,
        tablefmt="fancy_grid",
        floatfmt=".4f",
        stralign="center",
        numalign="left"
    ))

    # ======== Save model ========
    if save_model:
        torch.save(model.state_dict(), f'model_{dataset_name}.pth')
        # The printed message is Persian for "Model saved to '<file>'."
        print(f"\nمدل در 'model_{dataset_name}.pth' ذخیره شد.")

    return result_dict

In [None]:
# Short (100-epoch) run of the full train/evaluate pipeline.
# NOTE(review): the data files are the WN18RR_v1_ind split, but the run is
# labelled 'PersianILP_V1', so the result table and the saved file
# 'model_PersianILP_V1.pth' carry that label - confirm which of the two is intended.
train_and_evaluate_link_prediction(
    ILP_dataset_paths['WN18RR_v1_ind']['train'],
    ILP_dataset_paths['WN18RR_v1_ind']['test'],
    h_feats=16,
    out_feats=10,
    dropout=0.5,
    epochs=100,
    lr=0.01,
    k=10,
    train_neg_ratio=1,
    test_neg_ratio=10,
    dataset_name='PersianILP_V1',
    save_model=True)

In [None]:
# Run the experiment once per dataset listed in ILP_dataset_paths, threading
# the accumulated `results` dict through every call.
# NOTE(review): run_Inductive_link_prediction_experiment and print_final_results
# are not defined in the cells around this one - confirm they exist earlier in
# the notebook, otherwise this cell raises NameError.
results = {}
for key, value in ILP_dataset_paths.items():
  results = run_Inductive_link_prediction_experiment( train_file = value['train'],
                                                      test_file  = value['test'],
                                                      dataset_name = key,
                                                      result_dict = results)
  print(key, value['train'])
  # Called inside the loop, i.e. once after every dataset.
  print_final_results(results)

**Link Prediction With GraphSAGE + TransE**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.nn import SAGEConv, TransE
from torch.utils.data import DataLoader
from sklearn.metrics import average_precision_score

# Step 0: Hyperparameter Setting
result = {}
epoch = 50
result['Dataset'] = 'Persian_LP'
# Negatives sampled per positive edge. Set explicitly so this cell no longer
# depends on whichever earlier cell assigned these globals last; the values
# match the most recent assignments above (1 and 50).
train_neg_ratio = 1
test_neg_ratio = 50

# Step 1: Load Data
# kg = PersianDGLDataset('/content/FarsiBase/triple.csv')
train_file = '/content/Data_InductiveLinkPrediction/WN18RR_v1_ind/train.txt'
test_file =  '/content/Data_InductiveLinkPrediction/WN18RR_v1_ind/test.txt'
kg = PersianDGLDataset(train_file=train_file, test_file=test_file)

train_graph = kg["train"]
test_graph = kg["test"]
feat_dim = train_graph.ndata["feat"].shape[1]
# Number of relation types = largest relation id seen in either split + 1.
# (The number of edges, `edata["e_type"].shape[0]`, is not the relation count.)
num_rels = max(train_graph.edata["e_type"].max().item(),
               test_graph.edata["e_type"].max().item()) + 1

# Step 2: Generate Negative And Positive Graph And Batching Data
# Sample from the graphs loaded in this cell (`kg`), not from the `graphs`
# object left behind by a previous experiment cell.
sampler = GraphNegativeSampler(
    train_graph, test_graph,
    train_neg_ratio=train_neg_ratio,
    test_neg_ratio=test_neg_ratio
)
train_pos_g, train_neg_g = sampler.training_graphs
test_pos_g,  test_neg_g  = sampler.test_graphs
# Each dataset holds one sample (the whole graph); the collate_fn unwraps the
# one-element batch so the loader yields the sample dict directly.
train_dataset = GraphBatchDataset([kg['train']], [train_pos_g], [train_neg_g])
train_dataloader = DataLoader(train_dataset, batch_size=1, collate_fn=lambda x: x[0])
test_dataset = GraphBatchDataset([kg['test']], [test_pos_g], [test_neg_g])
test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=lambda x: x[0])

# Step 3: Model Definition And Optimization
# The encoder output width (64) must equal the TransE feature width.
sage_model = ImprovedGraphSAGE(in_feats=feat_dim, h_feats=64, out_feats=64)
transe_scorer = TransE(num_rels=num_rels, feats=64)
optimizer = torch.optim.Adam(list(sage_model.parameters()) + list(transe_scorer.parameters()), lr=0.01)

# Step 4: Loss Function
def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (with logits) over positive and negative edge scores.

    Positive scores are labelled 1 and negative scores 0; the two groups are
    concatenated and reduced with the default (mean) reduction.

    Args:
        pos_score: Tensor of raw scores for positive edges.
        neg_score: Tensor of raw scores for negative edges.

    Returns:
        Scalar tensor holding the loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # *_like keeps the labels on the same device and dtype as the scores,
    # so the loss also works when the model runs on GPU or in another precision.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

# Step 5: Training Phase
# Put both trainable modules in training mode once, before the loop.
sage_model.train()
transe_scorer.train()

# Defined up front so result["LOSS"] is still valid if no epoch runs.
total_loss = 0.0
# Use a separate loop variable so the `epoch` hyperparameter is not overwritten.
for epoch_idx in tqdm(range(epoch)):
    total_loss = 0.0  # loss accumulated over the batches of the current epoch
    for batch in train_dataloader:
        batch_graph = batch["graph"]
        pos_graph = batch["pos_graph"]
        neg_graph = batch["neg_graph"]

        # Forward: encode nodes on the message-passing graph, then score the
        # positive and negative edges from the embeddings of their endpoints.
        h = sage_model(batch_graph, batch_graph.ndata["feat"])
        pos_src, pos_dst = pos_graph.edges()
        neg_src, neg_dst = neg_graph.edges()
        pos_rels = pos_graph.edata["e_type"]
        neg_rels = neg_graph.edata["e_type"]
        pos_score = transe_scorer(h[pos_src], h[pos_dst], pos_rels)
        neg_score = transe_scorer(h[neg_src], h[neg_dst], neg_rels)
        loss = compute_loss(pos_score, neg_score)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

# Only the summed loss of the final epoch is reported.
result["LOSS"] = round(total_loss, 4)

# Step 6: Evaluating Phase
# Score the held-out positive/negative edges and store the average precision
# (reported as AUC-PR) in `result`; each batch overwrites the previous value.
for batch in test_dataloader:
    test_batch_graph, test_pos_graph, test_neg_graph = (
        batch["graph"],
        batch["pos_graph"],
        batch["neg_graph"],
    )

    # Switch both modules to inference mode and disable gradient tracking.
    sage_model.eval()
    transe_scorer.eval()
    with torch.no_grad():
        # Node embeddings from the test message-passing graph.
        h = sage_model(test_batch_graph, test_batch_graph.ndata["feat"])

        # Endpoints and relation ids of the edges to be scored.
        (pos_src, pos_dst), (neg_src, neg_dst) = test_pos_graph.edges(), test_neg_graph.edges()
        pos_rels, neg_rels = test_pos_graph.edata["e_type"], test_neg_graph.edata["e_type"]

        pos_score = transe_scorer(h[pos_src], h[pos_dst], pos_rels)
        neg_score = transe_scorer(h[neg_src], h[neg_dst], neg_rels)

        # Calculate AUC-PR Metric: label 1 for positive edges, 0 for negative ones.
        all_scores = torch.cat((pos_score, neg_score)).cpu().numpy()
        all_labels = torch.cat(
            (torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0]))
        ).cpu().numpy()
        auc_pr = average_precision_score(all_labels, all_scores)
        result['AUC-PR'] = round(auc_pr, 4)

# Step 7: Display Result
# Render the collected metrics as a one-row grid table.
from tabulate import tabulate
headers = list(result.keys())
values = [list(result.values())]
# sep='' stops print() from inserting a space between the blank lines and the
# table, which would shift the top border of the grid one column to the right.
print('\n\n',
      tabulate(values,
               headers=headers,
               tablefmt="grid",
               floatfmt=".4f"),
      sep='')