### Install Library

In [None]:
# Install the third-party packages needed by later cells, then clear this
# cell's output so the pip logs are not left in the notebook.
from IPython.display import clear_output
!pip install scikit-learn
!pip install deep-translator
clear_output()

import os
import transformers

### Extract Zip File

In [None]:
import pandas as pd
import zipfile
import os

# Unpack the FarsiBase archive into a working folder (created on demand).
zip_path = "/content/FarsiBaseTriple.zip"
extract_path = "/content/extracted_excels"
os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(extract_path)

# Collect the Excel workbooks found directly inside the extraction folder and
# start from an empty frame.
excel_files = [
    os.path.join(extract_path, name)
    for name in os.listdir(extract_path)
    if name.endswith(('.xlsx', '.xls'))
]
merged_df = pd.DataFrame()

### Save Incomplete Triple

In [None]:
import os
import pandas as pd

extract_path = "/content/extracted_excels"
output_path = "/content/FarsiBase"
os.makedirs(output_path, exist_ok=True)  # create output folder if it does not exist

# os.listdir returns entries in arbitrary order; sort them so the merged row
# order (and every file derived from it) is the same on every run.
csv_files = sorted(
    os.path.join(extract_path, f)
    for f in os.listdir(extract_path)
    if f.lower().endswith('.csv')
)
print(f"🔎 Number of CSV files found: {len(csv_files)}")

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so stop early with a message that names the actual problem.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in '{extract_path}'. Extract the archive before running this cell."
    )

# Stack all CSV files into one frame and keep a merged copy on disk.
merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
merged_df.to_csv('/content/mergeData.csv', index=False, encoding='utf-8-sig')
print("✅ All files merged successfully.\n")

print("📊 Full report of merged DataFrame:")
print("=================================")
print(f"➡ Total number of rows: {len(merged_df):,}")
print(f"➡ Total number of columns: {len(merged_df.columns)}")
print("\n🔹 Number of null values in each column:")
print(merged_df.isnull().sum())

def simplify_uri(uri):
    """Reduce a URI to its local name (the part after the last '/' or '#').

    Args:
        uri: The value to simplify. Non-string values (e.g. NaN, None,
            numbers) are returned unchanged.

    Returns:
        The local name as a string, or None when nothing is left after
        trimming (empty / whitespace-only input, or only separators), so the
        value is treated as missing rather than as a filled-in empty string.
    """
    if not isinstance(uri, str):
        return uri
    # Drop trailing separators first: "…/resource/Tehran/" must yield
    # "Tehran", not the empty segment that follows the final slash.
    trimmed = uri.strip().rstrip("/#").rstrip()
    local_name = trimmed.split("/")[-1].split("#")[-1]
    return local_name or None

cols_to_simplify = ["subjectLabel", "predicateLabel", "objectLabel"]

# Build a simplified copy of the merged data: every label column is passed
# through simplify_uri, then exact duplicates and fully-empty rows are dropped.
simplified_df = merged_df.assign(
    **{name: merged_df[name].apply(simplify_uri) for name in cols_to_simplify}
)
simplified_df = simplified_df.drop_duplicates().dropna(how='all')
print(f"\n♻ Number of rows after removing duplicates: {len(simplified_df):,}")

# Number of non-null label cells per row (0-3); used to bucket the rows below.
simplified_df["filled_count"] = simplified_df[cols_to_simplify].notna().sum(axis=1)


def _export_by_filled_count(count, filename):
    """Save the rows having exactly `count` filled labels; return (frame, path)."""
    subset = simplified_df.loc[simplified_df["filled_count"] == count].drop(columns=["filled_count"])
    target = os.path.join(output_path, filename)
    subset.to_csv(target, index=False, encoding='utf-8-sig')
    return subset, target


complete_df, complete_path = _export_by_filled_count(3, "complete_triples.csv")
print(f"\n💾 Complete triples file ({len(complete_df):,} rows) saved at {complete_path}")

two_filled_df, two_path = _export_by_filled_count(2, "triples_with_two_values.csv")
print(f"💾 Triples with two values file ({len(two_filled_df):,} rows) saved at {two_path}")

one_filled_df, one_path = _export_by_filled_count(1, "triples_with_one_value.csv")
print(f"💾 Triples with one value file ({len(one_filled_df):,} rows) saved at {one_path}")

print("\n🎉 Processing completed successfully!")
print(f"\n📝 Final report:\n{simplified_df.count()}")

🔎 Number of CSV files found: 231
✅ All files merged successfully.

📊 Full report of merged DataFrame:
➡ Total number of rows: 2,306,413
➡ Total number of columns: 3

🔹 Number of null values in each column:
subjectLabel      1920360
predicateLabel     245845
objectLabel        182278
dtype: int64

♻ Number of rows after removing duplicates: 251,600

💾 Complete triples file (228,927 rows) saved at /content/FarsiBase/complete_triples.csv
💾 Triples with two values file (21,492 rows) saved at /content/FarsiBase/triples_with_two_values.csv
💾 Triples with one value file (1,181 rows) saved at /content/FarsiBase/triples_with_one_value.csv

🎉 Processing completed successfully!

📝 Final report:
subjectLabel      238789
predicateLabel    243026
objectLabel       249131
filled_count      251600
dtype: int64


### FarsiBase Data Cleaning

In [None]:
import pandas as pd

# Read Data
# Load the complete-triples CSV into `df`, the frame cleaned by the rest of this cell.
df = pd.read_csv("/content/FarsiBase/complete_triples.csv")

# Translation table built once (not on every call). Maps Persian digits
# (U+06F0-U+06F9) and Arabic-Indic digits (U+0660-U+0669) to ASCII '0'-'9'.
_DIGITS_TO_ASCII = {
    base + offset: str(offset)
    for base in (0x06F0, 0x0660)
    for offset in range(10)
}


def convert_persian_to_english(number):
    """Return `number` as text with Persian/Arabic-Indic digits replaced by ASCII digits.

    Args:
        number: Any value; it is converted with ``str()`` before translation.

    Returns:
        The translated string. Missing values (None or float NaN) are returned
        unchanged so they stay missing instead of becoming the literal text
        "None" / "nan".
    """
    # `x != x` is only true for NaN.
    if number is None or (isinstance(number, float) and number != number):
        return number
    return str(number).translate(_DIGITS_TO_ASCII)

# Normalize digits in all three label columns.
for column in ['subjectLabel', 'predicateLabel', 'objectLabel']:
    df[column] = df[column].apply(convert_persian_to_english)

# Clean Relation
# Ordered (regex, Persian replacement) pairs. Each regex is anchored with '^'
# when applied, so only a leading occurrence of the English label is replaced.
# Regex metacharacters must be escaped here: the parentheses in
# "distance to London (μ)" would otherwise form a group and never match the
# literal "(μ)" text.
predicate_prefix_translations = [
    (r'dcterms#subject', 'موضوع/محتوا'),
    (r'subject', 'موضوع'),
    (r'birth place', 'محل تولد'),
    (r'birth_place', 'محل تولد'),
    (r'birthPlace', 'محل تولد'),
    (r'instanceOf', 'نمونه‌ای از'),
    (r'deathPlace', 'محل مرگ'),
    (r'death place', 'محل مرگ'),
    (r'field', 'موضوع'),
    (r'genre', 'ژانر'),
    (r'nationality', 'ملیت'),
    (r'occupation', 'شغل'),
    (r'picture', 'تصویر'),
    (r'ActiveYears', 'سال‌های فعالیت'),
    (r'activeYears', 'سال‌های فعالیت'),
    # NOTE(review): this replacement re-introduces a Persian digit after the
    # digit normalization above — confirm that is intended.
    (r'timezone1 dst', 'ناحیه زمانی ۱'),
    (r'confed_cup', 'جام کنفدراسیون'),
    (r'distance to London \(μ\)', 'فاصله تا لندن (میانگین)'),
    (r'fs_date', 'تاریخ سیستم فایل'),
    (r'państwo', 'کشور'),
    (r'sp_date', 'تاریخ طرح'),
]
for prefix_regex, persian_label in predicate_prefix_translations:
    df['predicateLabel'] = df['predicateLabel'].str.replace(
        rf'^{prefix_regex}', persian_label, regex=True
    )

# Remove duplicate row
df = df.drop_duplicates()
df.to_csv("/content/FarsiBase/complete_triples.csv", index=False, encoding='utf-8-sig')

#### FarsiBase Triple Normalization (Via Translation)

In [None]:
import pandas as pd
import re
from deep_translator import GoogleTranslator

# 1. Read the dataset
df = pd.read_csv('/content/FarsiBase/complete_triples.csv')
# The three label columns processed by the extraction/translation steps of this cell.
columns = ['subjectLabel', 'predicateLabel', 'objectLabel']

# 2. Translation function with caching
# Successful translations only, keyed by (word, target language) so that a
# lookup for one language can never return a translation made for another.
translation_cache = {}
def translate_word(word, target_lang="fa"):
    """Translate a single word with Google Translate, memoizing successes.

    Args:
        word: The text to translate.
        target_lang: Target language code passed to GoogleTranslator
            (default "fa").

    Returns:
        The translated text. If the translation fails or comes back empty,
        the original `word` is returned unchanged (the same fallback
        `translate_relation` uses), and nothing is cached, so a later call
        can retry instead of reusing a failure forever.
    """
    cache_key = (word, target_lang)
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    try:
        translated = GoogleTranslator(source='auto', target=target_lang).translate(word)
    except Exception as e:
        # Best effort: report the problem and keep the original text rather
        # than writing a synthetic placeholder token into the dataset.
        print(f"⚠️ Error translating {word}: {e}")
        return word
    if not translated:
        # Guard against an empty/None result from the translator.
        return word
    translation_cache[cache_key] = translated
    return translated

# 3. Extract and translate unique English words per column
column_translations = {col: {} for col in columns}
for col in columns:
    # Skip missing cells so NaN is not stringified to "nan" and then
    # "translated" as if it were an English word.
    all_text = ' '.join(df[col].dropna().astype(str))
    english_words = set(re.findall(r'\b[a-zA-Z]+\b', all_text))

    # Sorted so the order of translation requests (and any error log) is
    # the same on every run.
    for word in sorted(english_words):
        column_translations[col][word] = translate_word(word)

# 4. Apply translation mappings to each column
# Only a whole English word at the very start of a cell is replaced, in a
# single pass per column. Matching the whole leading word (instead of
# `^<word>` without a boundary) prevents a short word such as "in" from
# rewriting the prefix of a longer token such as "instanceOf", and makes the
# outcome independent of the order in which the words are visited.
for col, translations in column_translations.items():
    if not translations:
        continue
    df[col] = df[col].str.replace(
        r'^[a-zA-Z]+\b',
        lambda match, table=translations: table.get(match.group(0)) or match.group(0),
        regex=True,
    )

# Remove duplicate row
df = df.drop_duplicates()
df.to_csv("/content/FarsiBase/complete_triples.csv", index=False, encoding='utf-8-sig')
print("✅ All translations applied to the DataFrame.")

✅ All translations applied to the DataFrame.


### Data Shuffling and Aggregation (FarsiBase + DeepSeek)

In [None]:
import pandas as pd
import numpy as np

# Load both triple sources and stack them row-wise into one frame.
DeepSeek_df = pd.read_excel('/content/DeepSeek_Triple.xlsx')
input_file = '/content/FarsiBase/complete_triples.csv'
FarsiBase_df = pd.read_csv(input_file)
df = pd.concat((DeepSeek_df, FarsiBase_df))

# Randomly reorder every row (fixed seed) and renumber the index from 0.
shuffled_df = df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)

# Persist the shuffled data as CSV.
output_file = '/content/FarsiBase/shuffled_triple.csv'
shuffled_df.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"The Excel file has been randomly shuffled and saved to '{output_file}'.")

The Excel file has been randomly shuffled and saved to '/content/FarsiBase/shuffled_triple.csv'.


### PersianILP Normalizing

In [None]:
import pandas as pd
from deep_translator import GoogleTranslator

# Google Translation
def translate_relation(relation, target_lang="fa"):
    """Translate `relation` into `target_lang` using GoogleTranslator.

    On any error the problem is printed and the untranslated `relation`
    is returned instead.
    """
    try:
        translator = GoogleTranslator(source='auto', target=target_lang)
        return translator.translate(relation)
    except Exception as err:
        print(f"Error while translating '{relation}': {err}")
    return relation

# Translate and normalize dataset
def normalize_excel(input_path, output_path, use_translation=False):
    """Normalize the 'predicateLabel' column of a CSV file and save the result.

    Non-missing predicates are converted to strings. When `use_translation`
    is true, predicates made only of ASCII characters are translated with
    `translate_relation`; each distinct predicate is translated once and the
    result reused for every row that carries it. Afterwards duplicate rows
    are dropped, the rows are shuffled with a fixed seed, and the frame is
    written to `output_path`.

    Args:
        input_path: Path of the CSV file to read (must contain a
            'predicateLabel' column).
        output_path: Path of the CSV file to write.
        use_translation: Whether ASCII-only predicates should be translated.
    """
    df = pd.read_csv(input_path)

    # Memo of relation -> translation for this call. Translating per distinct
    # value instead of per row avoids repeating the same request for every
    # row that shares a predicate.
    translated = {}

    def _normalize(relation):
        if pd.isna(relation):
            return relation
        relation = str(relation)
        if not (use_translation and relation.isascii()):
            return relation
        if relation not in translated:
            # Fall back to the original text if the translator returns nothing.
            translated[relation] = translate_relation(relation) or relation
        return translated[relation]

    df['predicateLabel'] = [_normalize(relation) for relation in df['predicateLabel']]

    # Remove duplicate rows and shuffle data
    df = df.drop_duplicates()
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Normalized data has been saved to '{output_path}'.")

# Normalize the predicates of the complete-triples CSV (with translation
# enabled) and write the result to a separate CSV file.
input_excel = "/content/FarsiBase/complete_triples.csv"
output_excel = "/content/FarsiBase/triple.csv"
normalize_excel(input_excel, output_excel, use_translation=True)

Normalized data has been saved to '/content/FarsiBase/triple.csv'.


In [None]:
import pandas as pd
import networkx as nx
from collections import Counter

def analyze_dataset_for_link_prediction(file_path, has_header=True):
    """Print a structural report of a triple dataset and a link-prediction suitability score.

    The CSV is read as (subject, predicate, object) triples, loaded into a
    directed multigraph keyed by predicate, and basic statistics, the degree
    distribution, density/sparsity and weak connectivity are printed, followed
    by a heuristic suitability assessment.

    Args:
        file_path: Path of the CSV file holding three columns
            (subject, predicate, object).
        has_header: True (default) when the first line of the file is a
            header row. Pass False for a header-less file.

    Returns:
        None. All results are printed. The function returns early if the
        file cannot be read or contains no triples.
    """
    # 1. Read the dataset
    try:
        # When `names` is given, pandas treats the file as header-less unless
        # header=0 is passed explicitly; without it the header line would be
        # counted as a triple and its cells as entities/relations.
        df = pd.read_csv(
            file_path,
            names=['subject', 'predicate', 'object'],
            header=0 if has_header else None,
        )
        print(f"✅ Dataset successfully loaded. Number of triples: {len(df):,}")
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return

    # An empty dataset would lead to divisions by zero below.
    if df.empty:
        print("❌ Dataset contains no triples; nothing to analyze.")
        return

    # 2. Compute basic statistics
    num_entities = len(set(df['subject']).union(set(df['object'])))
    num_relations = len(set(df['predicate']))
    print(f"\n📊 Basic statistics:")
    print(f"Number of unique entities: {num_entities:,}")
    print(f"Number of unique relations: {num_relations:,}")

    # 3. Build the graph
    G = nx.MultiDiGraph()  # Directed graph allowing multiple edges between nodes
    # Iterating the columns directly is much cheaper than DataFrame.iterrows().
    for subject, obj, predicate in zip(df['subject'], df['object'], df['predicate']):
        G.add_edge(subject, obj, key=predicate)

    num_nodes = G.number_of_nodes()

    # 4. Analyze node degrees
    degrees = dict(G.degree())
    degree_counts = Counter(degrees.values())

    print("\n📈 Node degree distribution:")
    for degree in (1, 2, 3):
        count = degree_counts.get(degree, 0)
        print(f"• Nodes with degree {degree}: {count:,} ({count/num_nodes:.1%})")

    # 5. Compute graph metrics
    density = nx.density(G)
    sparsity = 1 - density
    avg_degree = sum(degrees.values()) / num_nodes
    print("\n🔍 Graph structural metrics:")
    print(f"Number of nodes: {num_nodes:,}")
    print(f"Number of edges: {G.number_of_edges():,}")
    print(f"Graph density: {density:.6f}")
    print(f"Sparsity: {sparsity:.4f}")
    print(f"Average node degree: {avg_degree:.2f}")

    # 6. Check connectivity
    if nx.is_weakly_connected(G):
        print("\n🔄 The graph is weakly connected")
    else:
        components = nx.number_weakly_connected_components(G)
        print(f"\n🔗 The graph has {components} disconnected components")

    # 7. Final evaluation for link prediction suitability
    print("\n🧪 Link prediction suitability assessment:")

    suitability_score = 0

    # Criterion 1: Relation diversity
    if num_relations > 50:
        print(f"✓ Excellent relation diversity ({num_relations} relation types)")
        suitability_score += 2
    elif num_relations > 10:
        print(f"✓ Acceptable relation diversity ({num_relations} relation types)")
        suitability_score += 1
    else:
        print(f"✗ Insufficient relation diversity ({num_relations} relation types)")

    # Criterion 2: Sparsity
    if sparsity > 0.99:
        print("✓ Ideal sparsity (very suitable for link prediction)")
        suitability_score += 2
    elif sparsity > 0.95:
        print("✓ Acceptable sparsity")
        suitability_score += 1
    else:
        print("✗ Insufficient sparsity")

    # Criterion 3: Degree distribution (fewer than 40% of nodes with degree 1)
    degree_one = degree_counts.get(1, 0)
    if degree_one < num_nodes * 0.4:
        print("✓ Balanced degree distribution")
        suitability_score += 1
    else:
        print(f"✗ Unbalanced degree distribution ({degree_one/num_nodes:.1%} of nodes have degree 1)")

    # Final conclusion
    print("\n🎯 Final conclusion:")
    if suitability_score >= 4:
        print("✅ This dataset is highly suitable for link prediction")
    elif suitability_score >= 2:
        print("⚠️ This dataset requires improvements for link prediction")
    else:
        print("❌ This dataset is not suitable for link prediction")

# Example usage
# Analyze the complete-triples CSV; the report is printed to the cell output.
analyze_dataset_for_link_prediction('/content/FarsiBase/complete_triples.csv')

✅ Dataset successfully loaded. Number of triples: 191,415

📊 Basic statistics:
Number of unique entities: 49,951
Number of unique relations: 2,728

📈 Node degree distribution:
• Nodes with degree 1: 9,003 (18.0%)
• Nodes with degree 2: 11,742 (23.5%)
• Nodes with degree 3: 5,258 (10.5%)

🔍 Graph structural metrics:
Number of nodes: 49,951
Number of edges: 191,415
Graph density: 0.000077
Sparsity: 0.9999
Average node degree: 7.66

🔗 The graph has 573 disconnected components

🧪 Link prediction suitability assessment:
✓ Excellent relation diversity (2728 relation types)
✓ Ideal sparsity (very suitable for link prediction)
✓ Balanced degree distribution

🎯 Final conclusion:
✅ This dataset is highly suitable for link prediction


### Extracting three variants of the main dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os, zipfile

# Total number of triples drawn for each dataset variant (train + validation + test).
TARGET_SIZE = 15000
# Split fractions. TRAIN_RATIO and VAL_RATIO size the train/validation samples
# (the test sample is the remainder of TARGET_SIZE); the same ratios are also
# used by the inductive splits to partition the entity set.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.7, 0.1, 0.2


# Full inductive split
def split_full_inductive(file_path):
    """Build train/validation/test triple sets over pairwise-disjoint entity sets.

    The entities (first and third CSV columns) are partitioned into three
    disjoint groups according to TRAIN_RATIO / VAL_RATIO / TEST_RATIO. A
    triple belongs to a split only when both of its entities fall in that
    split's group. Each split is then sampled down to its share of
    TARGET_SIZE with a fixed seed.

    Args:
        file_path: Path of the CSV file holding the triples.

    Returns:
        A (train_df, val_df, test_df) tuple of DataFrames.

    Raises:
        ValueError: If a split holds fewer triples than the number that has
            to be sampled from it.
    """
    df = pd.read_csv(file_path)
    heads, tails = df.iloc[:, 0], df.iloc[:, 2]
    all_entities = set(heads.dropna()).union(tails.dropna())

    # Iteration order of a set of strings changes between interpreter
    # sessions (string hashes are randomized), so `random_state` alone does
    # not make the split reproducible. Sorting gives the shuffler a stable
    # input; key=str keeps mixed value types sortable.
    entity_list = sorted(all_entities, key=str)

    # disjoint split of entities
    train_entities, remainder = train_test_split(entity_list, test_size=(1 - TRAIN_RATIO), random_state=42)
    val_entities, test_entities = train_test_split(
        remainder, test_size=TEST_RATIO / (VAL_RATIO + TEST_RATIO), random_state=42
    )

    # enforce disjoint sets
    train_entities, val_entities, test_entities = set(train_entities), set(val_entities), set(test_entities)
    assert train_entities.isdisjoint(val_entities)
    assert train_entities.isdisjoint(test_entities)
    assert val_entities.isdisjoint(test_entities)

    train_df = df[heads.isin(train_entities) & tails.isin(train_entities)]
    val_df = df[heads.isin(val_entities) & tails.isin(val_entities)]
    test_df = df[heads.isin(test_entities) & tails.isin(test_entities)]

    # sample to fixed size
    n_train, n_val = int(TARGET_SIZE*TRAIN_RATIO), int(TARGET_SIZE*VAL_RATIO)
    n_test = TARGET_SIZE - n_train - n_val

    # Report an undersized split by name instead of letting DataFrame.sample
    # fail with a generic "larger sample than population" error.
    for split_name, frame, wanted in (
        ("train", train_df, n_train),
        ("validation", val_df, n_val),
        ("test", test_df, n_test),
    ):
        if len(frame) < wanted:
            raise ValueError(
                f"Full-inductive {split_name} split has only {len(frame):,} triples "
                f"but {wanted:,} are required; lower TARGET_SIZE or adjust the ratios."
            )

    return (
        train_df.sample(n=n_train, random_state=42),
        val_df.sample(n=n_val, random_state=42),
        test_df.sample(n=n_test, random_state=42)
    )


# Semi-inductive split
def split_semi_inductive(file_path):
    """Build train/validation/test triple sets where evaluation triples link a seen and an unseen entity.

    The entities (first and third CSV columns) are partitioned into a "seen"
    group and two disjoint held-out groups (validation, test) according to
    TRAIN_RATIO / VAL_RATIO / TEST_RATIO. Training triples have both entities
    in the seen group; validation (test) triples have exactly one entity in
    the seen group and the other in the validation (test) group. Each split
    is then sampled down to its share of TARGET_SIZE with a fixed seed.

    Args:
        file_path: Path of the CSV file holding the triples.

    Returns:
        A (train_df, val_df, test_df) tuple of DataFrames.

    Raises:
        ValueError: If a split holds fewer triples than the number that has
            to be sampled from it.
    """
    df = pd.read_csv(file_path)
    heads, tails = df.iloc[:, 0], df.iloc[:, 2]
    all_entities = set(heads.dropna()).union(tails.dropna())

    # Iteration order of a set of strings changes between interpreter
    # sessions (string hashes are randomized), so `random_state` alone does
    # not make the split reproducible. Sorting gives the shuffler a stable
    # input; key=str keeps mixed value types sortable.
    entity_list = sorted(all_entities, key=str)

    seen_entities, remainder = train_test_split(entity_list, test_size=(1 - TRAIN_RATIO), random_state=42)
    val_entities, test_entities = train_test_split(
        remainder, test_size=TEST_RATIO / (VAL_RATIO + TEST_RATIO), random_state=42
    )

    seen_entities, val_entities, test_entities = set(seen_entities), set(val_entities), set(test_entities)

    # enforce disjoint sets
    assert seen_entities.isdisjoint(val_entities.union(test_entities))
    assert val_entities.isdisjoint(test_entities)

    head_seen, tail_seen = heads.isin(seen_entities), tails.isin(seen_entities)

    # train: both entities in seen
    train_df = df[head_seen & tail_seen]

    # val: one entity in seen, one in val_entities
    val_mask = (head_seen & tails.isin(val_entities)) | (heads.isin(val_entities) & tail_seen)
    val_df = df[val_mask]

    # test: one entity in seen, one in test_entities
    test_mask = (head_seen & tails.isin(test_entities)) | (heads.isin(test_entities) & tail_seen)
    test_df = df[test_mask]

    # sample to fixed size
    n_train, n_val = int(TARGET_SIZE*TRAIN_RATIO), int(TARGET_SIZE*VAL_RATIO)
    n_test = TARGET_SIZE - n_train - n_val

    # Report an undersized split by name instead of letting DataFrame.sample
    # fail with a generic "larger sample than population" error.
    for split_name, frame, wanted in (
        ("train", train_df, n_train),
        ("validation", val_df, n_val),
        ("test", test_df, n_test),
    ):
        if len(frame) < wanted:
            raise ValueError(
                f"Semi-inductive {split_name} split has only {len(frame):,} triples "
                f"but {wanted:,} are required; lower TARGET_SIZE or adjust the ratios."
            )

    return (
        train_df.sample(n=n_train, random_state=42),
        val_df.sample(n=n_val, random_state=42),
        test_df.sample(n=n_test, random_state=42)
    )


# Transductive split
def split_transductive(file_path):
    """Shuffle all triples (fixed seed) and cut consecutive train/val/test slices.

    The first int(TARGET_SIZE*TRAIN_RATIO) shuffled rows form the training
    set, the next int(TARGET_SIZE*VAL_RATIO) rows the validation set, and the
    rows after that, up to position TARGET_SIZE, the test set.

    Returns:
        A (train_df, val_df, test_df) tuple of DataFrames.
    """
    shuffled = pd.read_csv(file_path)
    shuffled = shuffled.sample(frac=1, random_state=42).reset_index(drop=True)

    # Slice boundaries within the shuffled frame.
    train_end = int(TARGET_SIZE*TRAIN_RATIO)
    val_end = train_end + int(TARGET_SIZE*VAL_RATIO)

    train_part = shuffled.iloc[:train_end]
    val_part = shuffled.iloc[train_end:val_end]
    test_part = shuffled.iloc[val_end:TARGET_SIZE]
    return train_part, val_part, test_part


# Paths and setup
file_path = '/content/FarsiBase/complete_triples.csv'
output_dir = '/content/PersianILP-trainTest'
os.makedirs(output_dir, exist_ok=True)

# Dataset variant name -> function producing its (train, val, test) frames.
versions = {
    'PersianILP_V1': split_full_inductive,
    'PersianILP_V2': split_semi_inductive,
    'PersianILP_V3': split_transductive
}

# Write each variant into its own sub-folder as train/valid/test CSV files.
for version_name, split_func in versions.items():
    version_dir = os.path.join(output_dir, version_name)
    os.makedirs(version_dir, exist_ok=True)

    train_data, val_data, test_data = split_func(file_path)
    train_data.to_csv(os.path.join(version_dir,'train.csv'),index=False,encoding='utf-8-sig')
    val_data.to_csv(os.path.join(version_dir,'valid.csv'),index=False,encoding='utf-8-sig')
    test_data.to_csv(os.path.join(version_dir,'test.csv'),index=False,encoding='utf-8-sig')

    print(f"[{version_name}] Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

# compress into one zip
zip_path = os.path.join(output_dir, 'PersianILP-data.zip')
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(output_dir):
        dirs.sort()  # visit sub-folders in a stable order
        for file in sorted(files):
            # The archive itself lives in output_dir; never add zip files to it.
            if not file.endswith('.zip'):
                # A dedicated name is used here so the dataset path stored in
                # `file_path` above is not overwritten by the loop.
                member_path = os.path.join(root, file)
                arcname = os.path.relpath(member_path, output_dir)
                zipf.write(member_path, arcname)
print(f"✅ ZIP file created successfully:\n{zip_path}")

[PersianILP_V1] Train: 10500, Val: 1500, Test: 3000
[PersianILP_V2] Train: 10500, Val: 1500, Test: 3000
[PersianILP_V3] Train: 10500, Val: 1500, Test: 3000
✅ ZIP file created successfully:
/content/PersianILP-trainTest/PersianILP-data.zip
