In [None]:
import pandas as pd

# Load the TSV file into a DataFrame.
# Path of the tab-separated input, relative to the notebook's working directory.
file_path = 'constraint-corrections-oneOf.tsv'

# The file is read with header=None, so column names are supplied explicitly.
# ".t0" / ".t1" columns hold a (subject, predicate, object) triple each;
# "V11".."V14" are placeholder names for the trailing columns
# (NOTE(review): their meaning is not visible in this notebook — confirm).
col_names = [
    "constraint.statement",
    "revision.id.url",
    "subject.t0", "predicate.t0", "object.t0",
    "follows.symbol",
    "subject.t1", "predicate.t1", "object.t1", 
    "cud.action",
    "V11", "V12", "V13", "V14"
]
# Later cells read 'revision.id.url', 'predicate.t0' and 'subject.t0' from this frame.
df = pd.read_csv(file_path, sep='\t', header=None, names=col_names)

In [None]:
# Display the freshly loaded DataFrame as the cell output.
df

# Extract Users

In [None]:
# Pull the numeric revision id and the property id (P<digits>) out of the URI
# columns: each pattern captures the last path segment before the closing '>'.
df['revision_id'] = df['revision.id.url'].str.extract(r".*/(\d+)>", expand=False)
df['property_id'] = df['predicate.t0'].str.extract(r".*/(P\d+)>", expand=False)

# Link to the property page at that revision; rows where either id could not
# be extracted end up with a missing link.
df['result_string'] = (
    "https://www.wikidata.org/w/index.php?title=Property:"
    + df['property_id']
    + "&oldid="
    + df['revision_id']
)

# Placeholder column, filled in later by the revision-author lookup.
df['Full_User_URL'] = ''

In [None]:
# Reorder columns: derived columns first, then every other column in its
# current order. The remaining columns are selected by name rather than by
# position, so re-running this cell leaves the frame unchanged instead of
# duplicating the derived columns and dropping the last four original ones.
leading_cols = ['revision_id', 'property_id', 'result_string', 'Full_User_URL']
new_order = leading_cols + [col for col in df.columns if col not in leading_cols]
df = df[new_order]

In [None]:
# Display the DataFrame after the column reordering.
df

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import numpy as np

# Wikidata API base URL
wikidata_api_url = "https://www.wikidata.org/w/api.php"
output_file = "users_one_of.tsv"

# For every row that still lacks a user URL, ask the MediaWiki API who authored
# the revision and store the user-page URL in the DataFrame.
# Rows are addressed with df.loc[i, ...] for i in range(len(df)), so the loop
# relies on the default 0..n-1 index.
# The whole loop is wrapped in try/finally so that everything collected so far
# is written to disk even when the run is interrupted or fails half-way.
try:
    for i in tqdm(range(len(df)), desc="Processing rows"):
        if pd.isna(df.loc[i, 'Full_User_URL']) or df.loc[i, 'Full_User_URL'] == '':
            full_user_url = ""
            revision_id = df.loc[i, 'revision_id']

            # API parameters
            params = {
                'action': 'query',
                'prop': 'revisions',
                'revids': revision_id,
                'rvprop': 'user|timestamp',
                'format': 'json'
            }

            # One failed lookup must not abort the whole run: the row simply
            # keeps its empty URL and is picked up again on the next execution.
            # The timeout keeps a stalled connection from hanging the loop.
            json_data = None
            try:
                response = requests.get(wikidata_api_url, params=params, timeout=30)
                if response.status_code == 200:
                    json_data = response.json()
            except (requests.exceptions.RequestException, ValueError) as exc:
                print(f"Lookup failed for revision {revision_id}: {exc}")

            if json_data is not None:
                # Navigate through the JSON response to find the user
                pages = json_data.get('query', {}).get('pages', {})
                for page_id, page_data in pages.items():
                    revisions = page_data.get('revisions', [])
                    if revisions:
                        user = revisions[0].get('user')
                        if user:
                            # Construct full user URL
                            full_user_url = f"https://www.wikidata.org/wiki/User:{user}"

                            # Update the DataFrame with the Full_User_URL
                            df.at[i, 'Full_User_URL'] = full_user_url

            # Periodic backup of the progress made so far
            if (i+1) % 1000000 == 0:
                print("saving backup")
                df.to_csv(output_file, sep='\t', index=False)
finally:
    # Save of the (possibly partial) DataFrame to the output file
    df.to_csv(output_file, sep='\t', index=False)

print("Processing complete. Output saved to:", output_file)

In [None]:
import pandas as pd

def analyze_user_urls(df, column_name='Full_User_URL', top_n=20):
    """
    Rank the most frequent values of a URL column.

    Args:
        df: DataFrame to analyse.
        column_name: Column whose values are counted.
        top_n: Number of most frequent values to keep.

    Returns:
        A DataFrame indexed by value with a 'Count' column and a 'Share'
        column (percentage of all rows of ``df``), or None after printing an
        error when ``column_name`` is not a column of ``df``.
    """
    if column_name in df.columns:
        urls = df[column_name]
        row_total = len(urls)  # share is relative to every row of the column
        leaders = urls.value_counts().head(top_n)
        return pd.DataFrame({'Count': leaders}).assign(
            Share=lambda frame: frame['Count'] / row_total * 100
        )

    print(f"Error: Column '{column_name}' not found in DataFrame.")
    return None

def print_top_users(df_analysis):
    """
    Print one line per row of the table produced by ``analyze_user_urls``.

    Does nothing when ``df_analysis`` is None. Each line shows the row's index
    value together with its 'Count' and 'Share' (two decimals) entries.
    """
    if df_analysis is not None:
        print("Top Users Analysis:")
        for user_url, stats in df_analysis.iterrows():
            count, share = stats['Count'], stats['Share']
            print(f"User: {user_url}, Count: {count}, Share: {share:.2f}%")

# Example usage: `df` is the notebook-level DataFrame whose 'Full_User_URL'
# column was filled by the revision-author lookup.
# df = pd.read_csv('your_data.csv')  # alternative: load a saved table instead

# Rank the most frequent user URLs (defaults: 'Full_User_URL' column, top 20).
top_users_analysis = analyze_user_urls(df) #run the analysis, and save the result.
# Print the ranking; print_top_users does nothing when the analysis returned None.
print_top_users(top_users_analysis) #pass the result of the analysis to the print function.

# Get Abstracts

### Fetching and Processing DBPedia Abstracts

This code cell implements a pipeline to retrieve abstracts from DBPedia for a list of Wikidata entities.

1.  **File Reading (`read_file`, `extract_subjects`)**: It starts by reading a TSV or CSV file and extracting the unique Wikidata QIDs from the 'subject.t0' column.

2.  **Title Mapping (`load_titles`)**: It uses a local `sitelinks.en.tsv` file to map these Wikidata QIDs to English Wikipedia page titles, which are needed to query DBPedia.

3.  **Abstract Fetching (`get_dbpedia_abstract`)**: For each entity with a found title, it queries the DBPedia SPARQL endpoint to fetch the English abstract.

4.  **Processing and Saving (`process_abstracts`)**: The main function iterates through the entities, fetches abstracts, and saves the results into separate CSV files: one for successfully retrieved abstracts, one for errors during fetching, and one for entities where no sitelink or abstract could be found. The results are processed and saved in batches. 
    - The errors file is intended to be reprocessed until no errors remain or the maximum number of retries (3) is reached.

    - The missing abstracts file, containing entities without sitelinks or DBPedia abstracts, is saved for a fallback method.

In [None]:
%pip install tqdm
%pip install pandas
%pip install requests

In [None]:
import pandas as pd
import requests
import os
import urllib.parse
from tqdm import tqdm

# Generic function to read CSV or TSV files 
def read_file(input_file):
    """
    Load a delimited text file into a DataFrame, choosing the separator from
    the file extension.

    Args:
        input_file: Path ending in '.tsv' (tab-separated) or '.csv'
            (comma-separated).

    Returns:
        The parsed pandas DataFrame.

    Raises:
        ValueError: If the path has neither of the two supported extensions.
    """
    if input_file.endswith('.tsv'):
        separator = '\t'
    elif input_file.endswith('.csv'):
        separator = ','
    else:
        raise ValueError("Unsupported file format. Use .csv or .tsv")
    return pd.read_csv(input_file, sep=separator)

# Extract unique QIDs (subjects) from input file
def extract_subjects(input_file):
    """
    Return the distinct values of the 'subject.t0' column as bare QIDs.

    Each value is converted to a string and stripped of the angle brackets and
    of the 'http://www.wikidata.org/entity/' prefix (e.g. leaving 'Q42').
    The number of collected subjects is printed.

    Args:
        input_file: Path of a .csv or .tsv file readable by ``read_file``.

    Returns:
        List of cleaned identifiers, in order of first appearance.
    """
    table = read_file(input_file)
    qids = [
        str(uri).replace('<', '').replace('>', '').replace('http://www.wikidata.org/entity/', '')
        for uri in table['subject.t0'].unique()
    ]
    print(f"Number of unique subjects: {len(qids)}")
    return qids

# Query the DBpedia SPARQL endpoint for an abstract
def get_dbpedia_abstract(title, retries=3, backoff=1.0):
    """
    Fetch the English abstract of a DBpedia resource.

    Transient failures (network errors, unparsable responses, HTTP 429 and
    HTTP 5xx) are retried up to ``retries`` times with a growing pause between
    attempts. Other HTTP statuses are returned immediately because repeating
    the same request would not change the outcome.

    Args:
        title: Resource name appended to 'http://dbpedia.org/resource/';
            it is inserted into the query as-is, so it must already be
            escaped for use inside an IRI.
        retries: Maximum number of attempts.
        backoff: Base pause in seconds; attempt k (k >= 1) waits
            ``backoff * k`` before retrying.

    Returns:
        The abstract with line breaks replaced by spaces, or one of the marker
        strings 'No abstract found', 'HTTP error <status>',
        'Request error: <details>' or 'Failed after retries' (the latter only
        when no attempt was made at all).
    """
    import time  # local import: the import block of this cell does not provide it

    endpoint = "http://dbpedia.org/sparql"
    query = f"""
    SELECT ?abstract WHERE {{
        <http://dbpedia.org/resource/{title}> dbo:abstract ?abstract .
        FILTER (lang(?abstract) = "en")
    }}
    """
    params = {'query': query, 'format': 'json'}

    last_error = "Failed after retries"
    for attempt in range(retries):
        if attempt:
            time.sleep(backoff * attempt)

        try:
            response = requests.get(endpoint, params=params, timeout=20)
        except requests.exceptions.RequestException as e:
            last_error = f"Request error: {e}"
            continue

        if response.status_code == 200:
            try:
                results = response.json()
            except ValueError as e:
                # Body was not valid JSON (e.g. an HTML error page served with 200)
                last_error = f"Request error: {e}"
                continue
            bindings = results.get('results', {}).get('bindings', [])
            if bindings:
                abstract = bindings[0]['abstract']['value']
                return abstract.replace('\n', ' ').replace('\r', ' ')
            return 'No abstract found'

        last_error = f"HTTP error {response.status_code}"
        if response.status_code != 429 and response.status_code < 500:
            break  # client-side error: retrying the identical request is pointless

    return last_error

# Map QIDs to their Wikipedia titles from a sitelinks file
def load_titles(subjects, sitelinks_file):
    """
    Map QIDs to percent-encoded Wikipedia page titles.

    Reads the tab-separated sitelinks file and keeps the rows whose 'label'
    is 'wikipedia_sitelink'. For those rows 'node1' is reduced to the bare QID
    and the last path segment of 'node2' is taken as the title, percent-encoded
    with ``()_'`` left untouched. When a QID occurs several times, the last row
    wins.

    Args:
        subjects: Iterable of QIDs to look up.
        sitelinks_file: Path of the TSV file with 'node1', 'label', 'node2'
            columns.

    Returns:
        Dict {qid: encoded title} containing only the subjects that have a
        sitelink.
    """
    sitelinks_df = pd.read_csv(sitelinks_file, sep='\t')
    wiki_rows = sitelinks_df[sitelinks_df['label'] == 'wikipedia_sitelink']

    sitelinks_map = {}
    for node, link in zip(wiki_rows['node1'], wiki_rows['node2']):
        qid = node.replace('<', '').replace('>', '').replace('http://www.wikidata.org/entity/', '')
        page_name = link.split('/')[-1]
        # Escape special characters for a valid DBpedia resource URI
        sitelinks_map[qid] = urllib.parse.quote(page_name, safe="()_'")

    return {subject: sitelinks_map[subject] for subject in subjects if subject in sitelinks_map}

# Main loop for processing and saving abstracts 
def process_abstracts(input_file, output_file, error_file, missing_file, sitelinks_file, batch_size=1000):
    """
    Fetch DBpedia abstracts for the subjects of ``input_file`` and append the
    outcomes to three CSV files.

    Subjects already present in ``output_file`` are skipped, so an interrupted
    run can be resumed. Every other subject ends up in exactly one file:

    * ``output_file``  ('subject.t0', 'abstract') - abstract retrieved;
    * ``error_file``   ('subject.t0', 'Error')    - the lookup returned an
      error marker ('HTTP error', 'Request error', 'Failed after retries');
    * ``missing_file`` ('subject.t0', 'Reason')   - no sitelink or no abstract.

    Pending rows are written every ``batch_size`` subjects and once more after
    the loop, so the tail of the run is persisted even when the last subjects
    were skipped.

    Args:
        input_file: .csv/.tsv file with a 'subject.t0' column.
        output_file: CSV receiving the abstracts (appended).
        error_file: CSV receiving failed lookups (appended).
        missing_file: CSV receiving subjects without sitelink/abstract (appended).
        sitelinks_file: TSV used by ``load_titles`` to resolve page titles.
        batch_size: Number of subjects between two writes to disk.
    """
    subjects = extract_subjects(input_file)
    titles = load_titles(subjects, sitelinks_file)

    # Resume support: collect the QIDs that already have an abstract on disk
    existing_qids = set()
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        try:
            print(f"Reading existing output file: {output_file}")
            df_existing = pd.read_csv(output_file, usecols=['subject.t0'])
            existing_qids = set(
                str(qid).replace('<','').replace('>','').replace('http://www.wikidata.org/entity/','')
                for qid in df_existing['subject.t0'].unique()
            )
        except pd.errors.EmptyDataError:
            print(f"Output file exists but is empty or corrupted: {output_file}")

    # Error markers are recognised by prefix only; a substring test would
    # misfile genuine abstracts that merely mention e.g. "HTTP error".
    error_prefixes = ('HTTP error', 'Request error', 'Failed after retries')

    batch = []
    error_batch = []
    missing_batch = []
    processed_this_round = set()

    def flush():
        _append_rows(output_file, batch, ['subject.t0', 'abstract'])
        _append_rows(error_file, error_batch, ['subject.t0', 'Error'])
        _append_rows(missing_file, missing_batch, ['subject.t0', 'Reason'])

    # Process each QID with tqdm progress bar
    for i, subject in enumerate(tqdm(subjects, desc="Processing QIDs"), 1):
        if subject in existing_qids or subject in processed_this_round:
            pass  # already stored by a previous run or handled earlier in this one
        elif subject not in titles:
            missing_batch.append([subject, 'No sitelink found'])
            processed_this_round.add(subject)
        else:
            abstract = get_dbpedia_abstract(titles[subject])
            if abstract.startswith(error_prefixes):
                error_batch.append([subject, abstract])
            elif abstract == 'No abstract found':
                missing_batch.append([subject, 'No abstract found'])
            else:
                batch.append([subject, abstract])
            processed_this_round.add(subject)

        # The periodic write is evaluated for every subject, including skipped ones
        if i % batch_size == 0:
            flush()

    # Write whatever is still pending after the last subject
    flush()


def _append_rows(path, rows, columns):
    """Append ``rows`` to the CSV at ``path`` (header only for a new or empty file) and clear ``rows``."""
    if not rows:
        return
    frame = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=['subject.t0'])
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    frame.to_csv(path, mode='a', header=write_header, index=False)
    rows.clear()

# Loop to retry failed abstracts up to N iterations. After max retries, remaining failures are moved to the 'missing' file.
def reprocess_until_done(initial_input_file, output_file, error_file, missing_file, sitelinks_file, max_iterations=3):
    """
    Run ``process_abstracts`` repeatedly, feeding each round with the subjects
    that failed in the previous one.

    After every round the error file is inspected:

    * missing or empty: the loop stops;
    * non-empty and rounds remain: its subjects are written to
      'errors_iteration_<n>.csv' next to ``error_file``, that file becomes the
      next input and the error file is deleted;
    * non-empty on the last round: the subjects are appended to
      ``missing_file`` with reason 'Failed after retries' and the error file is
      deleted.

    Args:
        initial_input_file: Input of the first round (.csv or .tsv).
        output_file: CSV collecting the retrieved abstracts.
        error_file: CSV produced by ``process_abstracts`` for failed lookups.
        missing_file: CSV collecting subjects without abstract.
        sitelinks_file: Sitelinks TSV passed through to ``process_abstracts``.
        max_iterations: Maximum number of rounds.
    """
    current_input = initial_input_file
    # Retry inputs are stored beside the error file; building the path with
    # os.path keeps it portable (no hard-coded backslash separators).
    retry_dir = os.path.dirname(error_file)

    for iteration in range(1, max_iterations + 1):
        print(f"\n=== Iteration {iteration} ===")
        process_abstracts(current_input, output_file, error_file, missing_file, sitelinks_file)

        if not os.path.exists(error_file):
            print("No error file generated. Stopping loop.")
            break

        try:
            errors_df = pd.read_csv(error_file)
        except pd.errors.EmptyDataError:
            print("Error file empty. Stopping loop.")
            break

        if errors_df.empty:
            print("No remaining errors. Stopping loop.")
            break

        if iteration == max_iterations:
            print("Max iterations reached. Moving remaining errors to missing.")
            errors_df['Reason'] = 'Failed after retries'
            missing_header = not os.path.exists(missing_file) or os.path.getsize(missing_file) == 0
            errors_df[['subject.t0', 'Reason']].drop_duplicates(subset=['subject.t0']).to_csv(
                missing_file, mode='a', header=missing_header, index=False
            )
            print(f"Moved {len(errors_df)} errors to missing: {missing_file}")
            os.remove(error_file)
            break

        print(f"Remaining errors: {len(errors_df)}. Retrying...")
        next_input = os.path.join(retry_dir, f"errors_iteration_{iteration}.csv")
        errors_df[['subject.t0']].drop_duplicates().to_csv(next_input, index=False)
        current_input = next_input
        # Remove the error file so the next round starts from a clean slate
        os.remove(error_file)
    else:
        # Only reached when the loop body never ran (max_iterations < 1)
        print(f"Max iterations ({max_iterations}) reached. Exiting.")

In [None]:
# Path to the initial input file with QIDs to process ('subject.t0' column).
# NOTE(review): the revision-author cell writes "users_one_of.tsv" without the
# 'resources/' prefix — confirm the file is moved/copied to this location.
initial_input_file = 'resources/users_one_of.tsv' # Path to the initial input file with QIDs to process
abstracts_file = 'resources/data_augmentation/dbpedia_abstracts/abstracts_oneOf.csv' # Output file where successful abstracts will be stored
error_file = 'resources/data_augmentation/dbpedia_abstracts/errors_abstracts_oneOf.csv' # File to store temporary errors (e.g., HTTP errors, request timeouts)
missing_file = 'resources/data_augmentation/dbpedia_abstracts/missing_abstracts_oneOf.csv' # File to store QIDs with missing abstracts or failed after retries
sitelinks_file = 'resources/data_augmentation/sitelinks.en.tsv' # Sitelinks file mapping QIDs to Wikipedia/DBpedia titles

# Query an abstract for each QID, record errors and missing cases, and re-run the
# failed lookups for at most `max_iterations` rounds in total.
# After the final round, remaining errors are moved to the missing file.
reprocess_until_done(
    initial_input_file,
    abstracts_file,
    error_file,
    missing_file,
    sitelinks_file,
    max_iterations=3 # Maximum number of processing rounds before giving up
)

### Short abstracts

This cell processes a CSV file containing abstracts. It performs the following steps:
1.  Loads the CSV file into a pandas DataFrame.

2.  Defines a function `get_first_5_sentences` that takes an abstract text, tokenizes it into sentences using NLTK, and returns only the first 5 sentences.

3.  Applies the `get_first_5_sentences` function to the 'abstract' column of the DataFrame, effectively shortening each abstract.

4.  Selects only the 'QID' and the processed 'abstract' columns from the DataFrame.

5.  Saves the resulting DataFrame to a new CSV file.

In [None]:
import pandas as pd
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer models for sentence splitting

def get_first_5_sentences(abstract):
    """
    Shorten an abstract to its first five sentences.

    Line breaks are replaced by single spaces before NLTK's sentence tokenizer
    splits the text; the leading (at most five) sentences are re-joined with
    spaces and surrounding whitespace is trimmed.

    Args:
        abstract: Abstract text (a string).

    Returns:
        The shortened text.
    """
    single_paragraph = ' '.join(abstract.splitlines())
    leading_sentences = nltk.sent_tokenize(single_paragraph)[:5]
    return ' '.join(leading_sentences).strip()

def process_abstracts(input_file, output_file):
    """
    Shorten every abstract of a CSV file to its first five sentences.

    Note: this definition reuses the name ``process_abstracts`` of the DBpedia
    fetching pipeline defined in an earlier cell; once this cell has run, that
    earlier function is no longer reachable under this name.

    Args:
        input_file: Comma-separated file with an 'abstract' column and an
            identifier column named 'QID' or 'subject.t0'.
        output_file: Destination CSV with the columns 'QID' and 'abstract'.
    """
    # Read the input CSV file
    df = pd.read_csv(input_file, sep=',')

    # The abstracts file written by the DBpedia pipeline names its identifier
    # column 'subject.t0'; expose it as 'QID' so the selection below works for
    # both layouts.
    if 'QID' not in df.columns and 'subject.t0' in df.columns:
        df = df.rename(columns={'subject.t0': 'QID'})

    # Apply the function to extract the first 5 sentences of each abstract
    df['abstract'] = df['abstract'].apply(get_first_5_sentences)
    # Save the identifier and shortened abstract columns to a new CSV file
    df[['QID', 'abstract']].to_csv(output_file, index=False)

# Input: the abstracts CSV configured for the DBpedia pipeline (`abstracts_file`).
input_file = abstracts_file 
short_abstracts_file = 'resources/data_augmentation/dbpedia_abstracts/oneOf_abstracts_first_5_sentences.csv'  # Output file name

# Shorten every abstract and write the result to `short_abstracts_file`.
process_abstracts(input_file, short_abstracts_file)

# Wikidata Text Generator

### Collecting Wikidata Triples for Entities Missing Abstracts in DBpedia

The following cells collect **all triples** from **Wikidata** for each entity (QID) that **does not have an abstract available in DBpedia**.

- The list of QIDs missing abstracts has been previously identified.
- The extracted triples will be used as input to generate informative texts about these entities.
- These texts will be generated by a language model (LLM), which will use the triples to create detailed and useful descriptions for each entity.
- The goal is to fill the gap of missing abstracts in DBpedia with content generated from Wikidata data.


In [None]:
%pip install SPARQLWrapper

In [None]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm
import time

# CONFIGURATION
# All paths are relative to the notebook's working directory.
INPUT_QIDS_FILE = "resources/data_augmentation/dbpedia_abstracts/missing_abstracts_oneOf.csv" # Input CSV file containing QIDs (Wikidata entity IDs) to query
OUTPUT_TRIPLES_FILE = "resources/data_augmentation/wikidata_texts/wikidata_triples_oneOf.csv" # Output CSV file where all collected triples will be saved
ERRORS_FILE = "resources/data_augmentation/wikidata_texts/query_errors_oneOf.csv" # CSV file to save QIDs which caused query errors (to retry or analyze)
DELETED_FILE = "resources/data_augmentation/wikidata_texts/wikidata_deleted_entities_oneOf.csv" # CSV file to store QIDs detected as deleted entities (no data and no redirects)
REDIRECTS_FILE = "resources/data_augmentation/wikidata_texts/wikidata_redirects_oneOf.csv"  # CSV file to store redirect mappings (QIDs that redirect to other QIDs)

# Single SPARQL client shared by get_triples_for_qid and get_redirects; each
# call replaces the query text with setQuery before executing it.
# NOTE(review): no custom user agent is configured here — check the Wikidata
# Query Service usage policy before running large batches.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

"""
This function takes a QID and performs a SPARQL query to fetch triples where the QID is the subject. It excludes properties of type ExternalId and CommonsMedia,as these are less 
relevant textual data. It returns a list of dictionaries with property labels, value labels (or raw values), descriptions, and the entity's label and description.
"""
def get_triples_for_qid(qid):
    """
    Fetch the direct-claim triples that have ``qid`` as their subject.

    Properties of type ExternalId and CommonsMedia are filtered out by the
    query. Uses the module-level ``sparql`` client.

    Args:
        qid: Wikidata entity id (e.g. 'Q42'); inserted verbatim after 'wd:'.

    Returns:
        A list of dicts with the keys 'QID', 'Property', 'Value',
        'ValueDescription', 'EntityLabel' and 'EntityDescription' (missing
        optional texts become ''), or None when the query or the decoding of
        its result failed (the error is printed).
    """
    # NOTE(review): the OPTIONAL block binds label and description together, so
    # a value with an English label but no English description appears to fall
    # back to STR(?value) — confirm this is intended.
    # NOTE(review): the SELECT aliases an expression over ?valueDescription to
    # ?valueDescription itself; SPARQL 1.1 disallows aliasing to an in-scope
    # variable — verify that the endpoint accepts it.
    query = f"""
    SELECT ?propertyLabel 
           (IF(BOUND(?valueLabel), ?valueLabel, STR(?value)) AS ?valueDisplay)
           (IF(BOUND(?valueDescription), ?valueDescription, "") AS ?valueDescription)
           ?entityLabel 
           ?entityDescription WHERE {{
      wd:{qid} ?statement ?value .
      ?property wikibase:directClaim ?statement .
      ?property wikibase:propertyType ?ptype .

      FILTER(?ptype != wikibase:ExternalId && ?ptype != wikibase:CommonsMedia)

      BIND(wd:{qid} AS ?entity)

      OPTIONAL {{
        FILTER(isIRI(?value))
        ?value rdfs:label ?valueLabel .
        ?value schema:description ?valueDescription .
        FILTER(LANG(?valueLabel) = "en")
        FILTER(LANG(?valueDescription) = "en")
      }}

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en" .
        ?property rdfs:label ?propertyLabel .
        ?entity rdfs:label ?entityLabel .
        ?entity schema:description ?entityDescription .
      }}
    }}
    """
    sparql.setQuery(query)
    try:
        results = sparql.query().convert()
        triples = []
        for b in results['results']['bindings']:
            triples.append({
                "QID": qid,
                "Property": b['propertyLabel']['value'],
                "Value": b['valueDisplay']['value'],
                "ValueDescription": b.get('valueDescription', {}).get('value', ""),
                # The entity label is read leniently, like the description: a
                # binding without it must not turn the whole entity into a
                # "query error" that is retried and finally dropped.
                "EntityLabel": b.get('entityLabel', {}).get('value', ""),
                "EntityDescription": b.get('entityDescription', {}).get('value', "")
            })
        return triples
    except Exception as e:
        print(f"[ERROR] Failed to query {qid}: {e}")
        return None

"""
This function checks if the input QID redirects to any other entities. Redirects in Wikidata can be modeled as owl:sameAs links.
It returns a list of QIDs that the input QID redirects to.
"""
def get_redirects(qid):
    """
    List the entities that ``qid`` points to through ``owl:sameAs``.

    Uses the module-level ``sparql`` client.

    Args:
        qid: Wikidata entity id; inserted verbatim after 'wd:'.

    Returns:
        The last path segment of every ``?redirect`` IRI returned by the
        query. An empty list is returned both when nothing is found and when
        the query fails (the error is printed).
    """
    query = f"""
    SELECT ?redirect WHERE {{
      wd:{qid} owl:sameAs ?redirect.
    }}
    """
    sparql.setQuery(query)
    try:
        response = sparql.query().convert()
        targets = []
        for binding in response['results']['bindings']:
            target_iri = binding['redirect']['value']
            targets.append(target_iri.rsplit('/', 1)[-1])
    except Exception as e:
        print(f"[ERROR] Failed to fetch redirects for {qid}: {e}")
        return []
    return targets


# This function reads QIDs from the input file, queries their triples, detects redirects and deletions, and writes the results to output files.
def process_all_qids(
    input_file,
    output_file,
    errors_file,
    deleted_file,
    redirects_file,
    max_iterations=3,
    delay=1.5
):
    """
    Collect Wikidata triples for every QID listed in ``input_file``.

    Each round queries the pending QIDs:

    * query failed (``get_triples_for_qid`` returned None): retried next round;
    * no triples: ``get_redirects`` is consulted; with redirects the targets
      are queried at the end of the round, without redirects the QID is
      recorded as deleted;
    * triples found: stored once (duplicates are dropped).

    Redirect targets whose query returns nothing (failure or empty result) are
    queued for the next round as ordinary QIDs. All results are written when
    the loop ends.

    Args:
        input_file: CSV with a 'subject.t0' column holding the QIDs.
        output_file: CSV receiving the collected triples.
        errors_file: CSV ('QID') with the QIDs still failing after the last round.
        deleted_file: CSV ('QID') with QIDs that have neither triples nor redirects.
        redirects_file: CSV ('Original_QID', 'Redirect_QID') with redirect pairs.
        max_iterations: Maximum number of rounds.
        delay: Pause in seconds after each triples query.
    """
    import os  # local import: the import block of this cell does not provide it

    # Create the output folders before the long-running queries start, so the
    # final save cannot fail on a missing directory and discard all results.
    for target in (output_file, errors_file, deleted_file, redirects_file):
        folder = os.path.dirname(target)
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Load QIDs
    qids_df = pd.read_csv(input_file)
    qids = qids_df['subject.t0'].dropna().unique().tolist()

    seen_triples = set() # To avoid duplicate triples
    all_triples = [] # Accumulates all fetched triples
    deleted = [] # QIDs with no triples and no redirects (likely deleted)
    redirects_list = [] # Records which QIDs redirect to which others
    errors = qids.copy() # Initially, all QIDs need processing
    iteration = 1

    def keep_new(triples):
        """Store the triples that have not been collected yet."""
        for t in triples:
            triple_key = (
                t['QID'],
                t.get('EntityLabel', ''),
                t.get('EntityDescription', ''),
                t['Property'],
                t['Value'],
                t.get('ValueDescription', '')
            )
            if triple_key not in seen_triples:
                seen_triples.add(triple_key)
                all_triples.append(t)

    # Repeat up to max_iterations or until no errors remain
    while iteration <= max_iterations and errors:
        print(f"\nIteration {iteration}/{max_iterations} | QIDs to process: {len(errors)}")
        next_errors = []
        next_deleted = []
        redirects_map = {}

        for qid in tqdm(errors, desc=f"Iteration {iteration}"):
            triples = get_triples_for_qid(qid)
            if triples is None:
                # Query error, will retry in next round
                next_errors.append(qid)
            elif len(triples) == 0:
                # No triples found, check for redirects or deletion
                redirs = get_redirects(qid)
                if redirs:
                    # Store redirects for follow-up querying
                    redirects_map[qid] = redirs
                else:
                    # No redirects either; mark as deleted.
                    # NOTE(review): get_redirects also returns [] when its own
                    # query fails, so such QIDs land here as well.
                    next_deleted.append(qid)
            else:
                keep_new(triples)
            time.sleep(delay)

        # Process found redirects in this iteration
        for source_qid, redirect_qids in redirects_map.items():
            for redirected_qid in redirect_qids:
                triples = get_triples_for_qid(redirected_qid)
                if triples:
                    keep_new(triples)
                else:
                    # Failed or empty query on the redirect target: process it
                    # again in the next round like any other QID
                    next_errors.append(redirected_qid)
                time.sleep(delay)
                # Save redirect relation for later reference
                redirects_list.append({
                    "Original_QID": source_qid,
                    "Redirect_QID": redirected_qid
                })

        # Update lists for next iteration or final save
        deleted.extend(next_deleted)
        errors = next_errors

        iteration += 1

    # Save results to files
    pd.DataFrame(all_triples).to_csv(output_file, index=False)
    pd.DataFrame({"QID": errors}).to_csv(errors_file, index=False)
    pd.DataFrame({"QID": deleted}).to_csv(deleted_file, index=False)
    pd.DataFrame(redirects_list).to_csv(redirects_file, index=False)

    print("\nDONE!")
    print(f"Total triples saved: {len(all_triples)}")
    print(f"Total redirects saved: {len(redirects_list)}")
    print(f"Remaining errors: {len(errors)}")
    print(f"Total deleted entities: {len(deleted)}")


# Entry point: run the triple collection with the files configured in the
# CONFIGURATION section of this cell.
if __name__ == "__main__":
    process_all_qids(
        INPUT_QIDS_FILE,
        OUTPUT_TRIPLES_FILE,
        ERRORS_FILE,
        DELETED_FILE,
        REDIRECTS_FILE,
        max_iterations=3  # Max reprocessing rounds
    )


### Grouping by Instance to Identify Common Properties

To improve the consistency and completeness of textual descriptions, we analyzed how different properties are distributed across items that share the same type (instance). The goal is to identify which properties appear most frequently within each instance group (e.g., humans, organizations, places).

By grouping items by their "instance of" value and counting how often each property appears within those groups, we can discover patterns and define a common structure for textual summaries. This can help ensure that similar types of entities are described in a coherent and standardized way across the dataset.


#### 1. Building the property ranking per instance

- Loads the original triples file (`wikidata_triples_oneOf.csv`).
- Identifies the instance(s) of each QID using the `instance of` property.
- Assigns a default tag `without instance of` (no instance) if none found.
- Groups data by instance and property, counting how many unique QIDs have each property.
- Creates a ranking of properties for each instance, ordered by frequency (`count_qids`).
- Saves this ranking to `property_ranking_by_instance.csv`.

**Purpose:** to understand which properties are most frequent for each entity type (instance).


In [None]:
import pandas as pd

# Load the CSV file containing the collected Wikidata triples
df = pd.read_csv("resources/data_augmentation/wikidata_texts/wikidata_triples_oneOf.csv")

# The triple collector of this notebook writes the columns 'Property' and
# 'Value', while this analysis works on 'Property Label' and 'Value Display'.
# Accept both layouts by renaming the collector's columns when the expected
# ones are absent.
df = df.rename(columns={
    old: new
    for old, new in {'Property': 'Property Label', 'Value': 'Value Display'}.items()
    if new not in df.columns
})

# Create mapping QID -> instance (for those with 'instance of' property)
instance_map = df[df['Property Label'] == 'instance of'][['QID', 'Value Display']]
instance_map = instance_map.rename(columns={'Value Display': 'instance_of'})

# Merge to add the instance, if any. A QID with several instances gets one copy
# of its rows per instance, so it is counted in each of its instance groups.
df = df.merge(instance_map, on='QID', how='left')

# Fill QIDs without instance with a default tag
df['instance_of'] = df['instance_of'].fillna('without instance of')

# Group by instance and property, counting unique QIDs
grouped = df.groupby(['instance_of', 'Property Label'])['QID'].nunique().reset_index()

# Rename columns to the desired format
grouped = grouped.rename(columns={'Property Label': 'predicate','QID': 'count_qids'})

# Sort from highest to lowest count
grouped = grouped.sort_values(by='count_qids', ascending=False)

# Save to CSV
grouped.to_csv("resources/data_augmentation/wikidata_texts/property_ranking_by_instance.csv", index=False)

# Display the top 20 rows
print(grouped.head(20))


     instance_of                            predicate  count_qids
4891       human                          instance of        4612
4994       human                        sex or gender        4608
4939       human                           occupation        4159
4832       human                        date of birth        4155
4823       human               country of citizenship        3872
4872       human                           given name        3848
4959       human                       place of birth        3462
4898       human  languages spoken, written or signed        2659
4861       human                          family name        2544
4834       human                        date of death        2543
4961       human                       place of death        1944
4769       human                     Commons category        1300
4850       human                          educated at        1096
4925       human              name in native language         756
4798      

#### 2. Filtering triples using a combined property ranking

- Loads the triples file and the property ranking per instance created earlier.
- For each QID, retrieves its associated instances.
- For each instance, gets the ordered list of properties by frequency ranking.
- The frequency ranking for each instance is first normalized by assigning descending weights (e.g., top 1 gets N, next gets N-1, etc.) so that higher-ranked properties contribute more to the final score.
- Computes a combined ranking by summing the weights across instances for each property.
- For each QID, selects up to 15 most relevant properties according to the combined ranking that appear in the QID’s triples.
- Removes triples whose values start with `http` (raw links such as Wikidata entity URIs), since these values are less informative.
- Saves the filtered triples to `triples_top15_by_combined_instance.csv`.

**Purpose:** to select the most informative and relevant triples for each entity, considering multiple instances and property frequency, reducing noise and prioritizing important data.  
This assumes that the most frequent properties are likely those most commonly used by the Wikidata community to describe entities of that instance type, so they tend to capture the essential aspects.

In [None]:
import pandas as pd
from collections import Counter

# Load the detailed triples CSV
detailed = pd.read_csv("resources/data_augmentation/wikidata_texts/wikidata_triples_oneOf.csv")

# The triple collector of this notebook writes the columns 'Property' and
# 'Value', while this cell works on 'Property Label' and 'Value Display'.
# Accept both layouts by renaming the collector's columns when the expected
# ones are absent.
detailed = detailed.rename(columns={
    old: new
    for old, new in {'Property': 'Property Label', 'Value': 'Value Display'}.items()
    if new not in detailed.columns
})

# Load the property ranking per instance CSV
ranking = pd.read_csv("resources/data_augmentation/wikidata_texts/property_ranking_by_instance.csv")

# Create dictionary: instance -> list of properties ordered by frequency.
# Sorting once (stable, so ties keep their file order) and then collecting only
# the 'predicate' column per group avoids calling apply() on whole groups,
# which emits a pandas deprecation warning.
top_props_per_instance = (
    ranking.sort_values('count_qids', ascending=False, kind='stable')
           .groupby('instance_of')['predicate']
           .apply(list)
           .to_dict()
)

# Map instances of each QID (may have multiple instances)
instances_per_qid = (
    detailed[detailed['Property Label'] == 'instance of']
    .groupby('QID')['Value Display'].apply(list)
    .to_dict()
)

# Normalized combined ranking summing weighted points
def combined_ranking_normalized(instances):
    """Merge the property rankings of several instance types into one ranking.

    Every instance type looked up in ``top_props_per_instance`` contributes a score of
    (N - position) / N to each of its N ranked properties (1.0 for the first entry,
    1/N for the last). Scores are summed per property across all given instances.

    Args:
        instances: instance-type names of one item; may be empty.

    Returns:
        list: property names ordered by descending summed score, ties kept in
        first-seen order. For an empty ``instances`` the ranking stored under
        'without instance of' is returned (or an empty list when that key is absent).
    """
    if not instances:
        return top_props_per_instance.get('without instance of', [])

    scores = Counter()
    ranked_lists = (top_props_per_instance.get(name, []) for name in instances)

    for ranked in ranked_lists:
        size = len(ranked)
        # An unknown or empty instance yields an empty list, so the inner loop
        # (and the division by size) is simply never reached for it.
        for position, prop in enumerate(ranked):
            scores[prop] += (size - position) / size

    # Highest summed score first; the sort is stable, so ties keep insertion order.
    ordered = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [prop for prop, _ in ordered]

results = []

# Process each QID individually
for qid, group in detailed.groupby('QID'):
    # Keep only keys that look like Wikidata item IDs; str() avoids an AttributeError
    # if the QID column was parsed as a non-string dtype.
    if not str(qid).startswith('Q'):
        continue

    instances = instances_per_qid.get(qid, [])
    ranking_for_item = combined_ranking_normalized(instances)

    # A set makes the membership test below O(1) per ranked property.
    available_props = set(group['Property Label'])

    # Take up to 15 properties, in combined-ranking order, that exist in the item
    top_props = [p for p in ranking_for_item if p in available_props][:15]

    results.append(group[group['Property Label'].isin(top_props)])

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when no QID qualified.
final = pd.concat(results) if results else detailed.iloc[0:0]

# Remove rows whose 'Value Display' starts with "http" (links are excluded).
# astype(str) keeps the .str accessor valid even if some values are not strings.
is_link = final['Value Display'].fillna('').astype(str).str.startswith("http")
final = final[~is_link]

# Save final CSV without the 'instance_of' column
final.drop(columns=['instance_of'], errors='ignore').to_csv("resources/data_augmentation/wikidata_texts/triples_top15_by_combined_instance.csv", index=False)

print(f"Total selected triples: {len(final)}")


  .apply(lambda g: g.sort_values('count_qids', ascending=False)['predicate'].tolist())


Total selected triples: 71810


### Generating Texts from Wikidata Triples

This cell takes structured data (triples) about entities extracted from Wikidata and automatically generates informative texts for each entity. These texts are meant to replace missing abstracts from DBpedia.

The process includes:
- Grouping triples by entity (QID).
- Sending facts, labels, and descriptions to a language model to generate a paragraph about each entity.
- Outputs that are empty, contain more than 5 `#` symbols, or have fewer than 2 sentences are marked as problematic and written to a separate errors file.
- Well-formed outputs are saved separately.

In [None]:
import json
import csv
import time
import subprocess
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# Download the sentence-splitting models used by sent_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are fetched to keep this cell working across NLTK versions.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

def generate_paragraph(entity_label, description, triples):
    """Generate a text paragraph about an entity using a language model.

    Builds a prompt containing two example paragraphs and the entity's facts, POSTs it
    with curl to the generate endpoint, and concatenates the "response" fields of the
    JSON lines that come back.

    The HTTP basic-auth credentials are read from the LLAMA_API_CREDENTIALS environment
    variable (format "user:password") instead of being stored in the notebook.
    NOTE(review): the credential that used to be hard-coded here has been committed to
    the file history and should be rotated.

    Args:
        entity_label (str): The entity's label.
        description (str): The entity's description.
        triples (list of tuples): (predicate, object, object_description) facts about
            the entity; object_description may be empty.

    Returns:
        str: The generated paragraph, stripped; empty if the output held no "response".

    Raises:
        RuntimeError: If the credentials variable is unset or curl exits with an error
            (including HTTP status >= 400, via --fail).
    """
    import os  # local import so the function does not depend on another cell's imports

    credentials = os.environ.get("LLAMA_API_CREDENTIALS")
    if not credentials:
        raise RuntimeError(
            "Set the LLAMA_API_CREDENTIALS environment variable to 'user:password' "
            "before generating texts."
        )

    # Format facts as bullet points; the description suffix is added only when present,
    # so facts without one do not end in a dangling " - ".
    fact_lines = []
    for p, o, d in triples:
        has_description = d is not None and str(d).strip() != ""
        fact_lines.append(f"- {p}: {o} - {d}" if has_description else f"- {p}: {o}")
    formatted_facts = "\n".join(fact_lines)

    # Prompt including examples and facts to guide the LLM
    prompt = f"""Below are two examples of paragraphs written in a formal, encyclopedic style. Use them as a reference for writing a new text.

    Example 1:
    "NGC 6212 is a spiral galaxy located in the constellation Hercules. It is designated as Sb in the galaxy morphological classification scheme and was discovered by the French astronomer Édouard Stephan on 26 July 1870. NGC 6212 is located at about 397 million light years from Earth."

    Example 2:
    "Franz Josef Heinz, known as Heinz-Orbis, (25 February 1884 - 9 January 1924) was a Palatine separatist who briefly led the government of the \"Autonomous Palatinate\" during the French occupation of the Rhineland. He was assassinated by German nationalists in 1924. Heinz came from the town of Orbis in Northern Palatinate, later using the town as part of his name. He was a farmer and became a leader of the free peasantry and the founder of the Palatine Corps. In the aftermath of World War I, France occupied the Rhineland."

    Using the following facts, write a similar paragraph about ({entity_label} - {description}) in formal English:
    IMPORTANT:
    - Do NOT include any property IDs (like P21, P27) or QIDs (like Q581).
    - Always convert any identifiers into human-readable labels and context.
    - Write naturally and smoothly, in full sentences and do not add anything else

    {formatted_facts}
    
    """

    data = {
        "model": "mixtral:8x7b",
        "prompt": prompt
    }

    result = subprocess.run(
        [
            # --fail turns HTTP errors into a non-zero exit code; --show-error keeps
            # the reason on stderr even though -s silences the progress meter.
            "curl", "-s", "--show-error", "--fail",
            "-u", credentials,
            "-H", "Content-Type: application/json",
            "-X", "POST", "https://llama-webui.ai.wu.ac.at/api/generate",
            "-d", json.dumps(data)
        ],
        capture_output=True,
        text=True,
    )

    # Report transport/HTTP failures explicitly instead of returning an empty text.
    if result.returncode != 0:
        raise RuntimeError(
            f"curl exited with code {result.returncode}: {(result.stderr or '').strip()}"
        )

    generated_response = ""
    # Process each line of output, expecting JSON lines with a "response" field
    for line in result.stdout.strip().splitlines():
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            continue  # Ignore lines that are not valid JSON
        if isinstance(item, dict):  # valid JSON that is not an object has no "response"
            generated_response += item.get("response", "")

    return generated_response.strip()

def process_csv_and_generate_texts(input_path, results_file, errors_file):
    """Generate one text per entity from a triples CSV and record results and errors.

    Reads the triples, groups them by QID, calls generate_paragraph for every entity
    and writes accepted texts to ``results_file`` and rejected or failed ones to
    ``errors_file``. A text is rejected when it is empty, contains more than 5 "#"
    characters, or has fewer than 2 sentences.

    ``results_file`` is opened in append mode, so rows from earlier runs are kept; the
    header row is written only when the file is new or empty, so repeated runs do not
    insert header lines between data rows. ``errors_file`` is overwritten on every run.

    Args:
        input_path (str): Path to the CSV input file containing the triples.
        results_file (str): Path to the output CSV file for successful generations.
        errors_file (str): Path to the output CSV file for any generation errors.
    """
    import os  # local import so the function does not depend on another cell's imports

    entities = {}

    # Read triples from input CSV and group by QID
    with open(input_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            qid = row["QID"]
            if qid not in entities:
                entities[qid] = {
                    "Entity Label": row["Entity Label"],
                    "Entity Description": row["Entity Description"],
                    "triples": []
                }
            entities[qid]["triples"].append((row["Property Label"], row["Value Display"], row["Value Description"]))

    # Must be checked before opening: append mode creates the file if it is missing.
    results_has_content = os.path.exists(results_file) and os.path.getsize(results_file) > 0

    with open(results_file, "a", encoding="utf-8", newline='') as out_csv, \
         open(errors_file, "w", encoding="utf-8", newline='') as err_csv:

        writer_out = csv.writer(out_csv)
        writer_err = csv.writer(err_csv)

        if not results_has_content:
            writer_out.writerow(["QID", "text"])
        writer_err.writerow(["QID", "error"])

        for qid, data in tqdm(entities.items(), desc="Generating texts"):
            try:
                paragraph = generate_paragraph(data["Entity Label"], data["Entity Description"], data["triples"])
                # Flatten to a single line so each text occupies one CSV row
                paragraph_clean = paragraph.replace("\n", " ").replace("\r", " ").strip()

                # Check for invalid outputs and write to errors if necessary
                if (not paragraph_clean or paragraph_clean.count("#") > 5 or len(sent_tokenize(paragraph_clean)) < 2):
                    writer_err.writerow([qid, "Invalid or poor quality response"])
                else:
                    writer_out.writerow([qid, paragraph_clean])

                # Pause between requests
                time.sleep(1.5)

            except Exception as e:
                # Log exceptions to error file and continue with the next entity
                writer_err.writerow([qid, str(e)])

    print("\nProcess finished!")
    print(f"- Results saved in: {results_file}")
    print(f"- Errors saved in: {errors_file}")


# CONFIGURATION
# Triples CSV read as input
INPUT_FILE = "resources/data_augmentation/wikidata_texts/triples_top15_by_combined_instance.csv"

# Destination CSVs: accepted texts and rejected/failed generations
RESULTS_OUTPUT = "resources/data_augmentation/wikidata_texts/wikidata_texts_oneOf.csv"
ERRORS_OUTPUT = "resources/data_augmentation/wikidata_texts/texts_errors_oneOf_.csv"

# MAIN EXECUTION
# Generate a text for every entity in INPUT_FILE; successes and errors go to separate files
process_csv_and_generate_texts(
    input_path=INPUT_FILE,
    results_file=RESULTS_OUTPUT,
    errors_file=ERRORS_OUTPUT,
)


### Merge Abstracts with Wikidata Texts
This script merges two CSV files containing entity descriptions:

- `abstracts_oneOf.csv`: abstracts from DBpedia  
- `wikidata_texts_oneOf.csv`: generated texts for entities **missing DBpedia abstracts**

In [None]:
import csv

# Input file paths
file1 = 'resources/data_augmentation/dbpedia_abstracts/abstracts_oneOf.csv'
file2 = 'resources/data_augmentation/wikidata_texts/wikidata_texts_oneOf.csv'
output = 'resources/data_augmentation/final_entity_texts_oneOf.csv'

header = ["QID", "abstract"]

# Open the output file and write combined data
with open(output, "w", newline='', encoding="utf-8") as fout:
    writer = csv.writer(fout)
    writer.writerow(header)

    def read_rows(file_path):
        """Copy the data rows of the CSV at file_path into the merged output file."""
        # newline='' lets the csv module handle line endings itself, so quoted
        # fields containing line breaks are read back unchanged.
        with open(file_path, "r", newline='', encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # Skip the header; the default avoids StopIteration on an empty file
            for row in reader:
                # Skip empty lines and repeated header rows: a file written in append
                # mode across several runs can contain its header more than once.
                if row and row[0] != "QID":
                    writer.writerow(row)

    # Append content from both files
    read_rows(file1)
    read_rows(file2)

print(f"Files successfully merged into: {output}")


Files successfully merged into: final_entity_texts_oneOf.csv
