### Organisation information search
This is a code snippet for organisation URL and Wikidata ID search (credit for the query function: GPT).  
The search takes an organisation name and country and returns the Wikidata ID and URL.  
Note: for better search accuracy, the organisation name needs to be the accurate full name, without the abbreviation in parentheses.

In [1]:
# import libraries
import pandas as pd
import re
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# sample dataframe
# (organisation name, country) pairs used as sample input; some names
# deliberately carry a parenthesised abbreviation
_sample_rows = [
    ('University of Melbourne', 'Australia'),
    ('Australian National University', 'Australia'),
    ('The University of Sydney (USYD)', 'Australia'),
    ('Monash University', 'Australia'),
    ('Harvard University', 'United States'),
    ('National Health and Medical Research Council (NHMRC)', 'Australia'),
]

# column-oriented view of the sample rows
org_info = {
    'Name': [org_name for org_name, _ in _sample_rows],
    'Country': [org_country for _, org_country in _sample_rows],
}

# build the input DataFrame with one row per organisation
organization_info_df = pd.DataFrame(data=org_info, columns=['Name', 'Country'])

In [3]:
# remove parenthesised text (e.g. abbreviations) from organisation names
def clean_name(name):
    """Return *name* with any parenthesised segments removed.

    Each run of one or more ``(...)`` groups, together with the whitespace
    around it, is replaced by a single space so that the words on either
    side stay separated; leading and trailing whitespace is then stripped.

    Args:
        name: Organisation name, possibly containing a parenthesised
            abbreviation such as ``"The University of Sydney (USYD)"``.

    Returns:
        The name without the parenthesised text.
    """
    # Substitute a space (not the empty string): a parenthetical in the
    # middle of a name would otherwise glue the neighbouring words together.
    without_parens = re.sub(r'(?:\s*\(.*?\))+\s*', ' ', name)
    return without_parens.strip()

# function to query Wikidata for an organization's ID and URL
def _escape_sparql_string(value):
    """Escape *value* for use inside a double-quoted SPARQL string literal."""
    # Backslash must be escaped first so the escapes added below are not doubled.
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


def get_wikidata_info(name, country):
    """Look up an organisation's Wikidata ID and official website.

    Sends one SPARQL query to the Wikidata Query Service. An item matches
    when its English label or alias equals ``name`` exactly and its country
    (P17) has an English label or alias equal to ``country``. The official
    website (P856) is optional in the query.

    Args:
        name: Organisation name to match.
        country: Country name to match.

    Returns:
        A dict with keys ``"Name"``, ``"Country"``, ``"WikidataID"`` and
        ``"URL"``. ``"WikidataID"`` and ``"URL"`` are ``None`` when the query
        fails or nothing matches; ``"URL"`` alone is ``None`` when the matched
        item has no official website.
    """
    endpoint_url = "https://query.wikidata.org/sparql"

    # Escape the inputs: a raw quote or backslash in a name would otherwise
    # terminate the string literal and break (or alter) the query.
    safe_name = _escape_sparql_string(name)
    safe_country = _escape_sparql_string(country)

    # LIMIT 1 keeps only the first binding the endpoint returns.
    query = f"""
    SELECT ?item ?itemLabel ?itemDescription ?url WHERE {{
      ?item (rdfs:label|skos:altLabel) "{safe_name}"@en .
      ?item wdt:P17 ?country .
      ?country (rdfs:label|skos:altLabel) "{safe_country}"@en .
      OPTIONAL {{ ?item wdt:P856 ?url. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """

    # NOTE(review): generic example agent string; consider replacing it with
    # one that identifies this project.
    user_agent = "WDQS-example Python/3.7"
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Single result template; the fields are filled in only on a match.
    record = {
        "Name": name,
        "Country": country,
        "WikidataID": None,
        "URL": None
    }

    try:
        results = sparql.query().convert()
    except Exception as e:
        # Best-effort lookup: report the failure and return an empty record
        # so one bad query does not abort a whole batch.
        print(f"Error querying Wikidata for {name}, {country}: {e}")
        return record

    bindings = results["results"]["bindings"]
    if bindings:
        first = bindings[0]
        # The entity URI ends with the Q-identifier; keep it even when the
        # item has no official website.
        record["WikidataID"] = first["item"]["value"].split("/")[-1]
        record["URL"] = first["url"]["value"] if "url" in first else None
    return record

    
# strip parenthesised abbreviations from the 'Name' column
organization_info_df['Name'] = organization_info_df['Name'].apply(clean_name)

# look up every (name, country) pair on Wikidata, one query per row
query_results = [
    get_wikidata_info(org_name, org_country)
    for org_name, org_country in zip(
        organization_info_df['Name'], organization_info_df['Country']
    )
]

# collect the per-organisation result dicts into a DataFrame and preview it
organisation_search_df = pd.DataFrame(data=query_results)
organisation_search_df.head()

Unnamed: 0,Name,Country,WikidataID,URL
0,University of Melbourne,Australia,Q319078,https://www.unimelb.edu.au/
1,Australian National University,Australia,Q127990,https://www.anu.edu.au/
2,The University of Sydney,Australia,Q487556,https://www.sydney.edu.au/
3,Monash University,Australia,Q598841,https://www.monash.edu/
4,Harvard University,United States,Q13371,https://harvard.edu
