In [18]:
import requests

# 文件路径
input_file_path = r'C:\Users\96446\Documents\GitHub\data/mmkg/FBDB15K/norm/ent_ids_1'  # Ensure this path is correct and accessible

FB15K = {}

# Reading the file and filling the FB15K dictionary
with open(input_file_path, 'r') as file:
    for line in file:
        index, fb_id = line.strip().split('\t')
        FB15K[index] = fb_id

output_file_path = 'output_with_types.txt'  # 输出文件路径, make sure your environment has write permissions to this path or directory

sparql_url = "https://query.wikidata.org/sparql"

# 准备输出文件
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # 写入文件头
    output_file.write("Index,Freebase ID,Entity,Type\n")
    
    for index, fb_id in FB15K.items():
        query = f"""
        SELECT ?item ?itemLabel ?itemTypeLabel WHERE {{
          ?item wdt:P646 "{fb_id}" .
          ?item wdt:P31 ?itemType .
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}"""
        
        response = requests.get(sparql_url, params={'format': 'json', 'query': query})
        data = response.json()
        for item in data["results"]["bindings"]:
            print(f"Freebase ID: {fb_id}, Entity: {item['itemLabel']['value']}, Type: {item['itemTypeLabel']['value']}")


Freebase ID: /m/027rn, Entity: Dominican Republic, Type: country
Freebase ID: /m/027rn, Entity: Dominican Republic, Type: island country
Freebase ID: /m/027rn, Entity: Dominican Republic, Type: sovereign state
Freebase ID: /m/027rn, Entity: Dominican Republic, Type: archipelagic state
Freebase ID: /m/06cx9, Entity: republic, Type: form of state
Freebase ID: /m/06cx9, Entity: republic, Type: form of government
Freebase ID: /m/017dcd, Entity: Mighty Morphin Power Rangers, Type: television series
Freebase ID: /m/06v8s0, Entity: Wendee Lee, Type: human
Freebase ID: /m/07s9rl0, Entity: drama film, Type: film genre
Freebase ID: /m/0170z3, Entity: American History X, Type: film
Freebase ID: /m/01sl1q, Entity: Michelle Rodriguez, Type: human
Freebase ID: /m/044mz_, Entity: Naveen Andrews, Type: human
Freebase ID: /m/0cnk2q, Entity: Australia men's national soccer team, Type: national association football team
Freebase ID: /m/02nzb8, Entity: midfielder, Type: association football position


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [31]:
import requests
import time

input_file_path = r'C:\Users\96446\Documents\GitHub\data/mmkg/FBDB15K/norm/ent_ids_1'
sparql_url = "https://query.wikidata.org/sparql"

FB15K = {}
FB15K_Updated = {}

# 读取Freebase IDs
with open(input_file_path, 'r') as file:
    for line in file:
        index, fb_id = line.strip().split('\t')
        FB15K[index] = fb_id

def fetch_data_with_backoff(query):
    backoff_time = 1  # 开始退避时间为1秒
    max_backoff_time = 120  # 最大退避时间为120秒
    while backoff_time <= max_backoff_time:
        response = requests.get(sparql_url, params={'format': 'json', 'query': query})
        if response.status_code == 429:  # 处理速率限制
            print(f"Rate limited. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            backoff_time *= 2  # 指数退避
        elif response.status_code == 200:
            return response.json()  # 成功获取数据
        else:
            response.raise_for_status()  # 抛出其他HTTP错误

# 查询每个Freebase ID的实体和类型
for index, fb_id in FB15K.items():
    query = f"""
    SELECT ?item ?itemLabel ?itemTypeLabel WHERE {{
      ?item wdt:P646 "{fb_id}" .
      ?item wdt:P31 ?itemType .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}"""
    try:
        data = fetch_data_with_backoff(query)
        if data.get("results", {}).get("bindings", []):
            item = data["results"]["bindings"][0]  # 取第一个结果
            entity = item.get('itemLabel', {}).get('value', 'Unknown entity')
            entity_type = item.get('itemTypeLabel', {}).get('value', 'Unknown type')
            FB15K_Updated[fb_id] = {'entity_name': entity, 'entity_type': entity_type}
            print(f"Freebase ID: {fb_id}, Entity: {entity}, Type: {entity_type}")
        else:
            print(f"No data found for Freebase ID: {fb_id}")
    except requests.exceptions.HTTPError as e:
        print(f"Failed to fetch data for {fb_id}: {e}")

print("Finished updating the dictionary with entity types.")


Freebase ID: /m/027rn, Entity: Dominican Republic, Type: country
Freebase ID: /m/06cx9, Entity: republic, Type: form of state
Freebase ID: /m/017dcd, Entity: Mighty Morphin Power Rangers, Type: television series
Freebase ID: /m/06v8s0, Entity: Wendee Lee, Type: human
Freebase ID: /m/07s9rl0, Entity: drama film, Type: film genre
Freebase ID: /m/0170z3, Entity: American History X, Type: film
Freebase ID: /m/01sl1q, Entity: Michelle Rodriguez, Type: human
Freebase ID: /m/044mz_, Entity: Naveen Andrews, Type: human
Freebase ID: /m/0cnk2q, Entity: Australia men's national soccer team, Type: national association football team
Freebase ID: /m/02nzb8, Entity: midfielder, Type: association football position
Rate limited. Retrying in 1 seconds...
Freebase ID: /m/02_j1w, Entity: defender, Type: association football position
Freebase ID: /m/01cwm1, Entity: Stoke City F.C., Type: association football club
Rate limited. Retrying in 1 seconds...
Freebase ID: /m/059ts, Entity: Northwest Territories, T

In [38]:
entity_types = set()
for fb_id, info in FB15K_Updated.items():
    print(f"Freebase ID: {fb_id}")
    print(f"Entity Name: {info['entity_name']}")
    print(f"Entity Type: {info['entity_type']}")
    entity_types.add(info['entity_type'])
    print("---------------")


Freebase ID: /m/027rn
Entity Name: Dominican Republic
Entity Type: country
---------------
Freebase ID: /m/06cx9
Entity Name: republic
Entity Type: form of state
---------------
Freebase ID: /m/017dcd
Entity Name: Mighty Morphin Power Rangers
Entity Type: television series
---------------
Freebase ID: /m/06v8s0
Entity Name: Wendee Lee
Entity Type: human
---------------
Freebase ID: /m/07s9rl0
Entity Name: drama film
Entity Type: film genre
---------------
Freebase ID: /m/0170z3
Entity Name: American History X
Entity Type: film
---------------
Freebase ID: /m/01sl1q
Entity Name: Michelle Rodriguez
Entity Type: human
---------------
Freebase ID: /m/044mz_
Entity Name: Naveen Andrews
Entity Type: human
---------------
Freebase ID: /m/0cnk2q
Entity Name: Australia men's national soccer team
Entity Type: national association football team
---------------
Freebase ID: /m/02nzb8
Entity Name: midfielder
Entity Type: association football position
---------------
Freebase ID: /m/02_j1w
Entity Na

In [37]:
import csv

output_csv_path = 'FB15K_Updated.csv'

with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Freebase ID', 'Entity Name', 'Entity Type']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for fb_id, info in FB15K_Updated.items():
        writer.writerow({
            'Freebase ID': fb_id,
            'Entity Name': info['entity_name'],
            'Entity Type': info['entity_type']
        })


In [40]:
len(entity_types)

1066

In [41]:
entity_types

{'land-grant university',
 'award for best leading actress',
 'film',
 'film festival',
 'unitary authority area in England',
 'river',
 'corporation',
 'designated intractable/rare disease',
 'planned community',
 'political movement',
 'collection',
 'province of Pakistan',
 'seaside resort',
 'ceremonial county of England',
 'area of London',
 'political ideology',
 'type of world view',
 'rebellion',
 'political system',
 'political alignment',
 'county of West Virginia',
 'type of musical instrument',
 'major regional center',
 'ice hockey position',
 'religious identity',
 'newspaper format',
 'group of stereoisomers',
 'programming paradigm',
 'urban university',
 'subsidiary',
 'type of arts',
 'interdisciplinarity',
 'federative unit of Brazil',
 'city in Ukraine',
 'field of work',
 'musical group',
 'peninsula',
 'university in Quebec',
 'phase of human life',
 'astronomical observatory',
 'philosophical movement',
 'form of festival',
 'county of Arizona',
 'game show',
 's