In [1]:

import os
import re
import pandas as pd
import configparser
import jaconv
from tqdm import tqdm
import math 

In [2]:
config = configparser.ConfigParser()
config.read("../config.ini")


['../config.ini']

# データ読み込み

In [3]:
OUTPUT_ROOT = config.get("Output","output_path")
cookpad_edges_path = os.path.join(OUTPUT_ROOT,"output_csv/cookpad_edges.csv") 
rakuten_edges_path = os.path.join(OUTPUT_ROOT,"output_csv/rakuten_edges.csv") 
seibunhyo_edges_path = os.path.join(OUTPUT_ROOT,"output_csv/seibunhyo_edges.csv") 

cookpad_edges = pd.read_csv(cookpad_edges_path)
rakuten_edges = pd.read_csv(rakuten_edges_path)
seibunhyo_edges = pd.read_csv(seibunhyo_edges_path)

In [4]:
print(len(cookpad_edges))
print(len(rakuten_edges))
print(len(seibunhyo_edges))

12724374
5273415
266023


In [None]:
rakuten_edges

Unnamed: 0,recipe_id,recipe_title,ingredient_name,edge_type,data_source
0,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚の角煮,しょうが,recipe-ingredient,cookpad
1,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚の角煮,にんにく,recipe-ingredient,cookpad
2,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚の角煮,ねぎ,recipe-ingredient,cookpad
3,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚の角煮,豚肉,recipe-ingredient,cookpad
4,ad7d585b06850f8437ff5fb97d3c7a823ff21bb1,豚の角煮,砂糖,recipe-ingredient,cookpad
...,...,...,...,...,...
12724369,7a9d10ab5eb506a3e66b0e115a1ce84b0dfe7a39,豆もやしのなむる,豆もやし,recipe-ingredient,cookpad
12724370,7a9d10ab5eb506a3e66b0e115a1ce84b0dfe7a39,豆もやしのなむる,塩,recipe-ingredient,cookpad
12724371,7a9d10ab5eb506a3e66b0e115a1ce84b0dfe7a39,豆もやしのなむる,ごま油,recipe-ingredient,cookpad
12724372,7a9d10ab5eb506a3e66b0e115a1ce84b0dfe7a39,豆もやしのなむる,醤油,recipe-ingredient,cookpad


In [5]:
import rdflib
import uuid
from rdflib import Graph, Namespace, URIRef, BNode, Literal, RDF, RDFS
import os

g = Graph()
# 名前空間の定義
RECIPE = Namespace("http://JapaneseFoodKG.org/Recipe/")
INGREDIENT = Namespace("http://JapaneseFoodKG.org/Ingredient/")
NUTRITION = Namespace("http://JapaneseFoodKG.org/Nutrition/")
DATA_SOURCE = Namespace("http://JapaneseFoodKG.org/data_source/")
RELATION = Namespace("http://JapaneseFoodKG.org/relation/")


In [7]:
ingredient_uris = {}

In [25]:


ingredient_uris = {}
for index, row in tqdm(cookpad_edges.iterrows(),total=len(cookpad_edges)):
    recipe_id = row['recipe_id']
    recipe_title = row['recipe_title']
    ingredient_name = row['ingredient_name']
    data_source = row['data_source']
    
    
    # レシピに関する情報を追加
    recipe_uri = RECIPE[recipe_id]
    g.add((recipe_uri, RDF.type, RECIPE.Recipe))
    g.add((recipe_uri, RECIPE.title, Literal(recipe_title)))
    g.add((recipe_uri, DATA_SOURCE.source, Literal(data_source)))  # データソースをレシピに追加
    
    
    # 食材のURI（文字列が一致する場合は既存のURIを使用）
    if ingredient_name not in ingredient_uris:
        random_uuid = str(uuid.uuid4())
        ingredient_uris[ingredient_name] = INGREDIENT[random_uuid]
    ingredient_uri = ingredient_uris[ingredient_name]

    # 食材に関する情報を追加
    g.add((ingredient_uri, RDF.type, INGREDIENT.Ingredient))
    g.add((ingredient_uri, INGREDIENT.name, Literal(ingredient_name)))

    # レシピと食材の関係を追加
    g.add((recipe_uri, RECIPE.hasIngredient, ingredient_uri))

  1%|          | 153870/12724374 [00:19<31:48, 6586.05it/s]http://JapaneseFoodKG.org/Recipe/{Ａ}しょうゆ does not look like a valid URI, trying to serialize this will break.
http://JapaneseFoodKG.org/Recipe/{Ａ}砂糖 does not look like a valid URI, trying to serialize this will break.
http://JapaneseFoodKG.org/Recipe/{Ａ}酒 does not look like a valid URI, trying to serialize this will break.
100%|██████████| 12724374/12724374 [30:16<00:00, 7004.97it/s] 


In [26]:

for index, row in tqdm(rakuten_edges.iterrows(),total=len(rakuten_edges)):
    recipe_id = row['recipe_id']
    recipe_title = row['recipe_title']
    ingredient_name = row['ingredient_name']
    data_source = row['data_source']
    
    # レシピに関する情報を追加
    recipe_uri = RECIPE[recipe_id]
    g.add((recipe_uri, RDF.type, RECIPE.Recipe))
    g.add((recipe_uri, RECIPE.title, Literal(recipe_title)))
    g.add((recipe_uri, DATA_SOURCE.source, Literal(data_source)))  # データソースをレシピに追加
    
    
    # 食材のURI（文字列が一致する場合は既存のURIを使用）
    if ingredient_name not in ingredient_uris:
        random_uuid = str(uuid.uuid4())
        ingredient_uris[ingredient_name] = INGREDIENT[random_uuid]
    ingredient_uri = ingredient_uris[ingredient_name]

    # 食材に関する情報を追加
    g.add((ingredient_uri, RDF.type, INGREDIENT.Ingredient))
    g.add((ingredient_uri, INGREDIENT.name, Literal(ingredient_name)))

    # レシピと食材の関係を追加
    g.add((recipe_uri, RECIPE.hasIngredient, ingredient_uri))

100%|██████████| 5273415/5273415 [12:28<00:00, 7045.59it/s]


In [8]:
from rdflib.namespace import XSD


nutrition_uris = {}

for index, row in tqdm(seibunhyo_edges.iterrows(),total=len(seibunhyo_edges)):
    food_name = row['food_name']
    nutrition_name = row['nutrition_name']
    value = row['value']
    data_source = row['data_source']  # データソースの値を取得

    # 栄養素のURI（文字列が一致する場合は既存のURIを使用）
    if nutrition_name not in nutrition_uris:
        # ランダムなUUIDを生成
        random_uuid = str(uuid.uuid4())
        nutrition_uris[nutrition_name] = NUTRITION[random_uuid]
    nutrition_uri = nutrition_uris[nutrition_name]
    # 栄養素に関する情報を追加
    g.add((nutrition_uri, RDF.type, NUTRITION.Nutrient))
    g.add((nutrition_uri, NUTRITION.name, Literal(nutrition_name)))
    
    food_name = food_name.replace('\u3000', ' ')  # 全角スペースを半角スペースに置換
    matching_ingredients = {name: uri for name, uri in ingredient_uris.items() if isinstance(name, str) and name in food_name}


    for ingredient_name, ingredient_uri in matching_ingredients.items():
        relation_uri = RELATION[str(uuid.uuid4())]  # エッジのURI
        g.add((ingredient_uri, relation_uri, nutrition_uri))
        g.add((relation_uri, RDF.type, RELATION.Contains))
        g.add((relation_uri, RELATION.value, Literal(value, datatype=XSD.float)))
        g.add((relation_uri, DATA_SOURCE.source, Literal(data_source))) 



 15%|█▌        | 40127/266023 [00:03<00:17, 12632.63it/s]


KeyboardInterrupt: 

# rdfファイル出力

In [17]:
japanesefoodkg_path = os.path.join(OUTPUT_ROOT,"JapaneseFoodKG.ttl") 
# RDFを保存（例としてTurtle形式で出力）
with open(japanesefoodkg_path, "wb") as f:
    f.write(g.serialize(format="turtle"))


In [28]:
ingredient_uris

{'しょうが': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/97e50983-cece-483e-b975-47790efc4189'),
 'にんにく': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/02e59014-c279-42b9-bcdb-db4d8d6a9e47'),
 'ねぎ': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/8fb3a4d2-68ad-4cc8-97a1-136d40dd792b'),
 '豚肉': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/21ec4a8c-63e4-45ca-a080-b700f7101ebd'),
 '砂糖': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/424db96d-6963-475d-b84a-211921257b7c'),
 '酒': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/bc7f6d3b-8215-4877-a782-feb8c8b41b2b'),
 '醤油': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/13e79813-93a6-411e-8d07-eac3d188d537'),
 'みりん': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/7af1b8a3-9328-48cf-896d-e06b6b30b71b'),
 'にんじん': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ingredient/380e0813-6b49-4ae7-8e84-12735324e443'),
 'ぴーまん': rdflib.term.URIRef('http://JapaneseFoodKG.org/Ing