In [8]:
import pandas as pd
import os
import configparser
import jaconv

# Load project settings from the shared INI file one directory above the notebook.
CONFIG_PATH = "../config.ini"

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here rather than with a confusing
# NoSectionError on the first lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read config file: {}".format(os.path.abspath(CONFIG_PATH))
    )

# Directory that generated artifacts are written under.
OUTPUT_ROOT = config.get("Output", "output_path")

# Directory that holds the raw Rakuten recipe dump files.
rakuten_root_path = config.get("Data", "rakuten_data_path")

In [2]:
# Column labels for the headerless, tab-separated recipe file, in file order.
column_names = [
    "recipe_id",
    "user_id",
    "major_category",
    "medium_category",
    "minor_category",
    "recipe_title",
    "recipe_origin",
    "recipe_introduction",
    "food_image_file",
    "dish_name",
    "tag1",
    "tag2",
    "tag3",
    "tag4",
    "one_point_info",
    "cooking_time_id",
    "occasion_id",
    "cost_id",
    "servings",
    "recipe_publish_date",
]

# Read the raw file without a header row, then attach the labels above.
recipe_file = os.path.join(rakuten_root_path, "recipe01_all_20170118.txt")
data_all = pd.read_csv(recipe_file, sep="\t", encoding="utf-8", header=None)
data_all.columns = column_names

# Row count, shown as the cell output.
data_all.shape[0]

796028

In [3]:
# Load the headerless, tab-separated ingredient file.
data_ingredient = pd.read_csv(os.path.join(rakuten_root_path,"recipe02_material_20160112.txt"), sep="\t", encoding="utf-8", header=None)

# Label the ingredient column "ingredient_name" directly instead of naming it
# "name" and renaming it afterwards; the resulting frame has the same columns.
column_names = [
    "recipe_id",
    "ingredient_name",
    "quantity"
]
data_ingredient.columns = column_names
print(len(data_ingredient))

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview goes at the end (it was previously evaluated and discarded).
data_ingredient.head(3)

5274989


In [4]:
# Column labels for the headerless, tab-separated steps file, in file order.
column_names = ["recipe_id", "step", "text"]

# Read the raw file without a header row, then attach the labels above.
steps_file = os.path.join(rakuten_root_path, "recipe03_process_20160112.txt")
data_steps = pd.read_csv(steps_file, sep="\t", encoding="utf-8", header=None)
data_steps.columns = column_names

# Report how many rows were loaded.
print(data_steps.shape[0])

3035218


In [5]:
# Join the recipe frame with the ingredient frame on their shared recipe_id
# key (pandas' default join type, i.e. an inner join).
merge_rakuten_data = data_all.merge(data_ingredient, on="recipe_id")

# Display the result
merge_rakuten_data.head(3)

Unnamed: 0,recipe_id,user_id,major_category,medium_category,minor_category,recipe_title,recipe_origin,recipe_introduction,food_image_file,dish_name,...,tag3,tag4,one_point_info,cooking_time_id,occasion_id,cost_id,servings,recipe_publish_date,ingredient_name,quantity
0,1000000008,1000000016,お菓子,和菓子,まんじゅう,栗きんとん,シーズン中はほぼ毎日、規格外の栗で夜なべに栗きんとんを作っています。長年試行錯誤したレシピで...,栗を生産しています。長年試行錯誤して作ってきました。栗きんとんに親しんでいただきたいので、こ...,1000000008.jpg,栗きんとん,...,栗きんとん,圧力鍋,栗は１，５ｋｇ以上のほうが作りやすいです。食べ切れなかったら冷凍保存して自然解凍でどうぞ。一...,6.0,12367,4.0,たくさん,2010/10/01,栗,１ｋ以上
1,1000000008,1000000016,お菓子,和菓子,まんじゅう,栗きんとん,シーズン中はほぼ毎日、規格外の栗で夜なべに栗きんとんを作っています。長年試行錯誤したレシピで...,栗を生産しています。長年試行錯誤して作ってきました。栗きんとんに親しんでいただきたいので、こ...,1000000008.jpg,栗きんとん,...,栗きんとん,圧力鍋,栗は１，５ｋｇ以上のほうが作りやすいです。食べ切れなかったら冷凍保存して自然解凍でどうぞ。一...,6.0,12367,4.0,たくさん,2010/10/01,砂糖,適宜 栗の正味の1/3～1/4くらい
2,1000000008,1000000016,お菓子,和菓子,まんじゅう,栗きんとん,シーズン中はほぼ毎日、規格外の栗で夜なべに栗きんとんを作っています。長年試行錯誤したレシピで...,栗を生産しています。長年試行錯誤して作ってきました。栗きんとんに親しんでいただきたいので、こ...,1000000008.jpg,栗きんとん,...,栗きんとん,圧力鍋,栗は１，５ｋｇ以上のほうが作りやすいです。食べ切れなかったら冷凍保存して自然解凍でどうぞ。一...,6.0,12367,4.0,たくさん,2010/10/01,塩,適宜


In [6]:
# For each text column in turn: drop the rows where it is missing, then run
# every remaining value through jaconv.kata2hira (katakana -> hiragana).
for text_column in ("ingredient_name", "recipe_title"):
    merge_rakuten_data.dropna(subset=[text_column], inplace=True)
    merge_rakuten_data[text_column] = [
        jaconv.kata2hira(value) for value in merge_rakuten_data[text_column]
    ]

In [7]:
# Columns that make up one edge record, in output order.
edge_columns = ["recipe_id", "recipe_title", "ingredient_name", "edge_type", "data_source"]

# Tag every merged row with its origin and the kind of relation it represents.
merge_rakuten_data["data_source"] = "rakuten"
merge_rakuten_data["edge_type"] = "recipe-ingredient"

# Take an independent copy of just the edge columns.
rakuten_edge = merge_rakuten_data.loc[:, edge_columns].copy()

In [9]:
# Write the edge list to <OUTPUT_ROOT>/output_csv/rakuten_edges.csv.
output_dir = os.path.join(OUTPUT_ROOT, "output_csv")

# DataFrame.to_csv() does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

save_path = os.path.join(output_dir, "rakuten_edges.csv")
rakuten_edge.to_csv(save_path, index=False)