In [1]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's `.progress_apply` on pandas objects so later applies show a progress bar.
tqdm.pandas()

# JSON-lines dumps of the three dataset splits.
train_file = '/local/xiaowang/food_ingredient/Dataset_440_labels/train_set.json'
test_file = '/local/xiaowang/food_ingredient/Dataset_440_labels/test_set.json'
val_file = '/local/xiaowang/food_ingredient/Dataset_440_labels/val_set.json'

# Each file stores one JSON record per line, hence `lines=True`.
train_df, test_df, val_df = (
    pd.read_json(split_path, orient='records', lines=True)
    for split_path in (train_file, test_file, val_file)
)

# Quick sanity check: (rows, columns) of every split.
print('train_df shape:', train_df.shape)
print('test_df shape:', test_df.shape)
print('val_df shape:', val_df.shape)

train_df shape: (48388, 11)
test_df shape: (6048, 11)
val_df shape: (6049, 11)


In [2]:
# Preview the first two rows of the training split to inspect the loaded columns.
train_df.head(2)

Unnamed: 0,id,images,image_file_name_ls,ingredients,url,partition,title,instructions,extracted_ingredients,cleaned_ingredients,generated_intro
0,787347fc9b,"[{'id': '589aa53634.jpg', 'url': 'http://img.s...","[589aa53634.jpg, 5ec5605334.jpg, 80e94b8784.jp...","[{'text': '34 cup plain flour'}, {'text': '1 1...",http://www.food.com/recipe/anzac-biscuits-with...,test,Anzac Biscuits With Macadamias (Australian),[{'text': 'Preheat oven to 160 degrees Celsius...,"[plain flour, oat, white sugar, coconut, macad...","[oat, butter, coconut, syrup, macadamia, sugar...",Title: Turkey Meatballs and Pasta in Broth\n\n...
1,d3f3833cbe,"[{'id': '50bc445af6.jpg', 'url': 'http://img.s...",[50bc445af6.jpg],"[{'text': '4 tablespoons unsalted butter'}, {'...",http://www.food.com/recipe/moroccan-cauliflowe...,test,Moroccan Cauliflower Soup,"[{'text': 'In a large saucepan, melt 2 Tbsp of...","[unsalt butter, oliv oil, onion, garlic clove,...","[butter, oil, onion, clove, pepper, egg, corns...",


In [4]:
# Column pipeline: ingredients -> extracted_ingredients -> cleaned_ingredients

def convert_to_text(ingredients:list[dict]):
    """Flatten a list of ingredient dicts into a single comma-separated string.

    Every value of every dict is taken, in order, and the values are joined
    with "," (no space). An empty list yields an empty string.
    """
    pieces = []
    for entry in ingredients:
        # Collect all values of this dict, preserving the dict's own order.
        pieces.extend(entry.values())
    return ",".join(pieces)

# Add the flattened, comma-separated ingredient text as a new column on every split.
for _split_df in (train_df, test_df, val_df):
    _split_df['ingredients_txt'] = _split_df['ingredients'].progress_apply(convert_to_text)

100%|██████████| 48388/48388 [00:00<00:00, 239405.08it/s]


In [9]:
# Plain Python lists of the flattened ingredient strings, one list per split.
train_ls, test_ls, val_ls = (
    split_df['ingredients_txt'].tolist()
    for split_df in (train_df, test_df, val_df)
)

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Checkpoint id shared by the tokenizer and the token-classification model.
ner_checkpoint = "Dizex/InstaFoodRoBERTa-NER"
# NOTE(review): hard-coded GPU index; this requires a machine exposing cuda:7.
device = "cuda:7"

tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
# Move the loaded weights onto the selected device.
model = model.to(device)

def convert_entities_to_list(text, entities: list[dict]) -> list[str]:
    """Turn NER entity dicts into the substrings of ``text`` they cover.

    Each entity must provide ``"start"``, ``"end"`` and ``"entity_group"``.
    Consecutive entities that share a label and whose gap
    (``start - previous end``) lies in ``[-1, 1]`` are fused into one span
    that ends where the later entity ends.

    Returns the covered substrings in the order the spans were built.
    """
    spans = []
    for entity in entities:
        begin = entity["start"]
        finish = entity["end"]
        label = entity["entity_group"]
        if spans:
            previous = spans[-1]
            # Touching, overlapping by one, or separated by one character.
            is_adjacent = -1 <= begin - previous["end"] <= 1
            if is_adjacent and previous["label"] == label:
                previous["end"] = finish
                continue
        spans.append({"start": begin, "end": finish, "label": label})

    return [text[span["start"]:span["end"]] for span in spans]


# Pass the device explicitly so the pipeline places its input tensors on the
# same device the model was moved to, instead of relying on the library's
# default device selection when `device` is omitted.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

def _extract_entities(examples):
    """Run the NER pipeline on each text and collect its merged entity strings.

    Returns one list of entity substrings per input text, in input order.
    A tqdm progress bar is shown while iterating.
    """
    return [
        convert_entities_to_list(example, pipe(example, aggregation_strategy="simple"))
        for example in tqdm(examples)
    ]


train_extract_ls = _extract_entities(train_ls)
test_extract_ls = _extract_entities(test_ls)
val_extract_ls = _extract_entities(val_ls)

# Store the extracted entities on each split's dataframe.
train_df['extracted_ingredients'] = train_extract_ls
test_df['extracted_ingredients'] = test_extract_ls
val_df['extracted_ingredients'] = val_extract_ls

100%|██████████| 48388/48388 [49:30<00:00, 16.29it/s] 
100%|██████████| 6048/6048 [06:28<00:00, 15.57it/s]
100%|██████████| 6049/6049 [06:18<00:00, 15.99it/s]
