In [45]:
import numpy as np
import pandas as pd
import math as math
import matplotlib.pyplot as plt
import json
from typing import List
from urllib.parse import unquote

# Clean the Ground Truth dataset

In [23]:
# Load the ground-truth file; positional column 0 is used as the ingredient
# and column 1 as its substitute.
data = pd.read_json('groundtruth_food_review.json')
ingredients = data[0].rename("ingredient")
substitutes = data[1].rename("substitute")

In [24]:
# Preview the first 100 entries of the ingredient column.
ingredients.head(100)

0     http://idea.rpi.edu/heals/kb/ingredientname/mo...
1     http://idea.rpi.edu/heals/kb/ingredientname/ta...
2     http://idea.rpi.edu/heals/kb/ingredientname/pl...
3     http://idea.rpi.edu/heals/kb/ingredientname/ma...
4     http://idea.rpi.edu/heals/kb/ingredientname/ca...
                            ...                        
95    http://idea.rpi.edu/heals/kb/ingredientname/wh...
96    http://idea.rpi.edu/heals/kb/ingredientname/vi...
97    http://idea.rpi.edu/heals/kb/ingredientname/wa...
98    http://idea.rpi.edu/heals/kb/ingredientname/re...
99    http://idea.rpi.edu/heals/kb/ingredientname/ev...
Name: ingredient, Length: 100, dtype: object

In [25]:
# Row counts of the two columns, one per line (ingredients first).
print(len(ingredients), len(substitutes), sep="\n")

3846
3846


In [38]:
# Pool both columns so every URI that appears on either side is considered.
frames = [ingredients, substitutes]
allIngredients = pd.concat(frames)
print(len(allIngredients))
# De-duplicate while keeping first-appearance order. The previous
# list(set(...)) produced an order that changes between kernel restarts
# (string hashing is randomised per process), which made the exported JSON
# differ from run to run for identical input.
allIngredients = allIngredients.drop_duplicates().tolist()
print(len(allIngredients))

7692
1737


In [40]:
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``.

    If ``text`` does not start with ``prefix`` it is returned unchanged.
    """
    return text[len(prefix):] if text.startswith(prefix) else text

In [46]:
# Two steps recover the readable label from a URI:
#   1. remove_prefix drops the ingredient namespace,
#   2. unquote decodes percent-escapes such as %20.
unquote(
    remove_prefix(
        'http://idea.rpi.edu/heals/kb/ingredientname/red%20beans',
        'http://idea.rpi.edu/heals/kb/ingredientname/',
    )
)

'red beans'

In [56]:
class Ingredient:
    """An ingredient identified by its URI, with a decoded display name."""

    def __init__(self, uri):
        """Store ``uri`` and derive ``name`` from it.

        ``name`` is the URI with the ingredient namespace stripped (when
        present) and percent-escapes decoded.
        """
        namespace = 'http://idea.rpi.edu/heals/kb/ingredientname/'
        # Keep the assignment order (uri, then name): instances are
        # serialised through __dict__ elsewhere in this notebook, so this
        # order is the key order of the resulting JSON objects.
        self.uri = uri
        encoded_label = remove_prefix(uri, namespace)
        self.name = unquote(encoded_label)

class OpenFoodFact:
    """Container holding a list of ``Ingredient`` objects."""

    def __init__(self, ingredients: List[Ingredient]):
        """Keep a reference to ``ingredients`` (the list is not copied)."""
        self.ingredients = ingredients
        
# Wrap every unique URI in an Ingredient.
# NOTE(review): this rebinds `ingredients`, which the loading cell bound to a
# pandas column; re-running earlier cells after this one will see the list.
ingredients = list(map(Ingredient, allIngredients))

openfoodfact = OpenFoodFact(ingredients=ingredients)

In [57]:
# Serialise the container; objects json does not know how to encode fall back
# to their attribute dict. ensure_ascii=False keeps non-ASCII characters
# as-is, which is why the file is written as UTF-8.
json_data = json.dumps(openfoodfact, default=lambda o: o.__dict__, indent=4, ensure_ascii=False)
# The context manager closes the file even if write() raises; the previous
# explicit open()/close() pair leaked the handle on error.
with open("ingredients.json", "w", encoding='utf-8') as text_file:
    text_file.write(json_data)