In [7]:
# Imports.
import pandas as pd
import csv
import re
import math
from collections import Counter
from tqdm import tqdm

In [8]:
# Load the two input tables and report how many rows each contains.
# Every report line is followed by exactly one blank line.
grams = pd.read_csv("grams.csv")
print("Length of USDA portions dataset:", len(grams), end="\n\n")

ingr = pd.read_csv("ingr.csv")
print("Lenght of recipes dataset:", len(ingr), end="\n\n")

Length of USDA portions dataset: 32614

Lenght of recipes dataset: 30925



In [9]:
# Build the two name lists that the matching step compares:
#   ts   - normalised food names taken from the recipes (ingredients) dataset
#   comp - food descriptions taken from the USDA portions dataset
ts = []

# Each `food` cell is split on commas and stripped of brackets and single quotes
# (presumably it holds a stringified list such as "['olive oil', 'salt']" - TODO confirm).
# The column is iterated directly instead of being indexed with a manual counter:
# `ingr.food[indx]` is a label lookup and fails when the index is not exactly 0..n-1
# (for example after filtering rows). Missing cells are skipped because they have
# no text to split.
for food_cell in ingr.food.dropna():
    for element in food_cell.split(","):

        # Fragments containing "with" are treated as non-food text and dropped.
        # NOTE(review): this check is case-sensitive and runs before lower-casing,
        # so a fragment containing "With" is kept - confirm that this is intended.
        if "with" in element:
            continue

        # Normalise: lower-case, remove square brackets and single quotes,
        # then trim leading whitespace.
        cleaned = re.sub(r"[\[\]]", "", element.lower())
        cleaned = cleaned.replace("'", "").lstrip()
        ts.append(cleaned)

# Remove duplicates. Sorting gives a stable order: a plain list(set(...)) is ordered
# by string hashes, which are randomised per interpreter start, so the order of the
# output rows and the winner among equally similar USDA items would change from run
# to run.
ts = sorted(set(ts))

# Unique USDA food descriptions, in the same deterministic order. Missing
# descriptions are skipped because they cannot be tokenised.
comp = sorted(set(grams.Main_Food_Description.dropna()))

In [10]:
# Pattern matching one word: a maximal run of letters, digits or underscores.
WORD = re.compile(r"\w+")


def text_to_vector(text):
    """Convert a piece of text into a bag-of-words vector.

    The text is lower-cased before it is split into words, so "Milk" and "milk"
    are counted as the same word. This matters for the matching step: ingredient
    names are lower-cased when they are prepared, while USDA descriptions are
    used exactly as they were read from file and may contain capitals.

    Args:
        text: The string to tokenise. Any non-string value (for example None or
            a NaN read from a CSV file) is treated as empty text.

    Returns:
        collections.Counter mapping each lower-cased word to the number of times
        it occurs in ``text``. Empty when ``text`` has no words or is not a string.
    """
    if not isinstance(text, str):
        # Nothing to tokenise; an empty vector gives a cosine score of 0.0.
        return Counter()
    return Counter(WORD.findall(text.lower()))

In [11]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity between two word-count vectors.

    Args:
        vec1: Mapping of word -> count (for example a collections.Counter).
        vec2: Mapping of word -> count.

    Returns:
        float: The dot product over the words present in both mappings, divided
        by the product of the two vector lengths. 0.0 when that product is zero.
    """
    shared_words = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[word] * vec2[word] for word in shared_words)

    length1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    length2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    magnitude = length1 * length2

    # A zero magnitude means at least one vector has no non-zero counts;
    # return 0.0 rather than dividing by zero.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

In [12]:
# Parallel result lists, one entry per ingredient name. They are zipped together
# once every ingredient has been matched:
#   ingr_lst - the ingredient name
#   usda_lst - the USDA food item with the highest cosine similarity to it
#   cos_lst  - that highest similarity score
ingr_lst = []
usda_lst = []
cos_lst = []

# Turn every USDA description into a vector once, up front. The vectors do not
# depend on the ingredient being matched, so rebuilding them inside the loop below
# would repeat the same tokenisation len(ts) times.
usda_vectors = [(name, text_to_vector(name)) for name in comp]

# This for loop iterates through food items from the ingredients dataset.
for element in tqdm(ts):

    # The chosen food item is transformed to a vector.
    vector1 = text_to_vector(element)

    # Score the chosen item against every USDA item and keep the best pair.
    # max() returns the first pair with the highest score, so ties go to the
    # earliest item in comp. It raises ValueError if comp is empty.
    best_name, best_score = max(
        ((name, get_cosine(vector1, vector2)) for name, vector2 in usda_vectors),
        key=lambda pair: pair[1],
    )

    # Record the ingredient, its best USDA match and the matching score.
    ingr_lst.append(element)
    usda_lst.append(best_name)
    cos_lst.append(best_score)

100%|█████████████████████████████████████| 16586/16586 [15:12<00:00, 18.19it/s]


In [13]:
# The three result lists are zipped into rows of
# (ingredient name, best USDA match, similarity score), named match_lst.
match_lst = list(zip(ingr_lst, usda_lst, cos_lst))

# match_lst is written to link.csv, one row per ingredient and without a header row.
# newline="" lets the csv module control line endings itself. The encoding is pinned
# to UTF-8 because open() otherwise uses the platform's locale encoding, which can
# fail on non-ASCII food names or produce a file that UTF-8 readers (such as
# pandas.read_csv with its default settings) cannot decode.
with open('link.csv', 'w', newline="", encoding="utf-8") as csvfile:
    fwriter = csv.writer(csvfile)
    fwriter.writerows(match_lst)