# Task 1.1 - Construct RLTK Datasets

In [9]:
# %pip install rltk

import rltk
import csv
import pandas as pd
import numpy as np

# Using pandas to inspect the dataframe and understand the data

source_2 = pd.read_csv('data/Sample - Nutrition Data Self - Source 2.csv')
source_1 = pd.read_csv('data/Sample - Nutrition Value with Categories - Source 1.csv')
flavor = pd.read_csv('data/Flavor DB.csv')

source_2 = source_2.apply(pd.to_numeric, errors="ignore")
source_1 = source_1.apply(pd.to_numeric, errors="ignore")

print(source_2.columns)
print(source_1.columns)
print(flavor.columns)

print(source_2.describe())
print(source_1.describe())

source_2['Food Name'].replace('', np.nan, inplace=True)
source_1['Food Name'].replace('', np.nan, inplace=True)

source_2.dropna(subset=['Food Name'], inplace=True)
source_1.dropna(subset=['Food Name'], inplace=True)

source_2.drop_duplicates('Food Name')
source_1.drop_duplicates('Food Name')

# source_2.to_csv('data/Nutrition Data Self Data UTF-8.csv', encoding="UTF-8")
# source_1.to_csv('data/Nutritional Data with Categories UTF-8.csv', encoding="UTF-8")
# flavor.to_csv('data/Flavor DB UTF-8.csv', encoding="UTF-8")

Index(['Food Name', 'Glycemic Load', 'Completeness Score', 'Amino Acid Score'], dtype='object')
Index(['Ash', 'Calcium', 'Calories', 'Carbohydrate', 'Copper', 'Fat', 'Iron',
       'Magnesium', 'Potassium', 'Protein', 'Saturated fatty acids', 'Sodium',
       'Vitamin A, RAE', 'Vitamin B6', 'Water', 'Zinc', 'Food Name',
       'Fatty acids, total trans', 'Manganese', 'Vitamin D', 'Sub-Category',
       'Category'],
      dtype='object')
Index(['Food', 'Matching Food', 'Category of Matching Food',
       'Number of common flavor molecules'],
      dtype='object')
       Glycemic Load  Completeness Score  Amino Acid Score
count     535.000000          535.000000        340.000000
mean        8.082243           40.177570         93.402941
std        22.397682           24.313578         43.987857
min         0.000000            0.000000          0.000000
25%         0.000000           23.000000         68.000000
50%         2.000000           37.000000         94.500000
75%         8.0000

Unnamed: 0,Ash,Calcium,Calories,Carbohydrate,Copper,Fat,Iron,Magnesium,Potassium,Protein,...,"Vitamin A, RAE",Vitamin B6,Water,Zinc,Food Name,"Fatty acids, total trans",Manganese,Vitamin D,Sub-Category,Category
0,1.72,13.11,127.0,2.13,0.06,9.280,1.17,13.68,147.63,8.70,...,0.00,0.085,35.17,3.47,Roast beef spread,,,,,Branded Foods
1,0.76,18.72,103.0,19.35,0.05,2.184,0.34,16.32,60.96,1.59,...,,0.036,24.12,0.26,"Van's, Totally Original Pancakes, Gluten Free",0.006,0.444,,,Branded Foods
2,0.37,5.39,123.0,21.72,0.08,4.224,0.56,11.34,62.09,1.13,...,0.28,0.019,0.79,0.18,"Cookies, regular, dry mix, brownies",,0.100,,Crusts_AND_Dough,Branded Foods
3,0.94,52.45,95.0,13.72,0.03,3.430,0.58,7.09,53.30,2.07,...,7.37,0.019,8.19,0.17,"Biscuits, prepared, dry mix, plain or buttermilk",,0.071,,Tomatoes,Foundation Foods
4,,0.00,100.0,21.00,,2.499,0.72,,89.90,3.00,...,,,,,"Act ii kettle corn, 8.75 oz by Conagra Brands",0.000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,0.99,15.82,149.0,0.00,0.12,6.452,1.36,24.86,383.07,21.16,...,0.00,0.589,83.97,3.80,"Pork, raw, separable lean only, blade (steaks)...",0.043,0.015,0.68,,Branded Foods
949,0.61,87.00,93.0,12.15,,1.710,0.03,,,7.22,...,25.00,,78.40,,"Yogurt, CHOBANI, apricot, 2% fat, Greek",0.070,,,,Branded Foods
950,,0.00,150.0,18.00,,6.999,0.40,,0.00,3.00,...,,,,,Annie's organic cheddar snack mix by GENERAL M...,0.000,,,,
951,0.16,1.30,26.0,0.96,0.00,1.758,0.08,1.30,23.79,1.51,...,5.46,0.014,8.60,0.14,Poultry salad sandwich spread,,0.001,0.03,,Branded Foods


In [10]:
unique_flavors_1 = set(list(flavor['Food']))
unique_flavors_2 = set(list(flavor['Matching Food']))
unique_flavors = unique_flavors_1.union(unique_flavors_2)
unique_flavors = list(unique_flavors)

flavor_dict = dict()

for i in unique_flavors:
    flavor_dict[i] = list()
    for index, row in source_2.iterrows():
        tokens = row['Food Name'].split()
        if(i.lower() in tokens):
            flavor_dict[i].append(row['Food Name'])

In [11]:
# Create record schema for both data sources

tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()
import re

class Flavor(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''
    
    @rltk.cached_property
    def id(self):
        return str(self.raw_object['Food']+self.raw_object['Matching Food'])
    
    @rltk.cached_property
    def food1(self):
        if(len(self.raw_object['Food']) == 0):
            return ' '
        else:
            return str(self.raw_object['Food'])
    
    @rltk.cached_property
    def food1_tokens(self):
        return set(tokenizer.tokenize(self.food1))
    
    @rltk.cached_property
    def food2(self):
        if(len(self.raw_object['Matching Food']) == 0):
            return ' '
        else:
            return str(self.raw_object['Matching Food'])
    
    @rltk.cached_property
    def food2_tokens(self):
        return set(tokenizer.tokenize(self.food2))
    
    @rltk.cached_property
    def weight(self):
        return self.raw_object['Number of common flavor molecules']
    


class Source1Record(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''
    
    @rltk.cached_property
    def id(self):
        return self.raw_object['Food Name']

    @rltk.cached_property
    def name_string(self):
        if(len(self.raw_object['Food Name']) == 0):
            return ' '
        else:
            return str(self.raw_object['Food Name'])
    
    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))
    
    @rltk.cached_property
    def Category(self):
        return self.raw_object['Category']

    @rltk.cached_property
    def Sub_Category(self):
        return self.raw_object['Sub-Category']

    @rltk.cached_property
    def Ash(self):
        return self.raw_object['Ash']
    
    @rltk.cached_property
    def Calcium(self):
        return self.raw_object['Calcium']
    
    @rltk.cached_property
    def Calories(self):
        return self.raw_object['Calories']
    
    @rltk.cached_property
    def Carbohydrate(self):
        return self.raw_object['Carbohydrate']
    
    @rltk.cached_property
    def Copper(self):
        return self.raw_object['Copper']
    
    @rltk.cached_property
    def Fat(self):
        return self.raw_object['Fat']
    
    @rltk.cached_property
    def Iron(self):
        return self.raw_object['Iron']
    
    @rltk.cached_property
    def Magnesium(self):
        return self.raw_object['Magnesium']
    
    @rltk.cached_property
    def Potassium(self):
        return self.raw_object['Potassium']
    
    @rltk.cached_property
    def Protein(self):
        return self.raw_object['Protein']

    @rltk.cached_property
    def Saturated_fatty_acids(self):
        return self.raw_object['Saturated fatty acids']
    
    @rltk.cached_property
    def Sodium(self):
        return self.raw_object['Sodium']
    
    @rltk.cached_property
    def Vitamin_A_RAE(self):
        return self.raw_object['Vitamin A, RAE']
    
    @rltk.cached_property
    def Vitamin_B6(self):
        return self.raw_object['Vitamin B6']
    
    @rltk.cached_property
    def Water(self):
        return self.raw_object['Water']
    
    @rltk.cached_property
    def Zinc(self):
        return self.raw_object['Zinc']
    
    @rltk.cached_property
    def Fatty_acids_total_trans(self):
        return self.raw_object['Fatty acids, total trans']
    
    @rltk.cached_property
    def Manganese(self):
        return self.raw_object['Manganese']
    
    @rltk.cached_property
    def Vitamin_D(self):
        return self.raw_object['Vitamin D']

class Source2Record(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = ''
    
    @rltk.cached_property
    def id(self):
        return self.raw_object['Food Name']

    @rltk.cached_property
    def name_string(self):
        if(len(self.raw_object['Food Name']) == 0):
            return ' '
        else:
            return str(self.raw_object['Food Name'])
    
    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))

    @rltk.cached_property
    def Glycemic_Load(self):
        return self.raw_object['Glycemic Load']

    @rltk.cached_property
    def Completeness_Score(self):
        return self.raw_object['Completeness Score']

    @rltk.cached_property
    def Amino_Acid_Score(self):
        return self.raw_object['Amino Acid Score']

In [13]:
flavor_ds = rltk.Dataset(rltk.CSVReader('data/Flavor DB UTF-8.csv'), record_class=Flavor)
ds2 = rltk.Dataset(rltk.CSVReader('data/Sample - Nutrition Data Self - Source 2.csv'), record_class=Source2Record)
ds1 = rltk.Dataset(rltk.CSVReader('data/Sample - Nutrition Value with Categories - Source 1.csv'), record_class=Source1Record)
# ds1 = rltk.Dataset(rltk.DataFrameReader(df_utf8), record_class=Source1Record)

# Task 1.2 - Blocking

In [14]:
# Cleaning pipeline and comparision functions

import string
import math

def toLower(text):
    return str(text).lower()

def extraSpaces(text):
    return re.sub(' +', ' ', text, flags=re.MULTILINE)

def removeNum(text):
    return re.sub(r'[0-9]+', '', text)

def removePunct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def basicCleaningPipeline(s):
    s1 = toLower(s)
    s1 = removePunct(s1)
    s1 = extraSpaces(s1)
    return s1

def jaccard(r1, r2):
    s1 = r1.name_tokens
    s2 = r2.name_tokens
    
    return rltk.jaccard_index_similarity(s1, s2)

def page(r1, r2):
    p1 = r1.pages
    p2 = r2.pages

    if(abs(p1-p2) < 3):
        return 1
    else:
        return 0

In [15]:
def compound(name):
    name = basicCleaningPipeline(name)
    size = len(name.split())
    label = str(size-1) + '-' + str(size+1)
    print(label)
    return label

bg = rltk.HashBlockGenerator()
block = bg.generate(
    bg.block(ds1, function_=lambda x: compound(x.name_string)),
    bg.block(ds2, function_=lambda x: compound(x.name_string))
)

2-4
5-7
4-6
6-8
8-10
4-6
10-12
6-8
5-7
4-6
1-3
4-6
13-15
10-12
5-7
10-12
8-10
8-10
12-14
10-12
9-11
7-9
6-8
8-10
12-14
4-6
3-5
2-4
4-6
5-7
5-7
3-5
10-12
5-7
8-10
4-6
9-11
11-13
1-3
8-10
7-9
14-16
3-5
5-7
5-7
5-7
4-6
3-5
8-10
4-6
5-7
5-7
4-6
12-14
7-9
2-4
4-6
8-10
4-6
5-7
6-8
5-7
11-13
8-10
4-6
4-6
8-10
5-7
7-9
4-6
7-9
3-5
3-5
6-8
13-15
8-10
9-11
11-13
1-3
5-7
3-5
10-12
5-7
12-14
6-8
8-10
5-7
12-14
11-13
5-7
7-9
11-13
9-11
10-12
4-6
12-14
10-12
6-8
2-4
10-12
10-12
7-9
1-3
6-8
11-13
1-3
7-9
5-7
5-7
4-6
8-10
13-15
3-5
11-13
12-14
12-14
2-4
7-9
5-7
7-9
13-15
11-13
12-14
11-13
4-6
3-5
6-8
8-10
5-7
7-9
8-10
2-4
12-14
12-14
3-5
3-5
4-6
8-10
3-5
9-11
3-5
5-7
5-7
1-3
4-6
6-8
10-12
7-9
4-6
14-16
3-5
5-7
6-8
3-5
13-15
3-5
3-5
2-4
5-7
6-8
5-7
4-6
6-8
12-14
9-11
12-14
6-8
6-8
5-7
8-10
15-17
4-6
7-9
2-4
4-6
6-8
5-7
4-6
6-8
3-5
12-14
11-13
5-7
4-6
10-12
6-8
10-12
8-10
6-8
8-10
3-5
4-6
1-3
3-5
15-17
7-9
11-13
5-7
4-6
4-6
10-12
7-9
11-13
6-8
8-10
9-11
11-13
1-3
7-9
10-12
13-15
7-9
2-4
8-10
2-4
11-13
3-

# Task 1.3 - Entity Linking

In [16]:

def perfect_match(r1, r2):
    name1 = basicCleaningPipeline(r1.name_string)
    name2 = basicCleaningPipeline(r2.name_string)

    if(name1 == name2):
        return 1
    
    fuzzy_thresh = 0.85

    name_l_score = round(rltk.levenshtein_similarity(name1, name2), 2)

    if(name_l_score > fuzzy_thresh):
        return 1
    
    return 0

In [17]:
# Rule based method

MY_TRESH = 0.85 

def rule_based_method(r1, r2):
    jaccard_score = jaccard(r1, r2)
    perfect_match_score = perfect_match(r1, r2)
    
    total = 0.20 * jaccard_score + \
            0.80 * perfect_match_score
    
    # return two values: boolean if they match or not, float to determine confidence
    return round(total, 2) > MY_TRESH, total

# Task 1.4 - Record Linkage

In [19]:
"""
Header:
Name,Ash,Calcium,Calories,Carbohydrate,Copper,Fat,Iron,Magnesium,Potassium,Protein,Saturated_fatty_acids,Sodium,Vitamin_A_RAE,Vitamin_B6,Water,Zinc,Fatty_acids_total_trans,Manganese,Vitamin_D,Glycemic_Load,Completeness_Score,Amino_Acid_Score,Category,Sub_Category

From 18 mins to 2 mins for 10Kx10K records with blocking. That is ~10 times faster.
"""

true_predictions = list()

with open('merged_dataset.csv', mode='w') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    candidate_pairs = rltk.get_record_pairs(ds1, ds2, block=block)

    hit_count = 0

    for r1, r2 in candidate_pairs:
        result, confidence = rule_based_method(r1, r2)
        if(result):
            hit_count += 1
            true_predictions.append((r1.id, r2.id))
            print(r1.name_string, "===", r2.name_string, confidence)
            writer.writerow((r2.id, r1.Ash, r1.Calcium, r1.Calories, r1.Carbohydrate, r1.Copper, r1.Fat, r1.Iron, r1.Magnesium, r1.Potassium, r1.Protein, r1.Saturated_fatty_acids, r1.Sodium, r1.Vitamin_A_RAE, r1.Vitamin_B6, r1.Water, r1.Zinc, r1.Fatty_acids_total_trans, r1.Manganese, r1.Vitamin_D, r2.Glycemic_Load, r2.Completeness_Score, r2.Amino_Acid_Score, r1.Category, r1.Sub_Category))

print("Hits:", hit_count)

Broccoli, cooked, chinese === broccoli cooked chinese 0.8800000000000001
Broccoli, raw, chinese === broccoli raw chinese 0.8800000000000001
Catsup, low sodium === catsup low sodium 0.8800000000000001
Babyfood, pear, juice === babyfood pear juice 0.8800000000000001
DENNY'S, chicken strips === dennys chicken strips 0.8571428571428572
Bread, toasted, egg === bread toasted egg 0.8800000000000001
Chrysanthemum, raw, garland === chrysanthemum raw garland 0.8800000000000001
Onions, raw, welsh === onions raw welsh 0.8800000000000001
Whey, fluid, sweet === whey fluid sweet 0.8800000000000001
Egg, dried, white === egg dried white 0.8800000000000001
Bratwurst, cooked, veal === bratwurst cooked veal 0.8800000000000001
Broccoli, raw, leaves === broccoli raw leaves 0.8800000000000001
Whey, fluid, acid === whey fluid acid 0.8800000000000001
Ostrich, cooked, oyster === ostrich cooked oyster 0.8800000000000001
Egg substitute, powder === egg substitute powder 0.8800000000000001
Ice cream sandwich === ic

# Task 2

# Task 2.1 - Construct KG

In [None]:
# ! pip install rdflib

In [38]:
# Define namespaces

from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF

FOOD = Namespace('https://dbpedia.org/ontology/Food/')
NUTRITIONINFORMATION = Namespace('https://schema.org/NutritionInformation/')
FLAVOR = Namespace('https://dsci558.org/Flavor/')
FOODCATEGORY = Namespace('https://dsci558.org/FoodCategory/')


In [39]:
# Bind to knowledge graph

my_kg = Graph()
my_kg.bind('food', FOOD)
my_kg.bind('nutriinfo', NUTRITIONINFORMATION)
my_kg.bind('flavor', FLAVOR)
my_kg.bind('foodcategory', FOODCATEGORY)


In [40]:
# URI parser to replace space with hyfhen

def parseURI(identifier):
    return identifier.replace(' ', '-')

In [41]:
# Create nodes and insert into the KG

merged_df = pd.read_csv('merged_dataset.csv')
merged_df.Category = merged_df.Category.astype('string')

for index, row in merged_df.iterrows():
    g_title = row['Name']
    
    food_node_uri = URIRef(FOOD[parseURI(g_title)])

    my_kg.add((food_node_uri, RDF.type, FOOD['title']))
    my_kg.add((food_node_uri, FOOD['name'], Literal(g_title, datatype=XSD.string)))

    g_calories = row['Calories']
    g_fat = row['Fat']
    g_protein = row['Protein']
    g_sfa = row['Saturated_fatty_acids']
    g_vit_a = row['Vitamin_A_RAE']
    g_vit_b = row['Vitamin_B6']
    g_fatt = row['Fatty_acids_total_trans']
    g_vit_d = row['Vitamin_D']

    # Glycemic_Load,Completeness_Score,Amino_Acid_Score
    g_gl = row['Glycemic_Load']
    g_cs = row['Completeness_Score']
    g_aas = row['Amino_Acid_Score']

    g_category = row['Category']
    if(isinstance(g_category, str)):
        category_uri = URIRef(FOODCATEGORY[parseURI(g_category)])
        my_kg.add((category_uri, RDF.type, FOODCATEGORY['title']))

        my_kg.add((category_uri, FOODCATEGORY['has'], food_node_uri))

    # g_isbn13 = good_df['ISBN13'][good_index] if len(str(good_df['ISBN13'][good_index]).strip()) > 0 else barnes_df['ISBN13'][barnes_index]
    # g_pages = good_df['PageCount'][good_index] if len(str(good_df['PageCount'][good_index]).strip()) > 0 else barnes_df['Pages'][barnes_index]
    # g_author = good_df['FirstAuthor'][good_index] if len(str(good_df['FirstAuthor'][good_index]).strip()) > 0 else barnes_df['Author1'][barnes_index]

    if(len(str(g_calories).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['calories'], Literal(g_calories, datatype=XSD.float)))
    if(len(str(g_fat).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['fat'], Literal(g_fat, datatype=XSD.float)))
    if(len(str(g_protein).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['protein'], Literal(g_protein, datatype=XSD.float)))
    if(len(str(g_sfa).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['saturatedFattyAcids'], Literal(g_sfa, datatype=XSD.float))) 
    if(len(str(g_vit_a).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['vitA'], Literal(g_vit_a, datatype=XSD.float)))
    if(len(str(g_vit_b).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['vitB'], Literal(g_vit_b, datatype=XSD.float)))
    if(len(str(g_fatt).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['fattyAcidsTotalTrans'], Literal(g_fatt, datatype=XSD.float)))
    if(len(str(g_vit_d).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['vitD'], Literal(g_vit_d, datatype=XSD.float))) 
    if(len(str(g_gl).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['glycemicLoad'], Literal(g_gl, datatype=XSD.float)))
    if(len(str(g_cs).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['completenessScore'], Literal(g_cs, datatype=XSD.float)))
    if(len(str(g_aas).strip()) > 0):
        my_kg.add((food_node_uri, NUTRITIONINFORMATION['aminoAcidScore'], Literal(g_aas, datatype=XSD.float)))       
    # if(len(str(g_isbn13).strip()) > 0):
    #     my_kg.add((book_node_uri, BOOK['isbn'], Literal(g_isbn13, datatype=XSD.string)))
    # if(len(str(g_pages).strip()) > 0):
    #     my_kg.add((book_node_uri, BOOK['numberOfPages'], Literal(g_pages, datatype=XSD.positiveInteger)))

    # author_node_uri = URIRef(PERSON[parseURI(g_author)])

    # my_kg.add((author_node_uri, RDF.type, PERSON['name']))
    # my_kg.add((author_node_uri, PERSON['name'], Literal(g_author, datatype=XSD.string)))
    # my_kg.add((book_node_uri, BOOK['author'], author_node_uri))

    # g_rating_good = good_df['Rating'][good_index]
    # g_number_rating_good = good_df['NumberofRatings'][good_index]
    # g_number_review_good = good_df['NumberofReviews'][good_index]

    # g_sales_barnes = barnes_df['Salesrank'][barnes_index]
    # g_rating_count_barnes = barnes_df['Ratingscount'][barnes_index]
    # g_rating_value_barnes = barnes_df['Ratingvalue'][barnes_index]

    # if(len(str(g_rating_good).strip()) > 0 and len(str(g_rating_value_barnes).strip()) > 0):
    #     rating_node_uri = URIRef(RATING[parseURI(g_title)])
    #     my_kg.add((book_node_uri, RATING['aggregate'], Literal(str(g_rating_good)+"+"+str(g_rating_value_barnes))))
    #     my_kg.add((book_node_uri, RATING['good'], Literal(g_rating_good)))
    #     my_kg.add((book_node_uri, RATING['barnes'], Literal(g_rating_value_barnes)))

    #     my_kg.add((book_node_uri, RATING['goodNumRating'], Literal(g_number_rating_good)))
    #     my_kg.add((book_node_uri, RATING['goodNumReview'], Literal(g_number_review_good)))
    #     my_kg.add((book_node_uri, RATING['barnesNumRating'], Literal(g_rating_count_barnes)))
    #     my_kg.add((book_node_uri, RATING['barnesSale'], Literal(g_sales_barnes)))

    #     my_kg.add((book_node_uri, BOOK['aggregateRating'], rating_node_uri))


    # g_publisher = good_df['Publisher'][good_index] if len(str(good_df['Publisher'][good_index])) > 0 else barnes_df['Publisher'][barnes_index]
    # g_publish_date = good_df['PublishDate'][good_index] if len(str(good_df['PublishDate'][good_index])) > 0 else barnes_df['PublicationDate'][barnes_index]

    # if(len(str(g_publish_date).strip()) > 0):
    #     my_kg.add((book_node_uri, BOOK['datePublished'], Literal(g_publish_date, datatype=XSD.string)))

    # if(len(str(g_publisher)) > 0):
    #     publisher_node_uri = URIRef(PUBLISHER[parseURI(g_publisher)])
    #     my_kg.add((book_node_uri, BOOK['publisher'], rating_node_uri))
        

    # g_format = good_df['Format'][good_index]
    # g_lang = good_df['Language'][good_index]

    # g_paperback_price = barnes_df['Paperbackprice'][barnes_index].replace('$', '')
    # g_hardcover_price = barnes_df['Hardcoverprice'][barnes_index].replace('$', '')
    # g_nookbook_price = barnes_df['Nookbookprice'][barnes_index].replace('$', '')
    # g_audiobook_price = barnes_df['Audiobookprice'][barnes_index].replace('$', '')


    # price_node_uri = URIRef(PRICE[parseURI(g_title)])
    
    # if(len(str(g_paperback_price).strip()) > 0):
    #     my_kg.add((price_node_uri, PRICE['paperback'], Literal(float(g_paperback_price))))
    # if(len(str(g_hardcover_price).strip()) > 0):
    #     my_kg.add((price_node_uri, PRICE['hardcover'], Literal(float(g_hardcover_price))))
    # if(len(str(g_nookbook_price).strip()) > 0):
    #     my_kg.add((price_node_uri, PRICE['nookbook'], Literal(float(g_nookbook_price))))
    # if(len(str(g_audiobook_price).strip()) > 0):
    #     my_kg.add((price_node_uri, PRICE['audiobook'], Literal(float(g_audiobook_price))))

    # my_kg.add((book_node_uri, BOOK['cost'], rating_node_uri))

from itertools import combinations

hit_count = 0

for key, value in flavor_dict.items():
    for pair in list(combinations(value, 2)):
        # try:
        food_uri_1 = URIRef(FOOD[parseURI(pair[0])])
        food_uri_2 = URIRef(FOOD[parseURI(pair[1])])
        # except KeyError:
        #     continue

        my_kg.add((food_uri_1, FLAVOR['flavorOf'], food_uri_2))
        my_kg.add((food_uri_2, FLAVOR['flavorOf'], food_uri_1))
        hit_count += 1

print("Hits:", hit_count)


Hits: 20307


In [42]:
# Serialize graph and dump to file

my_kg.serialize('large_graph.ttl', format="turtle")

<Graph identifier=N2f9ad2027b6248f89e625a5c0e55c004 (<class 'rdflib.graph.Graph'>)>