In [1]:
import codecs
import csv
import io
import json
import locale
import os
import re
from collections import Counter
from xml.dom import minidom

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from networkx.algorithms import bipartite
# locale.strxfrm is used as the sort key for ingredient names further down,
# so the process locale decides their ordering.
# 'de-DE.utf-8' is the Windows spelling of the German locale; POSIX systems
# spell it 'de_DE.UTF-8'. Try each spelling in turn so this cell does not fail
# with locale.Error merely because the notebook runs on another platform.
for _locale_name in ('de-DE.utf-8', 'de_DE.UTF-8', 'de_DE.utf8'):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
        break
    except locale.Error:
        continue
else:
    print('German locale is not available; sorting falls back to the current locale.')

# Widen the pandas display limits so large frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

#### Define RecipeCollection class

In [2]:
class RecipeCollection:
    """Recipe collection with subcollections, backed by an ingredients catalog.

    The collection description is a JSON object with

    * ``'collections'``: mapping subcollection letter ->
      ``{'name': ..., 'author': ..., 'recipes': [recipe names]}``
    * ``'recipes'``: list of ``{'recipeName': ..., 'ingredients': [...]}`` records

    The catalog is a JSON object keyed by ingredient. Its values are attached
    to the graph nodes as attribute dicts (``toDot``/``toCSV`` read the node
    attributes ``'i-name'`` and ``'i-class'``, presumably supplied by the
    catalog -- verify against the catalog file).
    """

    def __init__(self, coll_path, coll_fn, cat_path, cat_fn):
        """Load the collection description and the ingredients catalog.

        Parameters
        ----------
        coll_path, coll_fn : str
            Directory and file name of the collection description (JSON, UTF-8).
        cat_path, cat_fn : str
            Directory and file name of the ingredients catalog (JSON, UTF-8).

        The files are opened via joined paths; the working directory of the
        process is left untouched.
        """
        self.coll_path = coll_path
        self.coll_fn   = coll_fn
        self.cat_path  = cat_path
        self.cat_fn    = cat_fn
        with open(os.path.join(coll_path, coll_fn), encoding='utf-8') as file:
            self.coll = json.load(file)
        with open(os.path.join(cat_path, cat_fn), encoding='utf-8') as file:
            self.cat = json.load(file)
        self.catEntries = list(self.cat)

        self.subCollLtrs = list(self.coll.get('collections'))
        self.recipes     = list(self.coll.get('recipes'))
        # Distinct ingredients over all recipes, sorted so the order is
        # reproducible between runs (a bare set has arbitrary order).
        distinct = {igt for rcp in self.recipes for igt in rcp.get('ingredients')}
        self.ingredients = sorted(distinct, key=locale.strxfrm)

    def __str__(self):
        """Return a two-line summary of the collection and catalog sizes."""
        return f"Collection with {len(self.recipes)} recipes in {len(self.subCollLtrs)} subcollections with {len(self.ingredients)} distinct ingredients\nsupported by ingredients catalog with {len(self.catEntries)} entries\n"

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _checkColl(self, coll, purpose, unknown_single):
        """Validate a subcollection selector; print the reason and return False if invalid.

        ``coll`` must be a single-character string naming an existing
        subcollection, or a list of exactly two existing subcollection letters.
        ``purpose`` is inserted into the messages about the list length;
        ``unknown_single`` is the message template (one ``{}`` placeholder)
        used when a single letter is unknown. Any other type is rejected
        silently.
        """
        if isinstance(coll, str):
            if len(coll) != 1:
                print('Use a single character for subcollection specification.')
                return False
            if coll not in self.subCollLtrs:
                print(unknown_single.format(coll))
                return False
            return True
        if isinstance(coll, list):
            if len(coll) > 2:
                print(f'Two subcollections is maximum for {purpose}.')
                return False
            if len(coll) < 2:
                # Previously fell through to coll[0] / coll[1] and raised IndexError.
                print(f'Exactly two subcollections are required for {purpose}.')
                return False
            for ltr in coll:
                if ltr not in self.subCollLtrs:
                    print (f"The subcollection {ltr} is not contained in this collection.")
                    return False
            return True
        return False

    def _recipesOf(self, ltrs):
        """Return the recipe records whose name is listed in any of the subcollections ``ltrs``."""
        colls = self.coll.get('collections')
        names = {nm for ltr in ltrs for nm in colls.get(ltr).get('recipes')}
        return [rcp for rcp in self.coll.get('recipes') if rcp.get('recipeName') in names]

    def _igtGraph(self, i2r, coll):
        """Build the weighted ingredient co-occurrence graph for the recipe records ``i2r``.

        A bipartite recipe/ingredient graph is projected onto the ingredients.
        Nodes receive their catalog entry plus ``'occ'`` (number of
        occurrences over ``i2r``); edges receive an ``'id'`` made of the two
        endpoint names in locale-sorted order. Raises ``KeyError`` if an
        ingredient has no catalog entry.
        """
        B = nx.Graph(from_coll=coll, created_by='fruschtique RecipeCollection')
        top = [recipe.get('recipeName') for recipe in i2r]
        # Counter keeps first-occurrence order, so its keys double as the
        # ordered list of distinct ingredients.
        occ_dict = Counter(igt for recipe in i2r for igt in recipe.get('ingredients'))
        bottom = list(occ_dict)
        B.add_nodes_from(top, bipartite=0)
        B.add_nodes_from(bottom, bipartite=1)
        B.add_edges_from((recipe.get('recipeName'), igt)
                         for recipe in i2r for igt in recipe.get('ingredients'))
        B = bipartite.weighted_projected_graph(B, bottom)
        nx.set_node_attributes(B, {igt: self.cat[igt] for igt in bottom})
        nx.set_node_attributes(B, {igt: {'occ': n} for igt, n in occ_dict.items()})
        e_attr = {}
        for u, v in B.edges():
            first, second = sorted((u, v), key=locale.strxfrm)
            e_attr[(u, v)] = {'id': str(first) + '--' + str(second)}
        nx.set_edge_attributes(B, e_attr)
        return B

    @staticmethod
    def _dotQuote(value):
        """Return ``value`` as a double-quoted DOT ID with backslashes and quotes escaped."""
        return '"' + str(value).replace('\\', '\\\\').replace('"', '\\"') + '"'

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def infoSubcolls (self):
        """Return one text line per subcollection: letter, name, author, number of recipes."""
        lines = []
        for ltr, sc in self.coll.get('collections').items():
            lines.append(f"subcollection {ltr} -- name: {sc.get('name')}, author: {sc.get('author')}, {len(sc.get('recipes'))} recipes\n")
        return ''.join(lines)

    def recipes_list (self,coll=None):
        """Return the recipes of the collection.

        Without ``coll`` the list of all recipe records is returned; with a
        subcollection letter, the recipe names listed for that subcollection.
        """
        if coll is None:
            # Was ``[self.recipes]``, i.e. the list wrapped in another list.
            return self.recipes
        return self.coll.get('collections').get(coll).get('recipes')

    def ingredients_list (self,coll=None):
        """Return the distinct ingredients, locale-sorted, of all recipes or of subcollection ``coll``."""
        if coll is None:
            return self.ingredients
        distinct = {igt for rcp in self._recipesOf([coll]) for igt in rcp.get('ingredients')}
        return sorted(distinct, key=locale.strxfrm)

    def catalog_list (self, select=None):
        """Look up the ingredients catalog.

        ``select=None`` returns all catalog keys, a string returns that entry
        (``None`` if missing), a list returns the entries in the same order.
        Any other type yields ``None``.
        """
        if select is None:
            return list(self.cat)
        if isinstance(select, str):
            return self.cat.get(select)
        if isinstance(select, list):
            return [self.cat.get(s) for s in select]
        return None

    def toGraph (self, coll=None):
        """Build the ingredient graph of one subcollection or of a pair of subcollections.

        ``coll`` is a single subcollection letter or a list of two letters.
        For a pair, nodes and edges additionally get a ``'sub'`` attribute:
        'A' / 'B' for elements found only in the first / second subcollection
        and 'AB' for shared ones. On an invalid selector a message is printed
        and ``None`` is returned.
        """
        if coll is None:
            print ('Specify subcollection to be transformed.')
            return None
        if not self._checkColl(coll, 'graph generation',
                               "The subcollection {} is not contained in this collection."):
            return None

        # single subcollection
        if isinstance(coll, str):
            return self._igtGraph(self._recipesOf([coll]), coll)

        # two subcollections
        B = self._igtGraph(self._recipesOf(coll), coll)
        Aingredients = {igt for rcp in self._recipesOf([coll[0]]) for igt in rcp.get('ingredients')}
        Bingredients = {igt for rcp in self._recipesOf([coll[1]]) for igt in rcp.get('ingredients')}
        ABingredients = Aingredients & Bingredients

        # NOTE(review): the labels are the literals 'A', 'B', 'AB', whereas
        # nodeSets/edgeSets compare 'sub' with the actual subcollection
        # letters; both agree only for subcollections named 'A' and 'B'.
        sub_dict = {igt: {'sub': 'A'} for igt in Aingredients - ABingredients}
        sub_dict.update({igt: {'sub': 'B'} for igt in Bingredients - ABingredients})
        sub_dict.update({igt: {'sub': 'AB'} for igt in ABingredients})
        nx.set_node_attributes(B, sub_dict)

        # An edge is 'AB' when both endpoints are shared, otherwise 'A' ('B')
        # when both endpoints belong to the first (second) subcollection.
        # Edges joining an A-only and a B-only ingredient stay unlabelled.
        e_attr = {}
        for u, v in B.edges():
            ends = {u, v}
            if ends <= ABingredients:
                e_attr[(u, v)] = {'sub': 'AB'}
            elif ends <= Aingredients:
                e_attr[(u, v)] = {'sub': 'A'}
            elif ends <= Bingredients:
                e_attr[(u, v)] = {'sub': 'B'}
        nx.set_edge_attributes(B, e_attr)
        return B

    def nodeSets(self,graph=None,coll=None):
        """Return the nodes of ``graph`` whose ``'sub'`` attribute matches ``coll``.

        For a single letter the attribute must equal that letter; for a list
        of two letters it must equal their concatenation. Nodes are returned
        in graph order. Prints a message and returns ``None`` when ``graph``
        or ``coll`` is missing or ``coll`` is invalid.
        """
        if graph is None:
            print ('Specify graph.')
            return None
        if coll is None:
            print ('Specify subcollection.')
            return None
        if not self._checkColl(coll, 'node set generation', "Subcollection {} does not exist."):
            return None
        label = coll if isinstance(coll, str) else f"{coll[0]}{coll[1]}"
        return [n for n, attr in graph.nodes(data=True) if attr.get('sub') == label]

    def edgeSets(self,graph=None,coll=None):
        """Partition the edges of ``graph`` by the ``'sub'`` attribute of their endpoints.

        For a single letter, returns ``{'A_e_pure': edges}``. For a list of
        two letters, returns a dict with the keys 'A_e_pure', 'B_e_pure',
        'A_e_mixed', 'B_e_mixed' and 'AB_e_intersect'. Edges are
        ``(u, v, data)`` tuples. Prints a message and returns ``None`` when
        ``graph`` or ``coll`` is missing or ``coll`` is invalid.
        """
        if graph is None:
            print ('Specify graph.')
            return None
        if coll is None:
            print ('Specify subcollection.')
            return None
        if not self._checkColl(coll, 'edge set generation', "Subcollection {} does not exist."):
            return None

        def nodes_with(label):
            # Sets give constant-time membership tests in the edge filters below.
            return {n for n, attr in graph.nodes(data=True) if attr.get('sub') == label}

        if isinstance(coll, str):
            n_A = nodes_with(coll)
            # NOTE(review): uses "or", so edges with only one endpoint in n_A
            # are included although the key says "pure" -- kept as found.
            A_e_pure = [e for e in graph.edges(data=True) if e[0] in n_A or e[1] in n_A]
            return {'A_e_pure' : A_e_pure}

        n_A  = nodes_with(coll[0])
        n_B  = nodes_with(coll[1])
        n_AB = nodes_with(f"{coll[0]}{coll[1]}")
        A_e_pure       = [e for e in graph.edges(data=True) if e[0] in n_A and e[1] in n_A]
        B_e_pure       = [e for e in graph.edges(data=True) if e[0] in n_B and e[1] in n_B]
        A_e_mixed      = [e for e in graph.edges(data=True) if (e[0] in n_A and e[1] in n_AB) or (e[1] in n_A and e[0] in n_AB)]
        B_e_mixed      = [e for e in graph.edges(data=True) if (e[0] in n_B and e[1] in n_AB) or (e[1] in n_B and e[0] in n_AB)]
        AB_e_intersect = [e for e in graph.edges(data=True) if e[0] in n_AB and e[1] in n_AB]
        return {'A_e_pure' : A_e_pure, 'B_e_pure' : B_e_pure, 'A_e_mixed' : A_e_mixed, 'B_e_mixed' : B_e_mixed, 'AB_e_intersect' : AB_e_intersect}

    def toDot(self,graph,path,fn):
        """Write ``graph`` as a Graphviz DOT file ``fn`` in directory ``path``.

        Edge pen width is the edge weight (edges with weight > 1 are drawn in
        red); node width grows with the logarithm of the node's ``'occ'``
        attribute. Node IDs, labels and classes are written as quoted strings
        so that names containing spaces or punctuation yield valid DOT.
        """
        q = self._dotQuote
        parts = ['graph {\ngraph[rankdir="LR", outputorder="edgesfirst"]\nnode[fontname="Arial", fontsize=120, shape=circle, style=filled, fixedsize=shape];\n']
        for u, v, att in graph.edges(data=True):
            weight = att.get('weight')
            colour = ', color=Red' if weight > 1 else ''
            parts.append(q(u) + ' -- ' + q(v) + ' [penwidth=' + str(weight) + colour + ']\n')
        for u, att in graph.nodes(data=True):
            width = 8 + 6*np.log(att.get('occ'))
            parts.append(q(u) + ' [width=' + str(width) + ', label=' + q(att.get('i-name')) + ', class=' + q(att.get('i-class')) + ']\n')
        parts.append('}')
        # newline='' keeps the '\n' line ends exactly as written on every platform.
        with open(os.path.join(path, fn), 'w', encoding='utf-8', newline='') as file:
            file.write(''.join(parts))
        return

    def toGephi(self,graph,fn):
        """Placeholder; not implemented."""
        pass

    def toCSV(self,graph,path,fn):
        """Write ``graph`` as ``<basename>_nodes.csv`` and ``<basename>_edges.csv`` in ``path``.

        ``basename`` is ``fn`` without its extension. Fields are written by
        the csv module, so values containing commas or quotes are quoted
        instead of corrupting the row; missing attributes appear as 'None'.
        """
        basename, _extension = os.path.splitext(fn)

        def write_rows(target, header, rows):
            # lineterminator='\n' together with newline='' keeps plain '\n' line ends.
            with open(os.path.join(path, target), 'w', encoding='utf-8', newline='') as file:
                writer = csv.writer(file, lineterminator='\n')
                writer.writerow(header)
                for row in rows:
                    writer.writerow([str(value) for value in row])

        write_rows(basename + '_nodes.csv',
                   ['n', 'i-name', 'i-class', 'occ', 'sub'],
                   ((n, attr.get('i-name'), attr.get('i-class'), attr.get('occ'), attr.get('sub'))
                    for n, attr in graph.nodes(data=True)))
        write_rows(basename + '_edges.csv',
                   ['n1', 'n2', 'id', 'weight', 'sub'],
                   ((n1, n2, attr.get('id'), attr.get('weight'), attr.get('sub'))
                    for n1, n2, attr in graph.edges(data=True)))
        return

#### Instantiate RecipeCollection class

In [3]:
# Load the HD/YO collection description together with the ingredients catalog.
HD_YO = RecipeCollection(
    coll_path='C:/Users/nlutt/Documents/Websites/graphLab/sampleSpaces/compareHD-YO/graphs/',
    coll_fn='collDescript.json',
    cat_path='C:/Users/nlutt/myPyPro/second/',
    cat_fn='igt_cat.json',
)
print(HD_YO)
print(HD_YO.infoSubcolls())
#print(HD_YO.recipes_list('A'))
#print(HD_YO.ingredients_list('A'))
#print(HD_YO.catalog_list(['aal','champignon']))

# Combined ingredient graph of subcollections A and B.
G = HD_YO.toGraph(['A','B'])
print('G', G)
print('G attributes: ', G.graph)

#xx = [(e1,e2) for (e1,e2,attr) in G.edges(data=True) if attr.get('sub') == 'A']
#A_n = nx.edge_subgraph(G,xx)
intersect = ['A','B']

# Node sets: exclusive to A, exclusive to B, shared by both.
A_n = HD_YO.nodeSets(G,'A')
B_n = HD_YO.nodeSets(G,'B')
AB_n = HD_YO.nodeSets(G,intersect)
print(f"nodeSet A: {len(A_n)} nodes")
print(f"nodeSet B: {len(B_n)} nodes")
print(f"nodeSet AB: {len(AB_n)} nodes")

# Edge sets, reported as counts in the order: A pure, A mixed, B pure, B mixed, intersection.
edge_sets = HD_YO.edgeSets(G,intersect)
print(*(len(edge_sets.get(key)) for key in ('A_e_pure', 'A_e_mixed', 'B_e_pure', 'B_e_mixed', 'AB_e_intersect')))

#HD_YO.toDot (G,'C:/Users/nlutt/Documents/Websites/graphLab/sampleSpaces/compareHD-YO/graphs/','xxx.dot')
#HD_YO.toCSV (G,'C:/Users/nlutt/Documents/Websites/graphLab/sampleSpaces/compareHD-YO/graphs/','xxx.csv')


Collection with 187 recipes in 2 subcollections with 281 distinct ingredients
supported by ingredients catalog with 717 entries

subcollection A -- name: HD-Gemüse, author: Henriette Davidis, 95 recipes
subcollection B -- name: YO-Gemüse, author: Yotam Ottolenghi, 92 recipes

G Graph with 281 nodes and 4711 edges
G attributes:  {'from_coll': ['A', 'B'], 'created_by': 'fruschtique RecipeCollection'}
nodeSet A: 59 nodes
nodeSet B: 181 nodes
nodeSet AB: 41 nodes
148 399 2476 1351 337
