<a href="https://colab.research.google.com/github/mscholl96/mad-recime/blob/recipe1M-parser/data/recipe1M/parser/ingEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

# Ingredient Evaluation

In [None]:
# network graph
!pip install plotly
!pip install chart-studio



In [None]:
import pandas as pd
import numpy as np
import itertools
import re

# network graph
import plotly.graph_objects as go
import networkx as nx
from chart_studio import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go

import random
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive/ (the google.colab module is only available on Colab)
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Root folder of the Recipe1M data. The path is relative, so it is resolved against the
# current working directory (NOTE(review): presumably /content on Colab, which would match
# the Drive mount point - confirm).
FILE_DIR = 'drive/My Drive/_Master/09_KI_ML/TP2/Datasets/Recipe1M/'

# Get valid recipes from pickle

In [None]:
# Load the pickled recipe frame from the 20220126 folder below FILE_DIR.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
baseFrame = pd.read_pickle(FILE_DIR + '20220126/recipes_valid.pkl')
# Show the first rows only
baseFrame.head()

Unnamed: 0_level_0,title,ingredients,instructions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,Dilly Macaroni Salad Recipe,amount unit ingredient 0 1....,0 Cook macaroni according to package direct...
000035f7ed,Gazpacho,amount unit ingredient 0 8.0 ...,0 Add the tomatoes to a food processor with...
00003a70b1,Crunchy Onion Potato Bake,amount unit ingredient 0 1.0...,0 Preheat oven to 350 degrees Fah...
00004320bb,Cool 'n Easy Creamy Watermelon Pie,amount unit ingredient 0 1.0...,0 Dissolve Jello in boiling water. 1 ...
0000631d90,Easy Tropical Beef Skillet,amount unit ingredient 0...,"0 In a large skillet, toast the coconut ove..."


## Create ingredient Frame

In [None]:
#ingredient normalization TODO: add instructions here to have the full frame
def flatten_ing(row):
  """Stack the ingredient columns of one recipe into an (n, 3) array.

  Args:
    row: mapping-like object (e.g. a DataFrame) whose 'amount', 'unit' and
      'ingredient' entries each provide ``.tolist()`` and have equal length.

  Returns:
    numpy object array of shape (n, 3) with the columns amount, unit, ingredient.
    The object dtype keeps every value in its original Python type; stacking the
    raw lists directly would promote numeric amounts to strings such as '1.0'.
  """
  columns = [np.asarray(row[name].tolist(), dtype=object)
             for name in ('amount', 'unit', 'ingredient')]
  return np.column_stack(columns)

# One entry per ingredient line: flatten every recipe's ingredient table and
# explode it so the recipe id is repeated for each of its ingredients
ingredientRows = baseFrame['ingredients'].apply(flatten_ing).explode()
baseFrame = pd.DataFrame(
    ingredientRows.tolist(),
    index=ingredientRows.index,
    columns=['amount', 'unit', 'ingredient'],
)

# release the intermediate series
del ingredientRows

baseFrame

Unnamed: 0_level_0,amount,unit,ingredient
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000033e39b,1.0,cup,elbow macaroni
000033e39b,1.0,cup,american cheese
000033e39b,0.5,cup,celery
000033e39b,0.5,cup,green pepper
000033e39b,3.0,tablespoon,pimento
...,...,...,...
5aad55ecb6,4.0,cup,water
5aad55ecb6,1.0,,frozen cranberry
5aad55ecb6,1.0,cup,fresh mint leaf
5aad55ecb6,0.5,cup,sugar




## Recreate frame structure

In [None]:
# TBD

## Postprocessing
Both the original datasets and the parsed output contain ingredients that are not normalized, are parsed incorrectly, or are not easily understandable. A postprocessing step is therefore implemented to fix these.

This step shall cover:
* normalize **\"** to **' - inch'**
* normalize **[0-9] inch** to **[0-9] - inch**
* append **cream** to **half - and - half** to make the ingredient unambiguous
* fix an issue of the pattern library (**flour** is falsely singularized to **flmy**)

Things to be done in parsing:
* handle empty ingredient cells


In [None]:
# Scratch check for the inch normalization. The digit has to be captured and
# re-inserted via the back-reference \1; a '%d' in the replacement string is
# copied literally into the result.
teststr = '9" crust'
inchNormalized = re.sub(r'(\d)"', r'\1 - inch', teststr)
inchNormalized

In [None]:
def postproc_ings(row):
  """Normalize the 'ingredient' entry of one row.

  Applied fixes:
    * '"'                 -> ' - inch'            (9" crust -> 9 - inch crust)
    * '<digit> inch'      -> '<digit> - inch'
    * trailing 'half - and - half' -> 'half - and - half cream'
    * 'flmy'              -> 'flour'  (wrong singularization of 'flour')

  Args:
    row: mapping-like object providing the key 'ingredient'.

  Returns:
    The normalized ingredient string. Non-string values (e.g. empty cells)
    are returned unchanged instead of raising inside ``re.sub``.
  """
  ingredient = row['ingredient']
  if not isinstance(ingredient, str):
    return ingredient
  # inch mark to the spelled-out form
  ingredient = re.sub(r'"', ' - inch', ingredient)
  # '9 inch' to '9 - inch'; text that already reads '9 - inch' does not match
  ingredient = re.sub(r'(\d)\s+inch\b', r'\1 - inch', ingredient)
  # only a trailing 'half - and - half' is extended, so an existing
  # 'half - and - half cream' stays as it is
  ingredient = re.sub(r'half - and - half$', 'half - and - half cream', ingredient)
  # undo the wrong singularization of 'flour'
  ingredient = re.sub(r'\bflmy\b', 'flour', ingredient)
  return ingredient

# baseFrame['ingredient'] = baseFrame.apply(lambda row: postproc_ings(row), axis=1)
# baseFrame

# Ingredient evaluation


## Analysis of preparsed ingredients

In [None]:
# Occurrences per ingredient, most frequent first; the series and its index are named
# before the conversion so the frame has a 'count' column and an 'ingredient' index
(
    baseFrame['ingredient']
    .value_counts()
    .rename('count')
    .rename_axis('ingredient')
    .to_frame()
)

Unnamed: 0_level_0,count
ingredient,Unnamed: 1_level_1
salt,106669
butter,70662
egg,67046
sugar,62738
onion,54943
...,...
nut crunch topping,1
heineken lager beer,1
hershey candy corn kiss,1
vegan edam cheese,1


## Clustering of ingredients
https://towardsdatascience.com/clustering-product-names-with-python-part-1-f9418f8705c8

## Create per-recipe ingredient lists

In [None]:
# break down into lists: one row per recipe id, each column holding the list of
# unique values of that recipe
# https://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc/24112443
ingList = baseFrame.groupby(level=0).agg(lambda col: col.unique().tolist())
ingList

# Ingredient dependencies

In [None]:
def get_ingredient_combinations(ings):
  '''Create the sorted list of ingredient pairs of one recipe.

  Every unordered pair is emitted exactly once as a two-element list
  [ing1, ing2] with ing1 <= ing2. Pairs of an ingredient with itself are
  included, so a single-ingredient recipe still yields one entry.

  Args:
    ings: iterable of hashable, mutually comparable ingredient names.

  Returns:
    Lexicographically sorted list of [ing1, ing2] lists.
  '''
  # Sorting the unique names first makes combinations_with_replacement yield the
  # pairs already ordered, without building the full n x n product in which
  # every pair occurs twice.
  uniqueIngs = sorted(set(ings))
  return [list(pair) for pair in itertools.combinations_with_replacement(uniqueIngs, 2)]

# create flattened list with ingredient combination tuples
# (ingList itself is left untouched so that this cell can be re-run)
pairLists = ingList['ingredient'].apply(get_ingredient_combinations)
ingEdgeList = pd.DataFrame(pairLists.explode().dropna())

# split every pair into the two edge columns
ingEdgeList[['ing1', 'ing2']] = pd.DataFrame(
    ingEdgeList['ingredient'].tolist(), index=ingEdgeList.index, columns=['ing1', 'ing2'])

# clean up: drop combinations of an ingredient with itself and pairs repeated
# within the SAME recipe. The recipe id (index) is part of the duplicate key;
# dropping duplicates across all recipes would leave every pair exactly once
# and make any later per-pair count equal to 1.
repeatedInRecipe = (
    ingEdgeList[['ing1', 'ing2']]
    .assign(recipe=ingEdgeList.index.to_numpy())
    .duplicated()
    .to_numpy()
)
selfPair = (ingEdgeList['ing1'] == ingEdgeList['ing2']).to_numpy()
# plain boolean arrays avoid label alignment on the non-unique recipe index
ingEdgeList = ingEdgeList[~(repeatedInRecipe | selfPair)]

ingEdgeList

## Count Edges

In [None]:
# number of rows per (ing1, ing2) pair, returned as a flat frame with a 'count' column
pairSizes = ingEdgeList.groupby(['ing1', 'ing2']).size()
edgeZwerg = pairSizes.reset_index(name='count')
edgeZwerg

## Count ingredient appearances

In [None]:
# The recipe id lives in the index, not in a column named 'idx'; expose it under
# that name so it can be used as part of the duplicate key.
edgeEnds = ingEdgeList[['ing1', 'ing2']].rename_axis('idx').reset_index()

# An ingredient can sit on either end of an edge. Counting 'ing1' alone would
# miss an ingredient whenever it only appears as 'ing2' within a recipe.
nodeZwerg = pd.concat(
    [edgeEnds[['idx', 'ing1']],
     edgeEnds[['idx', 'ing2']].rename(columns={'ing2': 'ing1'})],
    ignore_index=True,
).drop_duplicates(['idx', 'ing1'])

# number of recipes (among those contributing at least one edge) per ingredient
nodeZwerg = nodeZwerg.groupby(['ing1']).size().to_frame('count').reset_index()
nodeZwerg

# Plot
## Static Graph
* https://stackoverflow.com/questions/21207872/construct-networkx-graph-from-pandas-dataframe
* https://networkx.org/documentation/networkx-1.10/reference/introduction.html
* https://networkx.org/documentation/stable/reference/drawing.html
* https://www.youtube.com/watch?v=9aZiwuQTo-4&ab_channel=AnalystRising


In [None]:

# Graph of the first 1000 edges only, to keep layout and drawing fast.
# Full graph: nxG = nx.from_pandas_edgelist(edgeZwerg, 'ing1', 'ing2')
nxG = nx.from_pandas_edgelist(ingEdgeList.head(1000), 'ing1', 'ing2')

plt.figure(1, figsize=(10,5))
# nx.draw_kamada_kawai() only draws and returns None, so the layout is computed
# explicitly to keep the node positions available in `pos`.
pos = nx.kamada_kawai_layout(nxG)
nx.draw(nxG, pos=pos, node_size=80, font_size=10, with_labels=True)

nxG.nodes()

## Plot interactive connection graph

* https://plotly.com/python/network-graphs/



In [None]:
# https://www.kaggle.com/anand0427/network-graph-with-at-t-data-using-plotly

# Compute the Kamada-Kawai layout and store each node's coordinates on the
# graph as the node attribute 'pos'
pos = nx.kamada_kawai_layout(nxG)
nx.set_node_attributes(nxG, pos, 'pos')

# Adding the edges to the plotly api
# The coordinates are collected in plain lists and handed to the trace once;
# extending the trace's x/y properties edge by edge copies the whole tuple on
# every step. A None entry ends each line segment.
edge_x = []
edge_y = []
for node_a, node_b in nxG.edges():
    x0, y0 = nxG.nodes[node_a]['pos']
    x1, y1 = nxG.nodes[node_b]['pos']
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

# Collect position, degree and hover text per node in one pass; adjacency()
# yields (node, neighbours) pairs.
node_x = []
node_y = []
node_degree = []
node_text = []
for node, neighbours in nxG.adjacency():
    x, y = nxG.nodes[node]['pos']
    node_x.append(x)
    node_y.append(y)
    # Coloring nodes by their number of connections
    node_degree.append(len(neighbours))
    node_text.append(f'{node} # of connections: {len(neighbours)}')

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    text=node_text,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=node_degree,
        size=15,
        colorbar=dict(
            thickness=10,
            # nested title dict replaces the 'titleside' attribute, which newer
            # plotly releases no longer accept
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
        line=dict(width=0)))


# Plotting the figure
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                # nested title dict replaces the 'titlefont' attribute, which newer
                # plotly releases no longer accept
                title=dict(text='<br>Ingredient Connections', font=dict(size=16)),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="No. of connections",
                    showarrow=False,
                    xref="paper", yref="paper") ],
                # hide grid, zero line and tick labels: the axes carry no meaning
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig)

# plotly.plot(fig)