In [7]:
import pandas as pd
import numpy as np
import sqlalchemy as sq
import datetime

from plotly import graph_objs as go
import plotly.express as px
import plotly.offline as py
from plotly.subplots import make_subplots

import geopandas as gp
import datetime as dt
import ssl
from chord import Chord
import itertools

# Display settings: show up to 999 rows when printing frames, and render every
# float with exactly five decimal places.
pd.set_option('display.max_rows', 999)
pd.set_option('display.float_format', lambda value: '%.5f' % value)

%matplotlib inline

In [8]:
# Swap the ssl module's default HTTPS context factory for the unverified one, so
# HTTPS connections that rely on that default skip certificate verification.
# NOTE(review): this is a security risk. No HTTPS request is visible in the cells
# shown here -- confirm the override is still needed and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context

In [10]:
# Load the recipe table from the CSV (relative path) and report its (rows, columns).
df = pd.read_csv(filepath_or_buffer='recipes.csv')
print(df.shape)

(7486, 4)


In [11]:
# Peek at the first five rows of the freshly loaded table.
df.iloc[:5]

Unnamed: 0,recipe_name,flavors,rating,success
0,Big Apple Crumble Cupcakes,cinnamon cinnamon apple cinnamon vanilla,0.0,0.0
1,Bacon-Latticed Apple Pie,apple lemon cinnamon nutmeg,0.0,0.0
2,Lemon and Fig Cupcakes,vanilla lemon lemon fig vanilla,0.0,0.0
3,Toasted Coconut Chia Pudding,almond coconut,0.0,0.0
4,Coconut-Strawberry Ice Cream Pie,coconut graham coconut coconut strawberry stra...,4.0,0.1


In [12]:
# Count missing values per column.
df.isnull().sum()

recipe_name      0
flavors        251
rating           0
success          0
dtype: int64

In [13]:
# Drop every row that has at least one missing value (the defaults, spelled out).
df = df.dropna(axis=0, how='any')

In [14]:
def _unique_sorted_flavors(flavor_text):
    """Return the distinct whitespace-separated tokens of `flavor_text`, sorted alphabetically."""
    return sorted(set(flavor_text.split()))


# One de-duplicated, alphabetically ordered flavor list per recipe.
df['flavor_list'] = df['flavors'].map(_unique_sorted_flavors)

In [15]:
# First five rows again, to inspect the flavor_list column.
df.head()

Unnamed: 0,recipe_name,flavors,rating,success,flavor_list
0,Big Apple Crumble Cupcakes,cinnamon cinnamon apple cinnamon vanilla,0.0,0.0,"[apple, cinnamon, vanilla]"
1,Bacon-Latticed Apple Pie,apple lemon cinnamon nutmeg,0.0,0.0,"[apple, cinnamon, lemon, nutmeg]"
2,Lemon and Fig Cupcakes,vanilla lemon lemon fig vanilla,0.0,0.0,"[fig, lemon, vanilla]"
3,Toasted Coconut Chia Pudding,almond coconut,0.0,0.0,"[almond, coconut]"
4,Coconut-Strawberry Ice Cream Pie,coconut graham coconut coconut strawberry stra...,4.0,0.1,"[coconut, graham, lemon, strawberry]"


In [16]:
# Flatten every recipe's flavor list into one flat list of flavor names, keeping
# one entry per (recipe, flavor) occurrence and the original order.
# chain.from_iterable walks the lists in a single pass; the previous
# `flavors = flavors + flavor_list` re-copied the growing list on every iteration.
flavors = list(itertools.chain.from_iterable(df['flavor_list'].values))

In [17]:
# Wrap the flat flavor list in a Series so pandas can count and de-duplicate it.
flavor_series = pd.Series(data=flavors)

In [18]:
# Number of distinct flavors (missing values are not counted, which is the default).
flavor_series.nunique(dropna=True)

98

Look at top dessert flavors

Look at co-occurrence of dessert flavors

In [19]:
# Frequency table: one row per flavor with the number of recipes it appears in,
# most frequent first (value_counts orders by descending count).
flavor_cdf = (
    flavor_series
    .value_counts()
    .rename_axis('flavor')
    .reset_index(name='recipe_counts')
)

In [80]:
# The ten most frequent flavors.
flavor_cdf.iloc[:10]

Unnamed: 0,flavor,recipe_counts
0,vanilla,3689
1,chocolate,1812
2,lemon,1628
3,cinnamon,1436
4,orange,941
5,almond,887
6,cocoa,629
7,ginger,590
8,raspberry,512
9,apple,508


In [21]:
# (rows, columns) of the flavor frequency table; it has one row per distinct flavor.
flavor_cdf.shape

(98, 2)

In [52]:
# Bar chart of recipe counts for the 50 most frequent flavors.
top50_counts = flavor_cdf.head(50)
fig = go.Figure([go.Bar(x=top50_counts.flavor, y=top50_counts.recipe_counts)])
# Label the figure so it can be read on its own when the notebook is skimmed.
fig.update_layout(
    title='Top 50 dessert flavors by recipe count',
    xaxis_title='Flavor',
    yaxis_title='Number of recipes',
)
# `py` is already `plotly.offline`, so call iplot on it directly instead of
# reaching through its inner `offline` submodule.
py.iplot(fig)

Save the top 50 and top 20 flavors

In [23]:
# Names of the 50 and the 20 most frequent flavors (flavor_cdf is ordered by
# descending recipe count).
top50_flavors = flavor_cdf['flavor'].iloc[:50].tolist()
top20_flavors = flavor_cdf['flavor'].iloc[:20].tolist()

In [59]:
# Recompute the top-20 flavor names from the frequency table and display them.
top20_flavors = [flavor_name for flavor_name in flavor_cdf.head(20).flavor.values]
top20_flavors

['vanilla',
 'chocolate',
 'lemon',
 'cinnamon',
 'orange',
 'almond',
 'cocoa',
 'ginger',
 'raspberry',
 'apple',
 'strawberry',
 'honey',
 'nutmeg',
 'walnut',
 'pecan',
 'coconut',
 'clove',
 'rum',
 'lime',
 'cherry']

## Chord Diagram

Take the original dataframe and calculate the co-occurrence matrix

In [24]:
# Keep only the recipe name and its de-duplicated flavor list for the pair analysis.
adf = df.loc[:, ['recipe_name', 'flavor_list']]
adf.head(5)

Unnamed: 0,recipe_name,flavor_list
0,Big Apple Crumble Cupcakes,"[apple, cinnamon, vanilla]"
1,Bacon-Latticed Apple Pie,"[apple, cinnamon, lemon, nutmeg]"
2,Lemon and Fig Cupcakes,"[fig, lemon, vanilla]"
3,Toasted Coconut Chia Pudding,"[almond, coconut]"
4,Coconut-Strawberry Ice Cream Pie,"[coconut, graham, lemon, strawberry]"


Let's focus on the top flavors (the top 50 and top 20 lists built above)

In [60]:
# Series holding one flavor list per recipe.
f_list = adf['flavor_list']

In [73]:
# Flavor list of the first recipe (positional index 0).
f_list.iloc[0]

['apple', 'cinnamon', 'vanilla']

In [75]:
# The per-recipe flavor lists as a NumPy object array.
adf['flavor_list'].values

array([list(['apple', 'cinnamon', 'vanilla']),
       list(['apple', 'cinnamon', 'lemon', 'nutmeg']),
       list(['fig', 'lemon', 'vanilla']), ...,
       list(['blackberry', 'blueberry', 'lemon', 'nutmeg', 'oat', 'orange', 'raspberry']),
       list(['almond', 'apricot', 'lavender', 'vanilla']),
       list(['almond', 'graham', 'vanilla'])], dtype=object)

In [27]:
# Example: every unordered flavor pair within the FIRST recipe.
# The first recipe's list is selected explicitly. `f_list` is bound to the whole
# Series of recipes, so passing it to combinations() would pair entire recipes
# with each other (a huge result) instead of pairing flavors.
list(itertools.combinations(adf['flavor_list'].iloc[0], 2))

[('apple', 'cinnamon'), ('apple', 'vanilla'), ('cinnamon', 'vanilla')]

In [28]:
# Every unordered flavor pair from every recipe, as a flat list of 2-tuples, in
# recipe order.
# Built in a single pass (repeated list concatenation re-copies the growing list
# each time), with a dedicated loop variable so the `f_list` Series defined
# earlier is not overwritten.
flavor_pairs_list = [
    pair
    for recipe_flavors in adf.flavor_list.values
    for pair in itertools.combinations(recipe_flavors, 2)
]

In [29]:
# Two-column frame of flavor pairs: one row per pair occurrence.
pair_columns = ['flavor1', 'flavor2']
flavor_pdf = pd.DataFrame(data=flavor_pairs_list, columns=pair_columns)
print(flavor_pdf.shape)

(35627, 2)


In [77]:
# First ten flavor pairs.
flavor_pdf.iloc[:10]

Unnamed: 0,flavor1,flavor2
0,apple,cinnamon
1,apple,vanilla
2,cinnamon,vanilla
3,apple,cinnamon
4,apple,lemon
5,apple,nutmeg
6,cinnamon,lemon
7,cinnamon,nutmeg
8,lemon,nutmeg
9,fig,lemon


In [31]:
def _pairs_within(pairs, allowed_flavors):
    """Return the rows of `pairs` whose flavor1 AND flavor2 are both in `allowed_flavors`."""
    both_allowed = pairs.flavor1.isin(allowed_flavors) & pairs.flavor2.isin(allowed_flavors)
    return pairs[both_allowed]


# Restrict the pairs to those made up solely of top-50 / top-20 flavors.
flavor_t50_pdf = _pairs_within(flavor_pdf, top50_flavors)
flavor_t20_pdf = _pairs_within(flavor_pdf, top20_flavors)
print(flavor_t20_pdf.shape)

(17021, 2)


In [79]:
# First ten pairs that survive the top-20 filter (original row labels are kept).
flavor_t20_pdf.iloc[:10]

Unnamed: 0,flavor1,flavor2
0,apple,cinnamon
1,apple,vanilla
2,cinnamon,vanilla
3,apple,cinnamon
4,apple,lemon
5,apple,nutmeg
6,cinnamon,lemon
7,cinnamon,nutmeg
8,lemon,nutmeg
11,lemon,vanilla


In [83]:
# Emit each top-20 pair in both directions, (a, b) followed by (b, a), so the
# co-occurrence counts built from `data` come out symmetric.
data = []
for pair in flavor_t20_pdf.values:
    data.extend((pair, pair[::-1]))

In [34]:
# Count how often each (row flavor, column flavor) combination occurs; combinations
# that never occur are filled with 0. The result is converted to nested lists.
pair_frame = pd.DataFrame(data)
cooccurrence = pair_frame.pivot_table(index=0, columns=1, aggfunc="size", fill_value=0)
matrix = cooccurrence.values.tolist()

In [35]:
# Wrap the nested-list matrix in a frame for inspection.
mdf = pd.DataFrame(data=matrix)

In [36]:
# Show up to the first 20 rows of the co-occurrence matrix.
mdf.iloc[:20]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,41,91,235,166,33,58,56,50,97,192,20,43,135,21,72,26,45,435,37
1,41,0,21,10,263,53,9,9,56,33,182,3,73,50,31,10,24,5,242,56
2,91,21,0,91,69,17,24,10,16,21,70,6,22,47,21,9,8,3,169,19
3,235,10,91,0,177,30,438,104,60,64,101,20,46,162,131,111,71,68,1021,125
4,166,263,69,177,0,287,56,40,335,122,360,11,326,236,132,36,75,35,761,171
5,33,53,17,30,287,0,17,5,150,40,89,4,108,80,23,5,20,10,142,39
6,58,9,24,438,56,17,0,29,21,23,26,1,8,56,39,30,22,15,412,39
7,56,9,10,104,40,5,29,0,29,19,52,49,16,34,24,14,53,18,224,8
8,50,56,16,60,335,150,21,29,0,40,129,29,125,85,45,21,28,14,253,33
9,97,33,21,64,122,40,23,19,40,0,122,15,38,117,30,20,9,30,182,56


In [37]:
# Column totals of the co-occurrence matrix (sum down the rows).
mdf.sum(axis=0)

0     1853
1     1171
2      734
3     3065
4     3658
5     1152
6     1323
7      793
8     1519
9     1078
10    2886
11     359
12    1439
13    1958
14     983
15     893
16     726
17     878
18    6476
19    1098
dtype: int64

In [38]:
# Sorted, de-duplicated flavor names appearing in `data`.
# NOTE(review): presumably this order matches the row/column order of `matrix`
# (both come from the same values, sorted) -- confirm before relabeling.
names = np.unique(data).tolist()
name_df = pd.DataFrame({'flavor_name': names})
name_df

Unnamed: 0,flavor_name
0,almond
1,apple
2,cherry
3,chocolate
4,cinnamon
5,clove
6,cocoa
7,coconut
8,ginger
9,honey


In [39]:
# Hex color assigned to each flavor name; used to color the chord diagram.
color_dic = dict(
    almond="#b06e31",
    apple="#cf350e",
    cherry="#8f0b0b",
    chocolate="#2B1700",
    cinnamon="#622A0F",
    clove="#B5651D",
    cocoa="#3B270C",
    coconut="#EE8130",
    ginger="#db6d09",
    honey="#ebb028",
    lemon="#e8ce25",
    lime="#5ce825",
    nutmeg="#7e4a3b",
    orange="#e88325",
    pecan="#48260D",
    raspberry="#db1f1f",
    rum="#D7C5A9",
    strawberry="#FC5A8D",
    vanilla="#F9E5BC",
    walnut="#43270F",
)

In [40]:
# Attach each flavor's hex color; a flavor missing from color_dic gets NaN.
name_df = name_df.assign(color=name_df['flavor_name'].map(color_dic))

In [41]:
# Display the flavor-name table, which now also carries the color column.
name_df

Unnamed: 0,flavor_name,color
0,almond,#b06e31
1,apple,#cf350e
2,cherry,#8f0b0b
3,chocolate,#2B1700
4,cinnamon,#622A0F
5,clove,#B5651D
6,cocoa,#3B270C
7,coconut,#EE8130
8,ginger,#db6d09
9,honey,#ebb028


In [42]:
# Colors as a plain list, in the same order as the rows of name_df.
colors = name_df['color'].tolist()

In [43]:
# Build the chord diagram from the co-occurrence matrix, flavor names and colors,
# then render it.
chord_diagram = Chord(matrix, names, colors=colors, wrap_labels=False)
chord_diagram.show()

In [44]:
# Build the same chord diagram again and export it through to_html().
# NOTE(review): no output path is passed, so the destination is whatever
# to_html() uses by default -- confirm where the file ends up.
chord_export = Chord(matrix, names, colors=colors, wrap_labels=False)
chord_export.to_html()