In [1]:
import json
import numpy as np
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import collections

In [2]:
# load train data
with open('data/train.json') as f:
    data = json.load(f)

In [3]:
# get id_cuisine_dict
id_cuisine_dict = dict()
for recipe in data:
    id_cuisine_dict[recipe['id']] = recipe['cuisine'].strip()

In [4]:
# loop through every train data
cuisine_list = list()
ingredient_list = list()
id_list = list()

for recipe in data:
    cuisine_list.append(recipe['cuisine'].strip())
    id_list.append(recipe['id'])
    for ingredient in recipe['ingredients']:
        ingredient_list.append(ingredient.strip().lower())
# print(cuisine_list[:10])
# print(ingredient_list[:10])

counts_cuis = collections.Counter(cuisine_list)
counts_ingr = collections.Counter(ingredient_list)

# print('Size Ingredients dataset (with repetition):  \t{}'.format((len(ingredient_list))))
print('Unique #Ingredient: \t{}'.format((len(counts_ingr.values()))))
print('Total #recipes: \t{}'.format(len(cuisine_list)))
print('Total #Cuisines: \t{}'.format((len(counts_cuis.values()))))

Unique #Ingredient: 	6703
Total #recipes: 	39774
Total #Cuisines: 	20


In [5]:
counts_cuis.most_common

<bound method Counter.most_common of Counter({'italian': 7838, 'mexican': 6438, 'southern_us': 4320, 'indian': 3003, 'chinese': 2673, 'french': 2646, 'cajun_creole': 1546, 'thai': 1539, 'japanese': 1423, 'greek': 1175, 'spanish': 989, 'korean': 830, 'vietnamese': 825, 'moroccan': 821, 'british': 804, 'filipino': 755, 'irish': 667, 'jamaican': 526, 'russian': 489, 'brazilian': 467})>

In [6]:
counts_ingr.most_common(10)

[('salt', 18049),
 ('onions', 7972),
 ('olive oil', 7972),
 ('water', 7457),
 ('garlic', 7380),
 ('sugar', 6434),
 ('garlic cloves', 6237),
 ('butter', 4848),
 ('ground black pepper', 4785),
 ('all-purpose flour', 4632)]

# Step 1: Vectorize rating matrix

In [7]:
corpus = list()
for recipe in data:
    corpus.append(recipe['ingredients'])

def dummy(text):
    return [i.lower() for i in text]

vectorizer = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
)  
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

# print(features)
# print(len(features))
rating_matrix = X.toarray()

In [8]:
df = pd.DataFrame(rating_matrix)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6693,6694,6695,6696,6697,6698,6699,6700,6701,6702
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
index_ingr_dict = dict(zip(list(range(len(features))), features))
index_id_dict = dict(zip(list(range(len(id_list))),id_list))
df_renamed = df.rename(columns=index_ingr_dict, index=index_id_dict)
df_renamed

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,...,yuzukosho,za'atar,zatarains creole seasoning,zatarain’s jambalaya mix,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
10259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Step 2: clean the data from irrelevant recipes and ingredients

In [10]:
# removed ingredients that appear three times or less
leq_three_feat_list = [feat for feat in features if df_renamed[feat].value_counts()[1] <= 3]
print('appear three times or less ingredients:\ncount = {}\n{}'.format(len(leq_three_feat_list),leq_three_feat_list))

appear three times or less ingredients:
count = 3029
['(10 oz.) frozen chopped spinach', '(10 oz.) frozen chopped spinach, thawed and squeezed dry', '(14 oz.) sweetened condensed milk', '(14.5 oz.) diced tomatoes', '(15 oz.) refried beans', '1% low-fat chocolate milk', '2 1/2 to 3 lb. chicken, cut into serving pieces', '2% low fat cheddar chees', '2% lowfat greek yogurt', '2% milk shredded mozzarella cheese', '25% less sodium chicken broth', '33% less sodium cooked deli ham', '33% less sodium cooked ham', '33% less sodium ham', '33% less sodium smoked fully cooked ham', '8 ounc ziti pasta, cook and drain', 'a taste of thai rice noodles', 'abalone', 'abbamele', 'absinthe', 'acai juice', 'accent', 'accompaniment', 'acini di pepe', 'adobo all purpose seasoning', 'adobo style seasoning', 'agar agar flakes', 'agave tequila', 'aged cheddar cheese', 'aged gouda', 'aged manchego cheese', 'ajinomoto', 'aka miso', 'alaskan king crab legs', 'alaskan king salmon', 'albacore', 'alcohol', 'aleppo', 

In [11]:
df_renamed_rev = df_renamed.drop(leq_three_feat_list, axis=1)
# update features list
features_rev = [feat for feat in features if feat not in leq_three_feat_list]
df_renamed_rev

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,1% low-fat buttermilk,1% low-fat cottage cheese,1% low-fat milk,2% low-fat cottage cheese,2% reduced-fat milk,40% less sodium taco seasoning,40% less sodium taco seasoning mix,7 up,...,yoghurt,yolk,yukon gold,yukon gold potatoes,yuzu,zest,zesty italian dressing,zinfandel,ziti,zucchini
10259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# removing special characters and numbers (e.g. ’14 oz’)
change_dict = dict()
for feat in features_rev:
    feat_rev = re.sub(r'(^\(\s*oz.\) |^\d*%+ )', '', feat)
    if feat_rev != feat:
        change_dict[feat] = feat_rev
change_dict

{'(    oz.) tomato sauce': 'tomato sauce',
 '(   oz.) tomato paste': 'tomato paste',
 '1% low-fat buttermilk': 'low-fat buttermilk',
 '1% low-fat cottage cheese': 'low-fat cottage cheese',
 '1% low-fat milk': 'low-fat milk',
 '2% low-fat cottage cheese': 'low-fat cottage cheese',
 '2% reduced-fat milk': 'reduced-fat milk',
 '40% less sodium taco seasoning': 'less sodium taco seasoning',
 '40% less sodium taco seasoning mix': 'less sodium taco seasoning mix',
 '95% lean ground beef': 'lean ground beef'}

In [13]:
features_rev_2 = [re.sub(r'(^\(\s*oz.\) |^\d*%+ )', '', feat) for feat in features_rev]
retD = list(set(features_rev_2).difference(set(features_rev)))     # in features_rev_2 but not in features_rev
retD
# only three 'new' ingredients, others are already existing

['less sodium taco seasoning',
 'reduced-fat milk',
 'less sodium taco seasoning mix']

In [14]:
for feat in change_dict.keys():
    if change_dict[feat] in retD:
        # rename them
        df_renamed_rev.rename(columns={feat: change_dict[feat]}, inplace = True)
    else:
        # combine them with existing columns
        df_renamed_rev[change_dict[feat]] = df_renamed_rev[feat] + df_renamed_rev[change_dict[feat]]
        # print(df_renamed_rev['tomato sauce'][6528])
        df_renamed_rev.drop(feat, axis=1, inplace=True)
df_renamed_rev

Unnamed: 0,reduced-fat milk,less sodium taco seasoning,less sodium taco seasoning mix,7 up,abura age,accent seasoning,achiote,achiote paste,achiote powder,ackee,...,yoghurt,yolk,yukon gold,yukon gold potatoes,yuzu,zest,zesty italian dressing,zinfandel,ziti,zucchini
10259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
for i in df_renamed_rev.columns:
    if i.startswith(' '):
        print('found')
        
# not find any ingr starting with space

# Step 3: Remove words that do not contain relevant information and only indicated different versions of the same thing

In [16]:
def longest_common_substring(s1, s2):
    m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    return s1[x_longest - longest: x_longest]

def longest_common_sentence(s1, s2):
    s1 = re.sub(r'[^A-Za-z0-9-]+', ' ', s1)
    s2 = re.sub(r'[^A-Za-z0-9-]+', ' ', s2)
    s1_words = s1.split()
    s2_words = s2.split()
    return ' '.join(longest_common_substring(s1_words, s2_words))

In [17]:
longest_common_sentence('coconut milk', 'coconut milk')

'coconut milk'

In [18]:
# words that do not contain relevant information and only indicated different versions of the same thing
#‘light brown sugar’ and ‘dark brown sugar’ should be combined into one ingredient ‘brown sugar’, 
# and words like ‘half’ or ‘oz’ should be removed from the name

# e.g.
# milk, low-fat milk, fat free milk, coconut milk, buttermilk, whole milk, evaporated skim milk, etc.
# longest common string should be 'milk'

# only keep those in more than `30 different ingredients` `or` in `more than 1,000 recipes`

LCS_feats_dict = collections.defaultdict(list)

num_feats = len(df_renamed_rev.columns)
i = 0
for i in range(i, num_feats):
    j = i # should include 'itself'
    while j < num_feats:
        LCS = longest_common_sentence(df_renamed_rev.columns[i], df_renamed_rev.columns[j])
        LCS_feats_dict[LCS].append(df_renamed_rev.columns[i])
        LCS_feats_dict[LCS].append(df_renamed_rev.columns[j])
        j += 1
        # can have repeat feats

# remove key ''
LCS_feats_dict.pop('')

# remove repeat feats
LCS_feats_dict = {LCS: list(set(feats_list)) for LCS, feats_list in LCS_feats_dict.items()}


In [19]:
LCS_feats_dict['sauce']

['oyster sauce',
 'alfredo sauce',
 'sauce',
 'bertolli® alfredo sauce',
 'raspberry sauce',
 'hot pepper sauce',
 'light soy sauce',
 'pasta sauce',
 'worcestershire sauce',
 'cholula hot sauce',
 'hot sauce',
 'kikkoman soy sauce',
 'low sodium soy sauce',
 'tamari soy sauce',
 'asian fish sauce',
 'fish sauce',
 'japanese soy sauce',
 'low-fat pasta sauce',
 'tartar sauce',
 'caramel sauce',
 'cocktail sauce',
 'peanut sauce',
 'chili sauce',
 'vietnamese fish sauce',
 'reduced-sodium tamari sauce',
 'light alfredo sauce',
 'sweet chili sauce',
 'lower sodium soy sauce',
 'soy sauce',
 'chili bean sauce',
 'prepared pasta sauce',
 'sweet bean sauce',
 'chile sauce',
 'chilegarlic sauce',
 'black bean garlic sauce',
 'tonkatsu sauce',
 'pickapeppa sauce',
 'garlic sauce',
 'vegan worcestershire sauce',
 'reduced sodium soy sauce',
 'horseradish sauce',
 'red chile sauce',
 'black bean sauce',
 'regular soy sauce',
 'yellow bean sauce',
 'hoisin sauce',
 'barbecue sauce',
 'cranberry 

In [20]:
# get the list of substrings more than `30 different ingredients`
LCS_geq30_list = [LCS for LCS in LCS_feats_dict.keys() if len(LCS_feats_dict[LCS]) >= 30]
len(LCS_geq30_list), LCS_geq30_list

(42,
 ['milk',
  'seasoning',
  'sodium',
  'mix',
  'paste',
  'powder',
  'sauce',
  'beans',
  'vinegar',
  'pepper',
  'flour',
  'cheese',
  'chile',
  'chili',
  'ground',
  'fat',
  'chicken',
  'juice',
  'rice',
  'fresh',
  'leaves',
  'oil',
  'corn',
  'cream',
  'pork',
  'dried',
  'beef',
  'broth',
  'tomatoes',
  'black',
  'garlic',
  'bread',
  'italian',
  'green',
  'sugar',
  'chopped',
  'coconut',
  'white',
  'whole',
  'red',
  'frozen',
  'sweet'])

In [21]:
# or `more than 1,000 recipes`

LCS_geq1000recipes_list = list()

for LCS, feats_list in LCS_feats_dict.items():
    count = 0
    for feat in feats_list:
        count += df_renamed_rev[feat].value_counts()[1]
    if count >= 1000:
        LCS_geq1000recipes_list.append(LCS)

len(LCS_geq1000recipes_list), LCS_geq1000recipes_list

(231,
 ['milk',
  'seasoning',
  'sodium',
  'paste',
  'powder',
  'dry',
  'sauce',
  'beans',
  'vinegar',
  'water',
  'pepper',
  'flour',
  'all-purpose flour',
  'butter',
  'extract',
  'cheese',
  'chile',
  'chili',
  'ground',
  'ground pepper',
  'fat',
  'sausage',
  'chicken',
  'seed',
  'seeds',
  'juice',
  'bacon',
  'rice',
  'starch',
  'basil',
  'fish',
  'fish sauce',
  'fresh',
  'avocado',
  'leaves',
  'oil',
  'carrots',
  'corn',
  'leaf',
  'lettuce',
  'mushrooms',
  'potatoes',
  'spinach',
  'cream',
  'baking',
  'baking powder',
  'soda',
  'spray',
  'peppers',
  'pork',
  'dried',
  'bay',
  'bay leaves',
  'eggs',
  'beef',
  'broth',
  'stock',
  'tomatoes',
  'bell pepper',
  'bell',
  'wine',
  'olive oil',
  'masala',
  'black',
  'garlic',
  'black beans',
  'cumin',
  'mustard',
  'olives',
  'black pepper',
  'rice vinegar',
  'salt',
  'sesame',
  'sesame seeds',
  'peas',
  'orange',
  'onions',
  'skinless',
  'skinless chicken',
  'chicke

In [22]:
LCS_feats_dict['steak']

['steak sauce',
 'top round steak',
 'sirloin steak',
 'boneless sirloin steak',
 'halibut steak',
 'steak seasoning',
 'beef steak',
 'top sirloin steak',
 'lean steak',
 'skirt steak',
 'flank steak',
 'steak',
 'ham steak',
 'hanger steak']

In [23]:
LCS_feats_dict['saffron']

['saffron threads', 'saffron']

In [24]:
count = 0
for feat in LCS_feats_dict['steak']:
    count += df_renamed_rev[feat].value_counts()[1]
print(count)

613


In [25]:
'steak' in LCS_geq1000recipes_list

False

In [26]:
'saffron' in LCS_geq1000recipes_list

False

In [27]:
count = 0
for feat in LCS_feats_dict['saffron']:
    count += df_renamed_rev[feat].value_counts()[1]
print(count)

483


In [28]:
'saffron threads' in LCS_geq1000recipes_list

False

In [29]:
count = 0
for feat in LCS_feats_dict['saffron threads']:
    count += df_renamed_rev[feat].value_counts()[1]
print(count)

277


In [30]:
# 差集

for feat in LCS_geq30_list:
    if feat not in LCS_geq1000recipes_list:
        print(feat)
        
# LCS_geq30_list is useless

mix


In [31]:
with open('./LCS.txt', 'w') as f:
    sorted_list = sorted(LCS_geq1000recipes_list)
    for LCS in sorted_list:
        f.write(LCS + '\n')
sorted_list

['all-purpose flour',
 'avocado',
 'bacon',
 'baking',
 'baking powder',
 'basil',
 'bay',
 'bay leaves',
 'beans',
 'beef',
 'bell',
 'bell pepper',
 'black',
 'black beans',
 'black pepper',
 'boneless',
 'boneless skinless chicken',
 'boneless skinless chicken breasts',
 'bread',
 'breast',
 'breasts',
 'broth',
 'brown',
 'brown sugar',
 'butter',
 'buttermilk',
 'cabbage',
 'canola',
 'canola oil',
 'carrots',
 'cayenne',
 'cayenne pepper',
 'celery',
 'cheddar',
 'cheddar cheese',
 'cheese',
 'chicken',
 'chicken breast',
 'chicken breasts',
 'chicken broth',
 'chicken stock',
 'chile',
 'chili',
 'chili powder',
 'chilies',
 'chopped',
 'chopped cilantro',
 'chopped cilantro fresh',
 'chopped fresh',
 'chopped onion',
 'cilantro',
 'cilantro leaves',
 'cinnamon',
 'cloves',
 'coconut',
 'coconut milk',
 'cooked',
 'cooking',
 'cooking spray',
 'coriander',
 'corn',
 'corn starch',
 'cream',
 'crushed',
 'crushed red pepper',
 'cucumber',
 'cumin',
 'curry',
 'dark',
 'diced',
 '

In [32]:
# trial area

key = 'sauce'

temp_dict = dict()
for feat in LCS_feats_dict[key]:
    count = df_renamed_rev[feat].value_counts()[1]
    temp_dict[feat] = count

temp_dict

{'oyster sauce': 455,
 'alfredo sauce': 48,
 'sauce': 458,
 'bertolli® alfredo sauce': 8,
 'raspberry sauce': 4,
 'hot pepper sauce': 306,
 'light soy sauce': 347,
 'pasta sauce': 266,
 'worcestershire sauce': 688,
 'cholula hot sauce': 4,
 'hot sauce': 674,
 'kikkoman soy sauce': 7,
 'low sodium soy sauce': 425,
 'tamari soy sauce': 90,
 'asian fish sauce': 158,
 'fish sauce': 1247,
 'japanese soy sauce': 29,
 'low-fat pasta sauce': 4,
 'tartar sauce': 20,
 'caramel sauce': 19,
 'cocktail sauce': 5,
 'peanut sauce': 20,
 'chili sauce': 147,
 'vietnamese fish sauce': 38,
 'reduced-sodium tamari sauce': 7,
 'light alfredo sauce': 9,
 'sweet chili sauce': 112,
 'lower sodium soy sauce': 57,
 'soy sauce': 3296,
 'chili bean sauce': 17,
 'prepared pasta sauce': 6,
 'sweet bean sauce': 4,
 'chile sauce': 39,
 'chilegarlic sauce': 13,
 'black bean garlic sauce': 8,
 'tonkatsu sauce': 14,
 'pickapeppa sauce': 4,
 'garlic sauce': 11,
 'vegan worcestershire sauce': 5,
 'reduced sodium soy sauce

In [33]:
# load valid LCS (人工筛选 LCS_geq30_list)
# Rules：
# 1. 删除无关词 如 black
# 2. 只保留确切的食材 -- 删除broth, sauce这类模糊的食材
# 3. 多个词组成的食材 具有更高优先级
# 4. etc. 

with open('./valid_LCS.txt', 'r') as f:
    valid_LCS_list = f.read().split('\n')
valid_LCS_list

['baking powder',
 'bay leaves',
 'brown sugar',
 'white sugar',
 'bell pepper',
 'black beans',
 'black pepper',
 'canola oil',
 'cayenne pepper',
 'chicken breast',
 'chicken breasts',
 'chicken broth',
 'chicken stock',
 'chili powder',
 'cilantro leaves',
 'coconut milk',
 'corn starch',
 'fish sauce',
 'garlic cloves',
 'garlic powder',
 'kosher salt',
 'lemon juice',
 'lime juice',
 'olive oil',
 'white wine',
 'sesame oil',
 'sesame seeds',
 'sour cream',
 'wine vinegar',
 'pepper flakes',
 'soy sauce',
 'red wine',
 'rice vinegar',
 'sea salt',
 'vanilla extract',
 'tomato paste',
 'vegetable oil',
 'avocado',
 'bacon',
 'basil',
 'beans',
 'beef',
 'bread',
 'butter',
 'buttermilk',
 'cabbage',
 'canola',
 'carrots',
 'cayenne',
 'celery',
 'cheddar',
 'cheese',
 'chicken',
 'chile',
 'chili',
 'chilies',
 'cilantro',
 'cinnamon',
 'cloves',
 'coconut',
 'coriander',
 'corn',
 'cream',
 'cucumber',
 'cumin',
 'curry',
 'egg',
 'eggs',
 'fish',
 'flour',
 'garlic',
 'ginger',
 

In [34]:
# 合并同义列

drop_list = [feat for LCS in valid_LCS_list for feat in LCS_feats_dict[LCS]]
drop_list = list(set(drop_list))  # the order makes no sense

# 从drop_list中去掉valid_LCS_list
drop_list = [feat for feat in drop_list if feat not in valid_LCS_list]

used = list()

for LCS in valid_LCS_list:
    df_renamed_rev['temp'] = 0
    for feat in LCS_feats_dict[LCS]:
        if feat not in used:
            df_renamed_rev['temp'] += df_renamed_rev[feat]
            used.append(feat)
            
    if LCS in df_renamed_rev.columns:
        df_renamed_rev[LCS] = df_renamed_rev['temp']
    else:
        df_renamed_rev.rename({'temp':LCS}, axis='columns', inplace=True)

# 最后删掉已合并的列
df_renamed_rev.drop(drop_list, axis=1, inplace=True)
df_renamed_rev.drop(['temp'], axis=1, inplace=True, errors='ignore')

df_renamed_rev

Unnamed: 0,less sodium taco seasoning,less sodium taco seasoning mix,7 up,abura age,accent seasoning,achiote,achiote paste,achiote powder,ackee,acorn squash,...,cloves,egg,mozzarella,onion,parmesan,peppers,sausage,tomato,yogurt,spray
10259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41882,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [35]:
df_renamed_rev['white wine']

10259    0
25693    0
20130    0
22213    0
13162    0
        ..
29109    0
11462    0
2238     0
41882    0
2362     0
Name: white wine, Length: 39774, dtype: int64

In [36]:
# leaf, leaves.... egg eggs 单复数合并 by stem
# paprika jalapeno 不同国家不同的辣椒 裂开

from stemming.porter2 import stem

stemmed_feats = [stem(feat) for feat in df_renamed_rev.columns]

feat_count_dict = collections.Counter(stemmed_feats)
sorted(feat_count_dict.items(), key=lambda d: d[1], reverse=True)


# ice-icing are different ingredients, thus removed

temp = ['chicken breast', 'chile', 'chili', 'clove', 'egg', 'onion', 'pepper', 'sausag', 'tomato']
for feat in df_renamed_rev.columns:
    if stem(feat) in temp:
        print(feat)

df_renamed_rev['sausage'] += df_renamed_rev['sausages']
df_renamed_rev['egg'] += df_renamed_rev['eggs']
df_renamed_rev['tomato'] += df_renamed_rev['tomatoes']
df_renamed_rev['chili'] += df_renamed_rev['chilies']
df_renamed_rev['pepper'] += df_renamed_rev['peppers']
df_renamed_rev['onion'] += df_renamed_rev['onions']
df_renamed_rev['chicken breast'] += df_renamed_rev['chicken breasts']
df_renamed_rev['chile'] += df_renamed_rev['chiles']
df_renamed_rev['clove'] += df_renamed_rev['cloves']

temp_2 = ['sausages', 'eggs', 'tomatoes', 'chilies', 'peppers', 'onions', 'chicken breasts', 'chiles', 'cloves']
df_renamed_rev.drop(temp_2, axis=1, inplace=True)

chicken breasts
chiles
chili
clove
eggs
onions
pepper
sausages
tomatoes
chicken breast
chile
chilies
cloves
egg
onion
peppers
sausage
tomato


In [37]:
# 判断是否有大于1的列 或者 全为0的列
df_renamed_rev.describe()

Unnamed: 0,less sodium taco seasoning,less sodium taco seasoning mix,7 up,abura age,accent seasoning,achiote,achiote paste,achiote powder,ackee,acorn squash,...,cheddar,chile,egg,mozzarella,onion,parmesan,sausage,tomato,yogurt,spray
count,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,...,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0
mean,0.000176,0.000126,0.000101,0.000101,0.000151,0.000201,0.000277,0.000101,0.000226,0.000453,...,0.045784,0.064489,0.226656,0.005808,0.482677,0.00264,0.032006,0.24692,0.027857,0.039146
std,0.013265,0.011211,0.010028,0.010028,0.012281,0.014181,0.016628,0.010028,0.015041,0.021269,...,0.211886,0.263312,0.465426,0.076319,0.57787,0.051313,0.178992,0.475535,0.165784,0.193945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,4.0,3.0,2.0,4.0,1.0,2.0,4.0,2.0,1.0


In [38]:
# 找出>1的“灵异”菜谱
df_renamed_rev[(df_renamed_rev.yogurt==2)].index

Int64Index([21624, 13654, 18832, 16632, 10527, 26759, 3765, 33271], dtype='int64')

In [39]:
# 将所有大于1的数字全部改成1
df_renamed_rev[df_renamed_rev>1] = 1
# 删除全为0的食材列
all_zero_list = [feat for feat in df_renamed_rev.columns if df_renamed_rev[feat].value_counts()[0] == df_renamed_rev.shape[0]]
df_renamed_rev.drop(all_zero_list, axis=1, inplace=True)

df_renamed_rev.describe()

Unnamed: 0,less sodium taco seasoning,less sodium taco seasoning mix,7 up,abura age,accent seasoning,achiote,achiote paste,achiote powder,ackee,acorn squash,...,cheddar,chile,egg,mozzarella,onion,parmesan,sausage,tomato,yogurt,spray
count,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,...,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0,39774.0
mean,0.000176,0.000126,0.000101,0.000101,0.000151,0.000201,0.000277,0.000101,0.000226,0.000453,...,0.04518,0.060165,0.206668,0.005783,0.442249,0.00264,0.031478,0.228038,0.027656,0.039146
std,0.013265,0.011211,0.010028,0.010028,0.012281,0.014181,0.016628,0.010028,0.015041,0.021269,...,0.207702,0.237795,0.40492,0.075825,0.49666,0.051313,0.174607,0.419573,0.163988,0.193945
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Step 4: Remove all ingredients appearing in 250 recipes or less and all recipes that contained two ingredients or less

In [40]:
# removed ingredients that appear 250 times or less
leq_250_feat_list = [feat for feat in df_renamed_rev.columns if df_renamed_rev[feat].value_counts()[1] <= 250]
print('appear 250 times or less ingredients:\ncount = {}\n{}'.format(len(leq_250_feat_list),leq_250_feat_list))
df_renamed_rev.drop(leq_250_feat_list, axis=1, inplace=True)

df_renamed_rev

appear 250 times or less ingredients:
count = 1906
['less sodium taco seasoning', 'less sodium taco seasoning mix', '7 up', 'abura age', 'accent seasoning', 'achiote', 'achiote paste', 'achiote powder', 'ackee', 'acorn squash', 'adobo', 'adobo sauce', 'adobo seasoning', 'agar', 'agave nectar', 'ahi', 'ahi tuna steaks', 'aioli', 'ajwain', 'ale', 'aleppo pepper', 'alfalfa sprouts', 'alfredo sauce', 'all potato purpos', 'all purpose seasoning', 'allspice', 'allspice berries', 'almond extract', 'almond liqueur', 'almond meal', 'almond paste', 'almonds', 'amaretti', 'amaretti cookies', 'amaretto', 'amaretto liqueur', 'amchur', 'ancho', 'ancho powder', 'anchovies', 'anchovy filets', 'anchovy fillets', 'anchovy paste', 'and fat free half half', 'angel food cake', 'angel hair', 'angostura bitters', 'anise', 'anise extract', 'anise seed', 'anjou pears', 'annatto seeds', 'apple brandy', 'apple cider', 'apple jelly', 'apple juice', 'apple juice concentrate', 'apple pie filling', 'apple pie spice'

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,bay leaves,beans,beansprouts,...,zucchini,chicken breast,cheddar,chile,egg,onion,sausage,tomato,yogurt,spray
10259,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
41882,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [41]:
# remove all removed all recipes that contained two ingredients or less
leq_2_recipe_list = [id for id in df_renamed_rev.index if df_renamed_rev.loc[id].value_counts()[0] >= (df_renamed_rev.shape[1]-2)]
df_renamed_rev.drop(leq_2_recipe_list, axis=0, inplace=True)

df_renamed_rev

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,bay leaves,beans,beansprouts,...,zucchini,chicken breast,cheddar,chile,egg,onion,sausage,tomato,yogurt,spray
10259,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
41882,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [42]:
cols = df_renamed_rev.columns

In [43]:
sorted(cols)

['active dry yeast',
 'avocado',
 'bacon',
 'baguette',
 'baking powder',
 'basil',
 'bay leaf',
 'bay leaves',
 'beans',
 'beansprouts',
 'beef',
 'bell pepper',
 'black beans',
 'black pepper',
 'black peppercorns',
 'bread',
 'brown sugar',
 'butter',
 'buttermilk',
 'cabbage',
 'cajun seasoning',
 'canola oil',
 'capers',
 'carrots',
 'cayenne',
 'cayenne pepper',
 'celery',
 'cheddar',
 'cheese',
 'chicken',
 'chicken breast',
 'chicken broth',
 'chicken stock',
 'chickpeas',
 'chile',
 'chili',
 'chili powder',
 'chinese five-spice powder',
 'chopped pecans',
 'cilantro',
 'cilantro leaves',
 'cinnamon',
 'clove',
 'coconut',
 'coconut milk',
 'coriander',
 'corn',
 'corn starch',
 'cornmeal',
 'cream',
 'creole seasoning',
 'crushed red pepper',
 'cucumber',
 'cumin',
 'curry',
 'dry sherry',
 'egg',
 'eggplant',
 'enchilada sauce',
 'fennel seeds',
 'fish',
 'fish sauce',
 'flank steak',
 'flour',
 'fresh rosemary',
 'freshly ground pepper',
 'garlic',
 'garlic cloves',
 'garli

In [44]:
# 人工筛选，合并

# 'bay leaf', 'bay leaves'

# 'black pepper' 'black peppercorns'

# 'chili', 'chile', 'cayenne', 'cayenne pepper', 'ground red pepper', 'paprika'

# 'cilantro', 'cilantro leaves'

# 'bell pepper', 'green pepper'

# 'pepper', 'ground pepper', 'freshly ground pepper'

# 'garlic', 'garlic cloves'

# 'tomato paste',  'ketchup'

df_renamed_rev['bay leaf'] += df_renamed_rev['bay leaves']
df_renamed_rev['black pepper'] += df_renamed_rev['black peppercorns']
df_renamed_rev['chili'] = df_renamed_rev['chili'] + df_renamed_rev['chile'] + df_renamed_rev['cayenne'] + df_renamed_rev['cayenne pepper'] + df_renamed_rev['ground red pepper'] + df_renamed_rev['paprika']
df_renamed_rev['cilantro'] += df_renamed_rev['cilantro leaves']
df_renamed_rev['bell pepper'] += df_renamed_rev['green pepper']
df_renamed_rev['pepper'] = df_renamed_rev['pepper'] + df_renamed_rev['ground pepper'] + df_renamed_rev['freshly ground pepper']
df_renamed_rev['garlic'] += df_renamed_rev['garlic cloves']
df_renamed_rev['tomato paste'] += df_renamed_rev['ketchup']

temp_3 = ['bay leaves', 'black peppercorns', 'chile', 'cayenne', 
          'cayenne pepper', 'ground red pepper', 'paprika', 'cilantro leaves', 
          'green pepper', 'ground pepper', 'freshly ground pepper', 'garlic cloves',
         'ketchup']
df_renamed_rev.drop(temp_3, axis=1, inplace=True)

In [45]:
# 将所有大于1的数字全部改成1
df_renamed_rev[df_renamed_rev>1] = 1
# 删除全为0的食材列
all_zero_list = [feat for feat in df_renamed_rev.columns if df_renamed_rev[feat].value_counts()[0] == df_renamed_rev.shape[0]]
df_renamed_rev.drop(all_zero_list, axis=1, inplace=True)

In [46]:
# 保存不带cuisine的处理数据
df_renamed_rev.to_csv("processed_data.csv")
df_renamed_rev

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,beans,beansprouts,beef,...,worcestershire sauce,zucchini,chicken breast,cheddar,egg,onion,sausage,tomato,yogurt,spray
10259,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
20130,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,0,0,0,0
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13162,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
41882,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [47]:
# check duplicated recipines
df_renamed_rev.duplicated()

10259    False
25693    False
20130    False
22213    False
13162    False
         ...  
29109    False
11462    False
2238     False
41882    False
2362     False
Length: 38472, dtype: bool

In [57]:
# 保存带cuisine的处理数据
cuisine_list = [id_cuisine_dict[id] for id in df_renamed_rev.index]
df_renamed_rev['cuisine'] = cuisine_list
df_renamed_rev.to_csv("processed_data_with_cuisine.csv")
df_renamed_rev

Unnamed: 0,active dry yeast,avocado,bacon,baguette,baking powder,basil,bay leaf,beans,beansprouts,beef,...,zucchini,chicken breast,cheddar,egg,onion,sausage,tomato,yogurt,spray,cuisine
10259,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,greek
25693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,southern_us
20130,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,0,0,filipino
22213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,indian
13162,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,indian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,irish
11462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,italian
2238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,irish
41882,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,chinese
