## v2
#### The v2 version converts all uppercase letters to lowercase and also applies the "Porter Stemmer" to the data.
##### It outputs the result as the file ingredients_v2.csv

In [15]:
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer

In [17]:
# Load the raw recipe/ingredient table.
df = pd.read_csv('/Users/dobrien/Downloads/recipes.csv')

# Collect the ingredients of every recipe into one list per recipe ...
ingredients_per_recipe = df.groupby('recipe')['ingredient'].agg(list)
df = ingredients_per_recipe.reset_index(name='ingredient')

# ... then expand the lists again so that `ing` holds one row per
# (recipe, ingredient) pair. Ingredients are NOT unique in `ing`; the original
# run reported 655913 rows.
ing = df.explode('ingredient')
ing

Unnamed: 0,recipe,ingredient
0,1-2-3-cherry-poke-cake,cake
0,1-2-3-cherry-poke-cake,water
0,1-2-3-cherry-poke-cake,Whipped Topping
0,1-2-3-cherry-poke-cake,Gelatin
0,1-2-3-cherry-poke-cake,Chocolate
...,...,...
77038,zydeco-stomp-cajun-shrimp-alfredo,Parmesan cheese
77038,zydeco-stomp-cajun-shrimp-alfredo,shrimp
77038,zydeco-stomp-cajun-shrimp-alfredo,garlic
77038,zydeco-stomp-cajun-shrimp-alfredo,parsley


In [18]:
# Lower-case every ingredient name so that variants differing only in case
# collapse into one (the original run notes a drop from 6842 to 6271 names).
lowercased = ing['ingredient'].str.lower()
ing['ingredient'] = lowercased

# Next the Porter stemmer is applied, which reduces the number of unique
# ingredients further (6271 -> 5927 in the recorded run).
stemmer = PorterStemmer()

# Callable used on each element of the 'ingredient' column. Every cell value
# is handed to the stemmer as one string, so multi-word names are stemmed as
# a whole (e.g. "whipped topping" -> "whipped top").
stem_function = stemmer.stem

# Store the stemmed names next to the lower-cased ones.
ing['Stemmed'] = ing['ingredient'].apply(stem_function)

# Display the resulting DataFrame.
ing

Unnamed: 0,recipe,ingredient,Stemmed
0,1-2-3-cherry-poke-cake,cake,cake
0,1-2-3-cherry-poke-cake,water,water
0,1-2-3-cherry-poke-cake,whipped topping,whipped top
0,1-2-3-cherry-poke-cake,gelatin,gelatin
0,1-2-3-cherry-poke-cake,chocolate,chocol
...,...,...,...
77038,zydeco-stomp-cajun-shrimp-alfredo,parmesan cheese,parmesan chees
77038,zydeco-stomp-cajun-shrimp-alfredo,shrimp,shrimp
77038,zydeco-stomp-cajun-shrimp-alfredo,garlic,garlic
77038,zydeco-stomp-cajun-shrimp-alfredo,parsley,parsley


In [19]:
ing['Stemmed'].nunique() # number of unique stemmed ingredient strings (5927 in the recorded run, down from 6271 before stemming)

5927

In [20]:
# Count how often each stemmed ingredient occurs across all recipe rows.
# The result is two lists with aligned indexes:
#   uniques - the distinct stemmed ingredient names, in order of first appearance
#   num     - how many rows of `ing` use the ingredient at the same position
#
# A dict is used for the tally: lookups are O(1), so the whole table is
# processed in a single pass, and dicts preserve insertion order, which keeps
# `uniques` in first-seen order. The column is addressed by its name
# ('Stemmed') rather than by position so the count does not depend on the
# column layout of `ing`.
usage_counts = {}
for stemmed_name in ing['Stemmed']:
    usage_counts[stemmed_name] = usage_counts.get(stemmed_name, 0) + 1

uniques = list(usage_counts)
num = list(usage_counts.values())

# Display the number of distinct stemmed ingredients.
len(uniques)



5927

In [21]:
# Pair every stemmed ingredient with its usage count and turn the pairs into
# a DataFrame called dfzips.
usage_rows = list(zip(uniques, num))
dfzips = pd.DataFrame(usage_rows, columns=['Ingredient', 'Number of Usage'])

# Most frequently used ingredients first; renumber the rows after sorting.
dfzips = dfzips.sort_values('Number of Usage', ascending=False)
dfzips = dfzips.reset_index(drop=True)
dfzips

Unnamed: 0,Ingredient,Number of Usage
0,salt,31117
1,onion,23295
2,egg,21607
3,garlic,21343
4,butter,20551
...,...,...
5922,rice flak,1
5923,rice starch,1
5924,banana flour,1
5925,organic shorten,1


In [22]:
# Save the sorted usage table; index=True also writes the row index as the
# first column of the CSV.
dfzips.to_csv('ingredients_v2.csv',index=True)

In [77]:
# Load the v3 tables (paths are relative to the current working directory).
df_recipes = pd.read_csv('./recipes_v3.csv')
df_ingredients = pd.read_csv('./ingredients_v3.csv')

# Work on plain row arrays that are addressed by position below.
# NOTE(review): assumes recipe rows are (row id, recipe, ingredient) and
# ingredient rows are (row id, ingredient, usage count) - confirm against the
# v3 files.
recipe_list = df_recipes.to_numpy()
ingredient_list = df_ingredients.to_numpy()

# Names of the ingredients that occur 25 times or fewer (cutoff is inclusive).
less_than_25_occurences = [x[1] for x in ingredient_list if int(x[2]) <= 25]

# Keep only the ingredients that occur more than 25 times.
refined_ingredient_list = [x for x in ingredient_list if int(x[2]) > 25]

# Drop every recipe row whose ingredient is one of the rare ones. Only the
# matching rows are removed; the other rows of the same recipe are kept.
# A set makes each membership test O(1) instead of a scan over the list.
rare_ingredients = set(less_than_25_occurences)
refined_recipe_list = [x for x in recipe_list if x[2] not in rare_ingredients]



In [94]:
# Count, for every recipe, how many of its rows remain in refined_recipe_list
# (i.e. how many ingredients it still has). recipe[1] is the recipe name.
recipe_map = {}
for recipe in refined_recipe_list:
    recipe_name = recipe[1]
    # dict.get supplies 0 for a recipe seen for the first time, so no
    # exception handling is needed to initialise the counter.
    recipe_map[recipe_name] = recipe_map.get(recipe_name, 0) + 1

# Names of all recipes with fewer than 5 remaining ingredients.
recipes_with_less_than_5_ingredients = [x for x in recipe_map if recipe_map[x] < 5]

# Keep only the rows of recipes with at least 5 remaining ingredients.
# A set makes each membership test O(1) instead of a scan over the list.
sparse_recipes = set(recipes_with_less_than_5_ingredients)
final_refined_recipe_list = [x for x in refined_recipe_list if x[1] not in sparse_recipes]


In [95]:
# Column headers for the two output tables; the first column of each row
# array gets an empty header.
ingredient_columns = ['', 'Ingredient', 'Number of Usage']
recipe_columns = ['', 'Recipe', 'Ingredient']

# Turn the filtered row lists back into DataFrames.
df_out_ingredients = pd.DataFrame(refined_ingredient_list, columns=ingredient_columns)
df_out_recipes = pd.DataFrame(final_refined_recipe_list, columns=recipe_columns)

# Write the v4 files; index=True also writes the row index as a leading column.
df_out_recipes.to_csv('recipes_v4.csv', index=True)
df_out_ingredients.to_csv('ingredients_v4.csv', index=True)

In [99]:
# Display the recipe rows that remain after the rare ingredients were removed.
# Everything below is disabled scratch code that recounts ingredients per
# recipe and prints the counts of the recipes flagged as having fewer than 5.
refined_recipe_list
# recipe_map = {}
# for recipe in final_refined_recipe_list:
#     try:
#         if recipe_map[recipe[1]]:
#             recipe_map[recipe[1]] = recipe_map[recipe[1]] + 1
#     except:
#         recipe_map[recipe[1]] = 1

# # for recipe in recipe_map:
# #     print(recipe_map[recipe] < 5)


# for recipe in recipes_with_less_than_5_ingredients:
#     print(recipe_map[recipe])


[array([0, '1-2-3-cherry-poke-cake', 'cake'], dtype=object),
 array([0, '1-2-3-cherry-poke-cake', 'water'], dtype=object),
 array([0, '1-2-3-cherry-poke-cake', 'whipped top'], dtype=object),
 array([0, '1-2-3-cherry-poke-cake', 'gelatin'], dtype=object),
 array([0, '1-2-3-cherry-poke-cake', 'chocol'], dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'oat'], dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'water'], dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'honey'], dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'blueberri'], dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'protein powd'],
       dtype=object),
 array([1, '1-2-3-complete-breakfast-smoothie', 'yogurt'], dtype=object),
 array([2, '1-2-3-jambalaya', 'worcestershire sauc'], dtype=object),
 array([2, '1-2-3-jambalaya', 'olive oil'], dtype=object),
 array([2, '1-2-3-jambalaya', 'parsley'], dtype=object),
 array([2, '1-2-3-jambalaya', 'tomato'], d