In [4]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 1.4/1.4 MB 15.2 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.1


In [5]:
from pathlib import Path

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

### Preprocessing the data into a more suitable format

In [6]:
# Read the recipe table and keep only its first 10,000 rows.
data = pd.read_csv("dataset/recipe_dataset.csv").head(10000)
# Discard the first column of the loaded file.
data = data.drop(columns=data.columns[0])

data

Unnamed: 0,recipe_id,recipe_name,url,recipe_Ingredients,Instructions,cuisine,Prep Time in Minutes
0,1,Super Easy Slow Cooker Chicken Enchilada Meat,https://www.allrecipes.com/recipe/228367/super...,"2 cups chicken broth, 1 (14.5 ounce) can diced...","Place chicken broth, tomatoes, flour, chili po...",['Mexican Inspired'],480
1,2,Avocado Quick Bread,https://www.allrecipes.com/recipe/27341/avocad...,"2.6666667461395 cups all-purpose flour, 1.5 te...",Preheat the oven to 350 degrees F (175 degrees...,['American'],60
2,3,Quick and Easy Baked Fish Fillet,https://www.allrecipes.com/recipe/256348/quick...,"1 pound flounder fillets, 0.5 teaspoon salt, g...",Gather all ingredients. Preheat the oven to 40...,['American'],25
3,4,Roasted Kohlrabi,https://www.allrecipes.com/recipe/203975/roast...,"4 kohlrabi bulbs, peeled, 1 tablespoon olive o...",Preheat the oven to 450 degrees F (230 degrees...,['American'],20
4,5,Custard Buttercream,https://www.allrecipes.com/recipe/262379/custa...,"1 egg, 1 cup white sugar, 0.5 cup milk, 1.3333...","Beat egg with a fork in a small, heavy saucepa...",['German'],10
...,...,...,...,...,...,...,...
9995,9996,Earthquake Cookies,https://www.allrecipes.com/recipe/10904/earthq...,"1 (18.25 ounce) package devil's food cake mix,...","Mix cake mix, eggs, shortening, and water in a...",['American'],10
9996,9997,Pistachio-Crusted Salmon,https://www.allrecipes.com/recipe/269394/pista...,"0.25 cup crushed pistachios, 2 tablespoons pan...",Preheat the oven to 375 degrees F (190 degrees...,['American'],15
9997,9998,Classic Shepherd&#39;s Pie,https://www.allrecipes.com/recipe/217812/class...,"1 tablespoon vegetable oil, 1 onion, chopped, ...",Preheat the oven to 350 degrees F (180 degrees...,['Uk And Ireland'],35
9998,9999,One-Pot Spaghetti with Meat Sauce,https://www.allrecipes.com/recipe/269004/one-p...,"1 pound ground Italian sausage, 1 small white ...","Combine ground sausage, onions, and garlic in ...",['Italian'],30


In [7]:
# Parse the "<recipe_id> - <ingredient>" text file into one record per pair.
ingredients = []
with open('dataset/recipe_ingredient_pairs.txt', 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no
            # pair; skip them instead of failing on the unpacking below.
            continue
        # Split on the first occurrence of ' - ' to handle ingredients with hyphens
        recipe_id, separator, ingredient = line.partition(' - ')
        if not separator:
            # Still a ValueError, as before, but now it says which line is malformed.
            raise ValueError(
                f"Line {line_number} is not in '<recipe_id> - <ingredient>' format: {line!r}"
            )
        ingredients.append({'recipe_id': int(recipe_id), 'ingredient': ingredient})

ingredients_df = pd.DataFrame(ingredients)
ingredients_df

Unnamed: 0,recipe_id,ingredient
0,1,tomatoes
1,1,purpose flour
2,1,clove garlic
3,1,cayenne pepper
4,1,skinless
...,...,...
93390,12184,miniature marshmallows
93391,12184,milk
93392,12184,dash salt
93393,12184,chips


In [8]:
# Inner-join the ingredient pairs with the recipe table on recipe_id, then remove
# the listed recipe columns from the merged frame in the same chain.
columns_to_remove = ['url', 'recipe_Ingredients', 'Instructions', 'cuisine', 'Prep Time in Minutes']
df = ingredients_df.merge(data, how='inner', on='recipe_id').drop(columns=columns_to_remove)
df

Unnamed: 0,recipe_id,ingredient,recipe_name
0,1,tomatoes,Super Easy Slow Cooker Chicken Enchilada Meat
1,1,purpose flour,Super Easy Slow Cooker Chicken Enchilada Meat
2,1,clove garlic,Super Easy Slow Cooker Chicken Enchilada Meat
3,1,cayenne pepper,Super Easy Slow Cooker Chicken Enchilada Meat
4,1,skinless,Super Easy Slow Cooker Chicken Enchilada Meat
...,...,...,...
76842,10000,kidney beans,Slow Cooker Cowboy Beans
76843,10000,lima beans,Slow Cooker Cowboy Beans
76844,10000,onions,Slow Cooker Cowboy Beans
76845,10000,brown sugar,Slow Cooker Cowboy Beans


# 1a

In [9]:
# Gather each recipe's ingredients into one transaction (a list of ingredient names).
ingredients_per_recipe = df.groupby('recipe_id')['ingredient'].apply(list)
transactions = ingredients_per_recipe.tolist()

# One-hot encode the transactions: one row per recipe, one column per ingredient.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Run Apriori, keeping itemsets whose support is at least 0.5%.
min_support = 0.005
frequent_itemsets = apriori(df_encoded, use_colnames=True, min_support=min_support)

### All Itemsets 

In [10]:
# Record how many items each frequent itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

# Separate the itemsets into sizes 1, 2 and 3.
itemset_length = frequent_itemsets['length']
frequent_1_itemsets = frequent_itemsets.loc[itemset_length.eq(1)]
frequent_2_itemsets = frequent_itemsets.loc[itemset_length.eq(2)]
frequent_3_itemsets = frequent_itemsets.loc[itemset_length.eq(3)]

# Print every group under its own heading.
for heading, itemsets_of_size in (
    ("Frequent Itemsets of Size 1:", frequent_1_itemsets),
    ("\nFrequent Itemsets of Size 2:", frequent_2_itemsets),
    ("\nFrequent Itemsets of Size 3:", frequent_3_itemsets),
):
    print(heading)
    print(itemsets_of_size)

Frequent Itemsets of Size 1:
      support           itemsets  length
0    0.006221                (%)       1
1    0.005117  (Cajun seasoning)       1
2    0.063710   (Cheddar cheese)       1
3    0.020668    (Dijon mustard)       1
4    0.005318  (Italian parsley)       1
..        ...                ...     ...
233  0.007425  (yellow cake mix)       1
234  0.007725   (yellow mustard)       1
235  0.024481     (yellow onion)       1
236  0.012441         (zucchini)       1
237  0.014548                (®)       1

[238 rows x 3 columns]

Frequent Itemsets of Size 2:
      support                         itemsets  length
238  0.026889   (black pepper, Cheddar cheese)       2
239  0.006321  (cooking spray, Cheddar cheese)       2
240  0.006321          (Cheddar cheese, cream)       2
241  0.008327           (Cheddar cheese, eggs)       2
242  0.007124  (garlic powder, Cheddar cheese)       2
..        ...                              ...     ...
649  0.005819    (sour cream, unsalted b

### Saved to CSV

In [11]:
# Make sure the output folder exists before writing: DataFrame.to_csv does not
# create missing parent directories, so a fresh checkout would otherwise fail here.
output_dir = Path('Q1_outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per itemset size; the DataFrame index is not written.
frequent_1_itemsets.to_csv(output_dir / '1_size_itemsets.csv', index=False)
frequent_2_itemsets.to_csv(output_dir / '2_size_itemsets.csv', index=False)
frequent_3_itemsets.to_csv(output_dir / '3_size_itemsets.csv', index=False)

# 1b 

### Top 20 Itemsets

In [12]:
# Top 20 itemsets of size 1
top_1_itemsets = frequent_itemsets[frequent_itemsets['length'] == 1].nlargest(20, 'support')
print("Top 20 Itemsets of Size 1:")
print(top_1_itemsets)

# Top 20 itemsets of size 2
top_2_itemsets = frequent_itemsets[frequent_itemsets['length'] == 2].nlargest(20, 'support')
print("\nTop 20 Itemsets of Size 2:")
print(top_2_itemsets)

# Top 20 itemsets of size 3
top_3_itemsets = frequent_itemsets[frequent_itemsets['length'] == 3].nlargest(20, 'support')
print("\nTop 20 Itemsets of Size 3:")
print(top_3_itemsets)

Top 20 Itemsets of Size 1:
      support           itemsets  length
26   0.323066     (black pepper)       1
167  0.230461    (purpose flour)       1
180  0.184308             (salt)       1
35   0.121601      (brown sugar)       1
72   0.117287             (eggs)       1
151  0.105849            (onion)       1
51   0.091903           (cloves)       1
225  0.091201      (white sugar)       1
107  0.090298           (ground)       1
97   0.081971    (garlic powder)       1
165  0.081569           (powder)       1
123  0.081369      (kosher salt)       1
214  0.077054  (unsalted butter)       1
154  0.076753          (package)       1
127  0.072138       (large eggs)       1
8    0.070031  (Parmesan cheese)       1
2    0.063710   (Cheddar cheese)       1
69   0.059496              (egg)       1
114  0.059296             (inch)       1
213  0.056386         (tomatoes)       1

Top 20 Itemsets of Size 2:
      support                          itemsets  length
371  0.112371              (

### Saved to CSV

In [13]:
# Make sure the output folder exists before writing: DataFrame.to_csv does not
# create missing parent directories, so a fresh checkout would otherwise fail here.
output_dir = Path('Q1_outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per itemset size; the DataFrame index is not written.
top_1_itemsets.to_csv(output_dir / 'top20_1_itemsets.csv', index=False)
top_2_itemsets.to_csv(output_dir / 'top20_2_itemsets.csv', index=False)
top_3_itemsets.to_csv(output_dir / 'top20_3_itemsets.csv', index=False)

# 1c

**Support:** 
1. Support is a metric used in association rule mining to measure the frequency of occurrence of an itemset in a dataset.
2. It is defined as the proportion of transactions (in this case, recipes) that contain the item (ingredient) or itemset.

3. $\text{Support}(X) = \frac{A}{B}$  
A = Number of recipes containing every ingredient in the itemset X (for a single ingredient x, the number of recipes containing x)  
B = Total number of recipes

[Reference](https://codefinity.com/courses/v2/a7e17f02-2cc9-4b92-abe0-cc8710d7011e/d2fce24c-4b70-427a-815a-afeb7ae6e604/bf978c9d-0090-4671-a662-a7277eb34e7d)

**Confidence:** 
1. Confidence is a measure of the likelihood that an itemset will appear if another itemset appears.
2. It indicates the proportion of the transactions (recipes) containing item or itemset X that also contain item or itemset Y.  
3. $
\text{Confidence}(X \rightarrow Y) = \frac{\text{Support}(X \cup Y)}{\text{Support}(X)}
$  
X and Y are ingredients (items) or sets of ingredients (itemsets)

[Reference](https://www.geeksforgeeks.org/what-is-support-and-confidence-in-data-mining/#:~:text=Support%20is%20a%20measure%20of,the%20total%20number%20of%20transactions.)

**Briefly:**  
Support shows how common an item or itemset is in the dataset.  
Confidence of the rule X → Y shows how likely an item or itemset Y is to appear when another item or itemset X is already present in the transaction.