# DATASET LOADING

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np
import re

In [5]:
data = pd.read_csv("data_with_nutrients_final.csv")
data.head(20)["ingredients"]

Unnamed: 0,ingredients
0,"[""6 cups rice krispies"", ""2 cups reese's peanu..."
1,"[""4 boneless skinless chicken breasts"", ""6 spr..."
2,"[""1 (7 1/2 ounce) can refrigerated biscuits"", ..."
3,"[""2 pounds ground beef or turkey"", ""2 pounds r..."
4,"[""3 lbs beef round steak"", ""all-purpose flour""..."
5,"[""6 oz. evaporated milk"", ""1 3/4 sticks butter..."
6,"[""3 ripe avocados"", ""2 tbsp. lemon juice"", ""1/..."
7,"[""1 tsp. cornstarch"", ""2 tbs. low-sodium soy s..."
8,"[""1 small red apple"", ""1 tbsp water"", ""1 tsp g..."
9,"[""1 box duncan hines family size brownie mix"",..."


In [6]:
data.columns

Index(['ingredients', 'directions', 'NER', 'ingredients_count', 'NER_count',
       'directions_count', 'normalized_ingredients', 'normalized_directions',
       'calories', 'protein_g', 'carbs_g', 'fat_g', 'fiber_g', 'sugar_g'],
      dtype='object')

In [7]:
data.isnull().sum()

Unnamed: 0,0
ingredients,0
directions,0
NER,0
ingredients_count,0
NER_count,0
directions_count,0
normalized_ingredients,0
normalized_directions,0
calories,0
protein_g,0


In [8]:
nut = data[["calories", "protein_g", "carbs_g", "fat_g", "fiber_g", "sugar_g"]]
nut.head()

Unnamed: 0,calories,protein_g,carbs_g,fat_g,fiber_g,sugar_g
0,5950.0,85.0,750.0,300.0,25.0,480.0
1,2600.0,280.0,280.0,60.0,40.0,50.0
2,2800.0,25.0,300.0,180.0,15.0,200.0
3,5700.0,500.0,400.0,250.0,100.0,100.0
4,3500.0,400.0,80.0,150.0,10.0,30.0


In [9]:
nut.corr(method = "pearson")

Unnamed: 0,calories,protein_g,carbs_g,fat_g,fiber_g,sugar_g
calories,1.0,0.459056,0.75033,0.888036,0.390062,0.558214
protein_g,0.459056,1.0,-0.018033,0.415646,0.329729,-0.200284
carbs_g,0.75033,-0.018033,1.0,0.437934,0.369246,0.846396
fat_g,0.888036,0.415646,0.437934,1.0,0.246661,0.309742
fiber_g,0.390062,0.329729,0.369246,0.246661,1.0,0.111684
sugar_g,0.558214,-0.200284,0.846396,0.309742,0.111684,1.0


# Plain correlation with ingredients, didn't use the amounts for each recipe

In [10]:
ner = data["NER"].apply(ast.literal_eval)
ner.iloc[0]

['rice krispies',
 'miniatures',
 'marshmallows',
 '¼',
 'chocolate chips',
 'smooth peanut butter']

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
ing_onehot = pd.DataFrame(
    mlb.fit_transform(ner),
    columns=mlb.classes_,
    index=data.index
)
ing_numeric = ing_onehot

In [12]:
ingredient_list = ner.explode()
top_ingredients = ingredient_list.value_counts().head(50)
common = [col for col in ing_numeric.columns if col in top_ingredients.index]
subset = ing_numeric[common]

In [13]:
for nutrient_column in nut.columns:
    nutrient = nut[nutrient_column]
    ingredient_corr = subset.apply(lambda x: x.corr(nutrient))

    top_pos_corr = ingredient_corr.sort_values(ascending = False).head(10)
    top_neg_corr = ingredient_corr.sort_values(ascending = True).head(10)

    print(f"Top 10 positively correlated ingredients with {nutrient_column}:")
    print(top_pos_corr)
    print(f"\nTop 10 negatively correlated ingredients with {nutrient_column}:")
    print(top_neg_corr)
    print("\n")
    print("\n")

Top 10 positively correlated ingredients with calories:
eggs             0.288174
vanilla          0.258893
flour            0.255290
baking soda      0.211260
sugar            0.179823
butter           0.174787
baking powder    0.169766
nuts             0.148103
pecans           0.147960
brown sugar      0.139479
dtype: float64

Top 10 negatively correlated ingredients with calories:
olive oil     -0.086818
lemon juice   -0.065525
soy sauce     -0.060330
garlic        -0.054093
pepper        -0.038619
parsley       -0.035580
onion         -0.029904
honey         -0.029360
tomatoes      -0.029282
celery        -0.026236
dtype: float64




Top 10 positively correlated ingredients with protein_g:
chicken           0.263254
garlic            0.238212
onion             0.223420
ground beef       0.213313
onions            0.146273
tomatoes          0.145475
pepper            0.144902
cheddar cheese    0.133575
chicken broth     0.129971
oregano           0.126987
dtype: float64

Top 10 neg

# Correlation while also accounting the amount of each ingredient in each recipe New

In [14]:
import time
import pandas as pd
import numpy as np
import re
from fractions import Fraction

In [15]:
start_time = time.perf_counter()

df = pd.read_csv('data_with_nutrients_final.csv')
if 'recipe_id' not in df.columns:
    df['recipe_id'] = df.index

long = (
    df[['recipe_id','ingredients']]
      .assign(orig_ing=df['ingredients'].str.split(','))
      .explode('orig_ing')
)
long['orig_ing'] = long['orig_ing'].astype(str).str.strip()
long = long[long['orig_ing'] != '']

In [16]:
t_units = [
    'c','cup','cups','tbsp','tablespoon','tablespoons',
    'tsp','teaspoon','teaspoons','oz','ounce','ounces',
    'lb','pound','pounds','g','gram','grams',
    'kg','kilogram','kilograms','ml','milliliter','milliliters',
    'l','liter','liters','clove','cloves' , "lbs"
]
units_pattern = '|'.join(map(re.escape, t_units))
qty_pattern = r'(?P<qty>\d+(?:-\d+/\d+|\.\d+|/\d+)?)?'
unit_pattern = r'(?P<unit>' + units_pattern + r')?'
name_pattern = r'(?P<raw>.+)'
regex = r'^\s*' + qty_pattern + r'\s*' + unit_pattern + r'\s*' + name_pattern + r'$'

cleaned = long['orig_ing'].str.lower().str.replace(r'\(.*?\)', '', regex=True)
ex = cleaned.str.extract(regex)

In [17]:
def to_float(s):
    if pd.isna(s): return np.nan
    s = s.strip()
    if '-' in s and '/' in s:
        head, frac = s.split('-',1)
        try:
            return int(head) + float(Fraction(frac))
        except:
            return np.nan
    try:
        return float(Fraction(s))
    except:
        try:
            return float(s)
        except:
            return np.nan
ex['qty'] = ex['qty'].map(to_float)

In [18]:
STOP = set([
    'taste','chopped','sliced','minced','drained','melted','softened',
    'beaten','peeled','finely','thinly','fresh','extra','large','small','medium',
    'and','to' , "hopped" , "rinsed" , "seeded" , "diced"
])
t_forbidden = set(t_units)
forbidden = STOP.union(t_forbidden)

In [19]:
def clean_name(n):
    if pd.isna(n): return np.nan
    n = n.strip().lower()
    n = re.sub(r'[^\w\s]', '', n)
    n = re.sub(r'\b(?:' + units_pattern + r')\b', '', n)
    words = [w for w in n.split() if w not in forbidden and not w.isdigit()]
    if not words:
        return np.nan
    if words[-1].endswith('s') and len(words[-1]) > 1:
        words[-1] = words[-1][:-1]
    return ' '.join(words)

ex['name'] = ex['raw'].apply(clean_name)

In [20]:
long = long.join(ex)
long = long.dropna(subset=['name']).copy()
long['qty_filled'] = long['qty'].fillna(1)

top50 = long['name'].value_counts().nlargest(50).index
over50 = long[long['name'].isin(top50)]

X = over50.pivot_table(
    index='recipe_id',
    columns='name',
    values='qty_filled',
    aggfunc='sum',
    fill_value=0
)

nut = df.set_index('recipe_id')[[
    'calories','protein_g','carbs_g','fat_g','fiber_g','sugar_g'
]]


In [21]:
Xc = X - X.mean()
N = len(Xc)
print("Running correlations...")
for col in nut.columns:
    y = nut[col].reindex(Xc.index)
    yc = y - y.mean()
    ystd = y.std()
    cov = Xc.T.dot(yc) / (N - 1)
    xstd = X.std()
    corr = cov / (xstd * ystd)
    print(f"\n=== {col} ===\n")
    print("Top 10 positives:")
    print(corr.nlargest(10).to_string())
    print("\nTop 10 negatives:")
    print(corr.nsmallest(10).to_string())
    print('-'*40)

print(f"Total execution time: {time.perf_counter() - start_time:.2f} seconds")

Running correlations...

=== calories ===

Top 10 positives:
name
egg                0.295060
vanilla            0.230510
baking soda        0.218181
sugar              0.196043
flour              0.190797
baking powder      0.182541
butter             0.150827
vanilla extract    0.148967
pecan              0.144347
cinnamon           0.137589

Top 10 negatives:
name
olive oil     -0.034424
lemon juice   -0.033159
soy sauce     -0.031717
cilantro      -0.030025
red onion     -0.027317
tomatoe       -0.023178
green onion   -0.022179
salt pepper   -0.016715
garlic        -0.012999
parsley       -0.012435
----------------------------------------

=== protein_g ===

Top 10 positives:
name
garlic                  0.242248
ground beef             0.205431
onion                   0.174589
pepper                  0.156382
chili powder            0.136516
worcestershire sauce    0.124542
olive oil               0.121395
garlic powder           0.120436
salt pepper             0.116798
salt     