In [None]:
!pip install -q datasets

In [2]:
#Not used
from datasets import load_dataset

#Imported directly from kaggle
import pandas as pd

df = pd.read_csv("/kaggle/input/recipes/dataset/full_dataset.csv")

In [None]:
df.head()

In [None]:
dim_dataset = len(df)

print(f"In the dataset there are {dim_dataset} recipe. \n")

**Analysis first column: Unnamed: 0**

In [None]:
fields = list(df.columns)
    
first_cols = list(df.columns[:1])  
duplicates = df[df.duplicated(subset=first_cols)]
  
print(f"Duplicates index: {duplicates}")

“Unnamed: 0” corresponds to the ‘id’ field, which uniquely identifies a prescription in the database

In [None]:
import ast 


for field in fields:
    sample_value = df[field].dropna().iloc[0]  
    
    if isinstance(sample_value, str):
        try:
            evaluated = ast.literal_eval(sample_value)
            sample_value = evaluated
        except (ValueError, SyntaxError):
            pass 

    print(f"\nField: {field}")
    print(f"Type: {type(sample_value).__name__}")
    
    # If it's a list or dict, show inner types
    if isinstance(sample_value, list):
        inner_types = set(type(x).__name__ for x in sample_value)
        print(f"Inner types in list: {inner_types}")
    elif isinstance(sample_value, dict):
        key_types = set(type(k).__name__ for k in sample_value.keys())
        value_types = set(type(v).__name__ for v in sample_value.values())
        print(f"Inner types in dict - keys: {key_types}, values: {value_types}")

**Analysis second column: title**

In [None]:
from collections import defaultdict

second_col = df.columns[1]  
upper_title = df[second_col].str.upper()

title_indices = defaultdict(list)

for i, title in enumerate(upper_title):
    title_indices[title].append(i)

duplicates_dict = {title: idxs for title, idxs in title_indices.items() if len(idxs) > 1}

l = []
for i, (title, idxs) in enumerate(duplicates_dict.items()):
    l.append(title)
    if i >= 4:  # Stop after 5 examples
        break

print(f"Example of duplicates: {l} \n")

single_recipe = {title: idxs for title, idxs in title_indices.items() if len(idxs) == 1}

count = 0
maxR = 1
maxItem = ""
for item in duplicates_dict:
    #print(f"Item: {item}, recipes: {len(duplicates_dict[item])}")
    count += len(duplicates_dict[item])
    if len(duplicates_dict[item]) > maxR:
        maxR = len(duplicates_dict[item])
        maxItem = item

print(f"Number of title: {len(title_indices)}. \n")

print(f"Number of title with one recipe: {len(single_recipe)}, with more than one: {len(title_indices) - len(single_recipe)}. \n")

print(f"MAX number for recipe: {maxR}, {maxItem}. \n")

print(f"AVG recipe for title: {count/len(duplicates_dict)} (counting only duplicates). \n")

print(f"AVG recipe for title: {count/len(title_indices)} . \n")


Now let's take some time to understand how recipes with the same name differ, focusing only on the ingredients 

In [None]:

ingredients = df.columns[6]

#Insert a duplicates recipe from the dataset
recipe = "CHICKEN CASSEROLE"

setIngr = set()
indexRecipe = duplicates_dict[recipe]

print(f"Example of index: {indexRecipe[:10]} \n")
print(f"Recipe ing: {df.iloc[63][ingredients]} \n")

ingredientsCounter = defaultdict(int) 

#Some statistics 
counter = 0
for idx in indexRecipe:
    counter += 1
    ingredientList_str = df.iloc[idx][ingredients].upper()
    ingredientList = ast.literal_eval(ingredientList_str)
    for ingredient in ingredientList:
        if ingredient in ingredientsCounter:
            ingredientsCounter[ingredient] += 1
        else:
            ingredientsCounter[ingredient] = 1

nPrint = 10
numRecipe = len(indexRecipe)
print(f"Percentage of ingredients in {recipe}: \n")
for ingr, count in ingredientsCounter.items():
    print(f"{ingr}:  {count/numRecipe:.2%}")
    nPrint -= 1
    if nPrint == 0:
        break



In [None]:
# Sort ingredientsCounter by count in descending order
sorted_ingredients = sorted(ingredientsCounter.items(), key=lambda x: x[1], reverse=True)

nPrint = 10
numRecipe = len(indexRecipe)
print(f"Percentage of ingredients in {recipe}: \n")
for ingr, count in sorted_ingredients:
    print(f"{ingr}: {count/numRecipe:.2%}")  
    nPrint -= 1
    if nPrint == 0:
        break

The recipe duplicated several times has only 60% more ingredient present. The main problem is also in the names, maybe the same ingredient with different names

In [None]:
del sorted_ingredients, ingredientsCounter, duplicates_dict, upper_title, first_cols

**Analysis second column: Directions**

In [None]:
directions_col = df['directions'].dropna().apply(ast.literal_eval)

In [None]:
step_counts = directions_col.apply(lambda x: len(x) if isinstance(x, list) else 0)

print("Analysis of 'directions' field:\n")
print(f"Total recipes with directions: {len(step_counts)}")
print(f"Minimum steps in a recipe: {step_counts.min()}")
print(f"Maximum steps in a recipe: {step_counts.max()}")
print(f"Average number of steps: {step_counts.mean():.2f}")
print(f"Median number of steps: {step_counts.median()}")

In [None]:
import numpy as np

all_instructions = [instruction for instructions in directions_col for instruction in instructions]

instruction_lengths = [len(instr) for instr in all_instructions]

print("\nAnalysis of single instructions:\n")
print(f"Total instructions: {len(instruction_lengths)}")
print(f"Shortest instruction length: {np.min(instruction_lengths)} characters")
print(f"Longest instruction length: {np.max(instruction_lengths)} characters")
print(f"Average instruction length: {np.mean(instruction_lengths):.2f} characters")
print(f"Median instruction length: {np.median(instruction_lengths):.2f} characters")

In [None]:
del directions_col, all_instructions, instruction_lengths

**Analysis second column: Link & source**

I leave out the recipes that come from the old dataset and dwell on those with a link

In [None]:
# Select columns
col4 = df.iloc[:, 4]  
col5 = df.iloc[:, 5]  

mask = col5 != "Gathered"
filtered_col5 = col5[mask]
print(filtered_col5)

In [None]:
mask = ~col5.str.contains("Recipes1M", na=False)  # "~" means NOT, na=False to avoid NaN problems
filtered_col4 = col4[mask]
filtered_col4_inv = col4[~mask]

print(f"Total entries in filtered column link: {len(filtered_col4)}\n")

def extract_base_link(link):
    if pd.isna(link):
        return None
    return link.split('/')[0]

base_links = filtered_col4.apply(extract_base_link)

base_link_counts = base_links.value_counts()

print("Top base links:\n")
print(base_link_counts.head(10))

print(f"\n\n Old dataset: {len(filtered_col4_inv)}")

**Analysis second column: NER**

In [None]:
from collections import Counter

ingredienti_unici = set()
ingredient_counter = Counter()

for ingr_str in df['NER']:
    try:
        lista_ingredienti = ast.literal_eval(ingr_str)
        for ingrediente in lista_ingredienti:
            ingrediente_pulito = ingrediente.lower().strip()
            ingredienti_unici.add(ingrediente_pulito)
            ingredient_counter[ingrediente_pulito] += 1
    except:
        continue

print(f"Total number of unique ingredients: {len(ingredienti_unici)}\n")

print("Top 10 ingredients by frequency:")
for ingrediente, count in ingredient_counter.most_common(10):
    print(f"{ingrediente}: {count} times")

**vocabulary analysis**

How big is the vocabulary of the collection? How big is the vocabulary of a document on average? 

While the analysis on ingredients has already been done, the vocabulary for recipe instructions has not. Let us therefore try to extract some useful information.

In [3]:
import nltk
import pandas as pd
import ast

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
print('English stopwords:')
print(stopwords.words('english'))

English stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'sh

In [5]:
#Flatten the list of instraction
directions_col = df['directions'].dropna().apply(ast.literal_eval)
all_instructions_list = [instruction for instructions in directions_col for instruction in instructions]

From here on I work on a subset because the full dataset is too heavy, subset extracted completely randomly

In [7]:
import re
import random
from nltk.corpus import stopwords
from nltk.probability import FreqDist

division = 1000

subset_size = len(all_instructions_list) // division  
subset_instructions = random.sample(all_instructions_list, subset_size)

full_text = ' '.join(subset_instructions)

full_text = full_text.lower()
full_text = re.sub(r'[^a-zA-Z0-9\s]', '', full_text)  # keep only letters, numbers, spaces

words = full_text.split()

words_nostopwords = [w for w in words if w not in stopwords.words('english')]

fdist = FreqDist(words_nostopwords)

print("Top 10 most common words:")
for word, freq in fdist.most_common(10):
    print(f"{word}: {freq}")


Top 10 most common words:
add: 2796
minutes: 2316
heat: 1597
mix: 1220
cook: 1164
mixture: 1114
stir: 1111
bowl: 1107
pan: 1072
salt: 1059


In [9]:
#Find emoticons, if any

emoticon_regex = '(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)'
emoticons_in_pos = re.findall(emoticon_regex,full_text)
counts = nltk.FreqDist(emoticons_in_pos)
print(f"Number of emoticons: {counts}")

Number of emoticons: <FreqDist with 0 samples and 0 outcomes>
