In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud

# Lift pandas' display truncation: with both limits set to None, frames and
# Series are rendered with every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
recipes_path = "../data/full_dataset.csv"

# Load only the three columns this notebook uses, as Arrow-backed strings.
# NOTE: pandas resolves `index_col` against the `usecols` subset, so
# position 0 here makes `title` the index of the resulting frame.
recipes_data = pd.read_csv(
    recipes_path,
    index_col=0,
    usecols=["title", "link", "NER"],
    dtype={
        "title": "string[pyarrow]",
        "link": "string[pyarrow]",
        "NER": "string[pyarrow]",
    },
)

# Later cells count and filter on a `website` column that the load above does
# not provide, so derive it here as the host part of each link: an optional
# http(s):// scheme is skipped and everything up to the first "/" is kept.
# Assumes link values start with the host (e.g. "www.allrecipes.com/...")
# -- TODO confirm against the raw data.
recipes_data["website"] = recipes_data["link"].str.extract(
    r"^(?:https?://)?([^/]+)", expand=False
)

print(f" data shape: {recipes_data.shape}")
print(recipes_data.memory_usage(deep=True))

# Show one random row as a quick sanity check of the load.
recipes_data.sample()

In [None]:
# Number of recipes per source website (value_counts lists the most frequent first).
recipes_data["website"].value_counts()

I inspected several of these websites to see what extra information about each recipe they expose:

`www.allrecipes.com` has ~60K recipes, and from its pages I could obtain additional features related to nutrition facts.

In [None]:
# Keep only the allrecipes.com rows. The copy makes `filtered_data` an
# independent frame, so the column edits below do not act on a slice of
# `recipes_data`.
filtered_data = recipes_data.query("website == 'www.allrecipes.com'").copy()

# Each NER cell holds a list written out as text; parse it back into a real
# Python list. ast.literal_eval accepts only literals, so unlike the eval()
# it replaces, a crafted cell in the CSV cannot execute arbitrary code.
filtered_data["NER"] = filtered_data["NER"].apply(ast.literal_eval)

filtered_data.shape

In [None]:
# Flatten the per-recipe NER lists into one long Series: one row per list item.
all_ingredients = filtered_data["NER"].explode()

# Tally how often each item occurs, as a plain {ingredient: count} dict.
ingredients_frequency = (
    all_ingredients
    .value_counts()
    .to_dict()
)

In [None]:
# Render a 1000x500 word cloud from the ingredient -> count mapping.
cloud_builder = WordCloud(width=1000, height=500)
wordcloud = cloud_builder.generate_from_frequencies(ingredients_frequency)

# Show the cloud as an image on a single axes-free figure.
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
# Add three placeholder columns, all NaN at this point.
# NOTE(review): presumably these are filled in by a later step -- confirm.
filtered_data["duration"] = np.nan
filtered_data["Nutrition_facts_unstructured"] = np.nan
filtered_data["servings"] = np.nan


# read_csv(..., index_col=0, usecols=[...]) made `title` the frame's index,
# so writing with index=False would silently drop every recipe title from
# the export. Write the index whenever it is a named column; an unnamed
# default index is still left out, as before.
filtered_data.to_csv(
    "all_recipes_data.csv",
    index=filtered_data.index.name is not None,
)