In [1]:
import pandas as pd
from collections import Counter
import csv


2.A. Find the number of unique ingredients. List them with their frequencies.

In [13]:
import pandas as pd
import csv
from collections import Counter

def analyze_ingredients(input_filename, output_filename):
    """Count how often each ingredient string occurs and save the table as CSV.

    Reads ``input_filename`` as a CSV file, tallies every non-empty value of
    its 'parsed_Ingredients' column, prints the number of distinct values and
    the resulting table, and writes the table to ``output_filename`` with the
    columns 'Ingredient Name' and 'Frequency' (most frequent first; ties keep
    the order in which the ingredients first appear in the input).

    Values are compared exactly as they appear in the file: no trimming or
    case folding is applied.

    Args:
        input_filename: Path of the CSV file to read (opened as UTF-8).
        output_filename: Path of the CSV file to write.

    Raises:
        ValueError: If the input file has no 'parsed_Ingredients' column.
    """
    print(f"[INFO] Reading input CSV file: {input_filename}")

    with open(input_filename, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)

        # Fail loudly on a wrong or empty file instead of silently producing an
        # empty frequency table; mirrors the column checks of the plotting
        # functions in this notebook. `fieldnames` is None for an empty file.
        if 'parsed_Ingredients' not in (reader.fieldnames or []):
            raise ValueError("CSV file must contain a 'parsed_Ingredients' column")

        # Empty cells (and cells missing from short rows, which DictReader
        # reports as None) are skipped.
        ingredient_counts = Counter(
            ingredient
            for ingredient in (row.get('parsed_Ingredients') for row in reader)
            if ingredient
        )

    # Code to find the count of unique ingredients
    print("Count of unique ingredients: " + str(len(ingredient_counts)))

    # most_common() sorts by descending count with a stable sort, so equally
    # frequent ingredients stay in first-seen order.
    ingredient_freqs = ingredient_counts.most_common()

    # Store the list in a DataFrame
    df = pd.DataFrame(ingredient_freqs, columns=['Ingredient Name', 'Frequency'])
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv(output_filename, index=False)
    print(f"[INFO] Data saved to {output_filename}")

# Example usage: build the ingredient frequency table for this notebook's data.
# The output file name matches the one the frequency-rank cell further down
# reads as its input.
# NOTE: `input_filename` / `output_filename` are notebook-level names that the
# later cells reassign.
input_filename = 'parsed_Ingredients.csv'  
output_filename = 'unique_ingredients_frequencies.csv'
analyze_ingredients(input_filename, output_filename)




[INFO] Reading input CSV file: parsed_Ingredients.csv
Count of unique ingredients: 10752
               Ingredient Name  Frequency
0                       garlic       3643
1                    olive oil       3524
2                       butter       2051
3                 caster sugar       1980
4                         eggs       1529
...                        ...        ...
10747               pastry mix          1
10748  crusty wholegrain rolls          1
10749          Coles chickpeas          1
10750   Turkish bread croutons          1
10751      Microwavebrown rice          1

[10752 rows x 2 columns]
[INFO] Data saved to unique_ingredients_frequencies.csv


2.B. Plot the recipe size distribution for these recipes and the average size of the recipes (s).
Properly label the axes.

In [9]:
import matplotlib.pyplot as plt
def plot_recipe_size_distribution(input_filename, output_filename):
    """Plot a histogram of recipe sizes and mark the average size.

    The size of a recipe is the number of rows in ``input_filename`` that
    share its 'Recipe ID'. The histogram uses one bar per integer size, and a
    dashed red line marks the average size rounded to the nearest integer.
    The figure is saved to ``output_filename`` and closed.

    Args:
        input_filename: Path of the CSV file to read; it must contain the
            columns 'Recipe ID' and 'parsed_Ingredients'.
        output_filename: Path of the image file to write.

    Raises:
        ValueError: If a required column is missing or the file has no rows.
    """
    print(f"[INFO] Reading input CSV file: {input_filename}")

    df = pd.read_csv(input_filename)

    if 'Recipe ID' not in df.columns or 'parsed_Ingredients' not in df.columns:
        raise ValueError("CSV file must contain 'Recipe ID' and 'parsed_Ingredients' columns")

    # NOTE(review): every row counts toward the recipe size, including rows
    # whose 'parsed_Ingredients' value is missing - confirm this is intended.
    recipe_sizes = df.groupby('Recipe ID').size()

    # Without any recipes max()/mean() are NaN and the range() calls below
    # would fail with an unhelpful TypeError.
    if recipe_sizes.empty:
        raise ValueError("CSV file contains no recipes to plot")

    average_size = round(recipe_sizes.mean())
    largest_size = int(recipe_sizes.max())
    # Height of the tallest bar; drives the spacing of the y ticks.
    tallest_bar = int(recipe_sizes.value_counts().max())

    plt.figure(figsize=(10, 6))
    # Bin k covers [k, k + 1). align='left' centres each bar on its left edge,
    # i.e. on the integer tick it represents; with the default alignment the
    # bars sit half a unit to the right of their tick and of the average line.
    plt.hist(
        recipe_sizes,
        bins=range(1, largest_size + 2),
        align='left',
        edgecolor='black',
        alpha=0.7,
    )
    plt.title('Recipe Size Distribution')
    plt.xlabel('Number of Ingredients')
    plt.ylabel('Frequency')
    plt.xticks(range(1, largest_size + 1))
    # Roughly ten y ticks, with a step of at least 1.
    plt.yticks(range(0, tallest_bar + 1, max(1, tallest_bar // 10)))

    plt.axvline(average_size, color='red', linestyle='dashed', linewidth=1, label=f'Average Size = {average_size}')
    plt.legend()

    plt.savefig(output_filename)
    plt.close()

    print(f"[INFO] Recipe size distribution plot saved to {output_filename}")
    print(f"[INFO] Average recipe size: {average_size}")


# Run 2.B: plot the recipe size histogram for the parsed ingredient table and
# save it as a PNG image.
# NOTE: this rebinds the notebook-level `input_filename` / `output_filename`.
input_filename = 'parsed_Ingredients.csv' 
output_filename = 'recipe_size_distribution.png'
plot_recipe_size_distribution(input_filename, output_filename)


[INFO] Reading input CSV file: parsed_Ingredients.csv
[INFO] Recipe size distribution plot saved to recipe_size_distribution.png
[INFO] Average recipe size: 10


2.C. Plot the cumulative distribution of recipe size.

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_cumulative_distribution(input_filename, output_filename):
    """Plot the empirical cumulative distribution function of recipe sizes.

    The size of a recipe is the number of rows in ``input_filename`` that
    share its 'Recipe ID'. For the i-th smallest of n sizes (1-based) the
    plotted cumulative probability is i / n, so the curve ends at exactly 1.
    The figure is saved to ``output_filename`` and closed.

    Args:
        input_filename: Path of the CSV file to read; it must contain the
            columns 'Recipe ID' and 'parsed_Ingredients'.
        output_filename: Path of the image file to write.

    Raises:
        ValueError: If a required column is missing or the file has no rows.
    """
    print(f"[INFO] Reading input CSV file: {input_filename}")

    df = pd.read_csv(input_filename)

    if 'Recipe ID' not in df.columns or 'parsed_Ingredients' not in df.columns:
        raise ValueError("CSV file must contain 'Recipe ID' and 'parsed_Ingredients' columns")

    # NOTE(review): every row counts toward the recipe size, including rows
    # whose 'parsed_Ingredients' value is missing - confirm this is intended.
    recipe_sizes = df.groupby('Recipe ID').size()

    sorted_sizes = sorted(recipe_sizes)
    total = len(sorted_sizes)
    if total == 0:
        raise ValueError("CSV file contains no recipes to plot")

    # Fraction of recipes whose size is <= the i-th sorted size. Counting from
    # 1 (not 0) is what makes the largest size map to probability 1.0.
    cdf = [position / total for position in range(1, total + 1)]

    # Plot the CDF
    plt.figure(figsize=(10, 6))
    # Equal sizes share an x value; with where='post' the curve rises
    # vertically through them and holds the highest level until the next size.
    plt.step(sorted_sizes, cdf, where='post', color='blue', linestyle='-', linewidth=2)
    plt.title('Cumulative Distribution Function (CDF) of Recipe Size')
    plt.xlabel('Number of Ingredients')
    plt.ylabel('Cumulative Probability')
    plt.xticks(range(1, max(sorted_sizes) + 1))
    plt.yticks([i / 10.0 for i in range(11)])

    plt.savefig(output_filename)
    plt.close()

    print(f"[INFO] CDF plot saved to {output_filename}")

# Run 2.C: plot the cumulative distribution of recipe sizes for the parsed
# ingredient table and save it as a PNG image.
# NOTE: this rebinds the notebook-level `input_filename` / `output_filename`.
input_filename = 'parsed_Ingredients.csv' 
output_filename = 'recipe_size_cdf.png'
plot_cumulative_distribution(input_filename, output_filename)


[INFO] Reading input CSV file: parsed_Ingredients.csv
[INFO] CDF plot saved to recipe_size_cdf.png


In [18]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_frequency_rank_distribution(input_filename, output_filename_svg, output_filename_png):
    """Plot normalized ingredient frequency against rank on log-log axes.

    Reads a frequency table from ``input_filename``, orders it by descending
    'Frequency', assigns ranks starting at 1 and divides every frequency by
    the largest one. The resulting curve is saved once as SVG and once as
    PNG, after which the figure is closed.

    Args:
        input_filename: Path of the CSV file to read; it must contain the
            columns 'Ingredient Name' and 'Frequency'.
        output_filename_svg: Path of the SVG file to write.
        output_filename_png: Path of the PNG file to write.

    Raises:
        ValueError: If a required column is missing from the input file.
    """
    print(f"[INFO] Reading input CSV file: {input_filename}")

    frequency_table = pd.read_csv(input_filename)

    required_columns = ('Ingredient Name', 'Frequency')
    if any(column not in frequency_table.columns for column in required_columns):
        raise ValueError("CSV file must contain 'Ingredient Name' and 'Frequency' columns")

    # Most frequent ingredient first; the fresh 0-based index gives the rank.
    ranked = (
        frequency_table
        .sort_values(by='Frequency', ascending=False)
        .reset_index(drop=True)
    )
    ranked['Rank'] = ranked.index + 1

    # Scale so that the most frequent ingredient has value 1.
    ranked['Normalized Frequency'] = ranked['Frequency'] / ranked['Frequency'].max()

    plt.figure(figsize=(10, 6))
    plt.plot(
        ranked['Rank'],
        ranked['Normalized Frequency'],
        marker='o',
        linestyle='-',
        color='b',
    )
    plt.title('Frequency-Rank Distribution of Ingredients')
    plt.xlabel('Rank')
    plt.ylabel('Normalized Frequency')
    plt.xscale('log')
    plt.yscale('log')
    plt.grid(True, which="both", ls="--")

    # Same figure written in both formats, SVG first.
    for target_path, image_format in ((output_filename_svg, 'svg'), (output_filename_png, 'png')):
        plt.savefig(target_path, format=image_format)
    plt.close()

    print(f"[INFO] Frequency-rank distribution plot saved to {output_filename_svg} and {output_filename_png}")


# Plot the frequency-rank curve from the ingredient frequency table. The input
# file name is the one the analyze_ingredients cell above writes, so that cell
# must have been run first.
# NOTE: this rebinds the notebook-level `input_filename`.
input_filename = 'unique_ingredients_frequencies.csv'
output_filename_svg = 'frequency_rank_distribution.svg'
output_filename_png = 'frequency_rank_distribution.png'
plot_frequency_rank_distribution(input_filename, output_filename_svg, output_filename_png)


[INFO] Reading input CSV file: unique_ingredients_frequencies.csv


[INFO] Frequency-rank distribution plot saved to frequency_rank_distribution.svg and frequency_rank_distribution.png
