In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import random

## Data Preprocessing

In [2]:
# Load the RAW_recipes and RAW_interactions tables into pandas dataframes
raw_recipes_df = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv")
raw_interactions_df = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_interactions.csv")

### recipes_df

In [3]:
# Dropping not required columns
raw_recipes_df.drop(['contributor_id', 'submitted', 'n_ingredients', 'n_steps'], axis=1, inplace=True)

In [4]:
# Define the function to classify recipes
def classify_recipe(nutrition_data, thresholds):
    nutrition_data = nutrition_data.replace('[', '').replace(']', '').split(',')
    tags = []
    for index, value in enumerate(nutrition_data):
        nutrient = None
        if index == 0:
            nutrient = 'calories'
        elif index == 1:
            nutrient = 'total fat'
        elif index == 2:
            nutrient = 'sugar'
        elif index == 3:
            nutrient = 'sodium'
        elif index == 4:
            nutrient = 'protein'
        elif index == 5:
            nutrient = 'saturated fat'
        elif index == 6:
            nutrient = 'carbohydrates'
        
        if nutrient and nutrient in thresholds['high']:
            high_threshold = thresholds['high'][nutrient]
            low_threshold = thresholds['low'][nutrient]
            
            if float(value) > high_threshold:
                tags.append(f'high-{nutrient}')
            elif float(value) < low_threshold:
                tags.append(f'low-{nutrient}')
    return tags

# Define the thresholds from online sources
thresholds = {
    'high': {
        'calories': 2500,
        'protein': 50,
        'carbohydrates': 300,
        'sugar': 50,
        'sodium': 2300,
        'total fat': 70,
        'saturated fat': 20
    },
    'low': {
        'calories': 1200,
        'protein': 20,
        'carbohydrates': 130,
        'sugar': 25,
        'sodium': 500,
        'total fat': 20,
        'saturated fat': 5
    }
}

# Apply the function to each row and create a new column with the tags
# Assuming your nutrition data is stored in the 'nutrition' column as lists
raw_recipes_df['nutrition_tags'] = raw_recipes_df['nutrition'].apply(lambda x: classify_recipe(x, thresholds))

raw_recipes_df.drop(['nutrition'], axis=1, inplace=True)
# Display the DataFrame with the new column
raw_recipes_df.head(2)

Unnamed: 0,name,id,minutes,tags,steps,description,ingredients,nutrition_tags
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...","[low-calories, low-total fat, low-sugar, low-s..."
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...","['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...","[low-calories, low-total fat, low-sugar, low-s..."


In [13]:
# When training, the user won't mention all the ingredients; rather just some available ingredients.
# Function to split ingredients into random parts and just selecting one for training
def split_ingredients(ingredient_list):
    # Replace brackets and split the string into a list
    ingredient_list = ingredient_list.replace('[', '').replace(']', '').split(',')
    # Randomly shuffle the ingredient list
    random.shuffle(ingredient_list)
    # Check if the length of the ingredient list is greater than 1
    if len(ingredient_list) > 1:
        # Split the list into two random parts
        split_index = random.randint(1, len(ingredient_list) - 1)
        return ','.join(ingredient_list[:split_index]), ','.join(ingredient_list[split_index:])
    else:
        # Return the original list if it contains only one item
        return ','.join(ingredient_list), ''
    
# Apply the function to split ingredients for input and output separately
raw_recipes_df[['input_ingredients_1', 'input_ingredients_2']] = raw_recipes_df['ingredients'].apply(split_ingredients).apply(pd.Series)
raw_recipes_df.head(2)

Unnamed: 0,name,id,minutes,tags,steps,description,ingredients,nutrition_tags,input_ingredients_1,input_ingredients_2
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...","[low-calories, low-total fat, low-sugar, low-s...","'winter squash', 'salt', 'mixed spice', 'olive...","'butter', 'honey', 'mexican seasoning'"
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...","['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...","[low-calories, low-total fat, low-sugar, low-s...","'sausage patty', 'salt and pepper', 'cheese'","'milk', 'eggs','prepared pizza crust'"


### interactions_df

In [14]:
# Convert the 'review' column to strings
raw_interactions_df['review'] = raw_interactions_df['review'].astype(str)

# Aggregate the interactions by recipe_id and calculate the count of users, mean rating, and concatenate the reviews
interactions_agg = raw_interactions_df.groupby('recipe_id').agg({'user_id': 'nunique', 'rating': 'mean', 'review': lambda x: ' '.join(x)}).reset_index()
interactions_agg.rename(columns={'user_id': 'user_count'}, inplace=True)

### combining

In [15]:
# Merge the aggregated interactions with the RAW_recipes dataframe
combined_df = pd.merge(raw_recipes_df, interactions_agg, left_on='id', right_on='recipe_id', how='left')

combined_df.head(2)

Unnamed: 0,name,id,minutes,tags,steps,description,ingredients,nutrition_tags,input_ingredients_1,input_ingredients_2,recipe_id,user_count,rating,review
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...","[low-calories, low-total fat, low-sugar, low-s...","'winter squash', 'salt', 'mixed spice', 'olive...","'butter', 'honey', 'mexican seasoning'",137739,3,5.0,I used an acorn squash and recipe#137681 Swee...
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...","['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...","[low-calories, low-total fat, low-sugar, low-s...","'sausage patty', 'salt and pepper', 'cheese'","'milk', 'eggs','prepared pizza crust'",31490,4,3.5,"Have not tried this, but it sounds delicious. ..."


In [16]:
# Selecting top 100000 values
combined_df = combined_df.iloc[:100000,:]

In [17]:
# Drop the redundant column 'recipe_id' after merging
combined_df.drop(['recipe_id'], axis=1, inplace=True)

# Convert each list of tags to a string
combined_df['nutrition_tags_str'] = combined_df['nutrition_tags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else str(x))

In [18]:
# Combine input information into an instruction or prompt
combined_df['Instruction'] = "Tags: " + combined_df['tags'] + "\nNutrition: " + combined_df['nutrition_tags_str'] + "\nIngredients: " + combined_df['input_ingredients_1'] + "\nRating: " + combined_df['rating'].astype(str)
combined_df['Instruction'] = combined_df['Instruction'].str.replace('\n', ' ', regex=False)

# Combine output information into a response or target
combined_df['Response'] = "Name: " + combined_df['name'] + "\nMinutes: " + combined_df['minutes'].astype(str) + "\nIngredients: " + combined_df['ingredients'] + "\nSteps: " + combined_df['steps']
combined_df['Response'] = combined_df['Response'].str.replace('\n', ' ', regex=False)

In [19]:
# Selecting just Input and Output required for modelling
final_df = combined_df[['id', 'Instruction', 'Response']]
final_df.head()

Unnamed: 0,id,Instruction,Response
0,137739,"Tags: ['60-minutes-or-less', 'time-to-make', '...",Name: arriba baked winter squash mexican sty...
1,31490,"Tags: ['30-minutes-or-less', 'time-to-make', '...",Name: a bit different breakfast pizza Minutes...
2,112140,"Tags: ['time-to-make', 'course', 'preparation'...",Name: all in the kitchen chili Minutes: 130 I...
3,59389,"Tags: ['60-minutes-or-less', 'time-to-make', '...",Name: alouette potatoes Minutes: 45 Ingredien...
4,44061,"Tags: ['weeknight', 'time-to-make', 'course', ...",Name: amish tomato ketchup for canning Minut...


In [20]:
final_df.to_csv('inst-resp.csv')

Saved the dataset!!