### Clean the Recipes Dataset

To train our model we have to prepare and clean the dataset.

#### We keep only the recommended "Gathered" entries from the dataset.

In [1]:
import pandas as pd

# Read the raw train and test recipe CSVs (in that order) into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_recipes_001.csv', 'test_recipes_001.csv')
)

In [2]:
# Keep only the rows whose 'source' is 'Gathered'. The explicit .copy() makes
# each result an independent DataFrame rather than a slice of the loaded one,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
df_train = df_train[df_train['source'] == 'Gathered'].copy()
df_test = df_test[df_test['source'] == 'Gathered'].copy()

In [3]:
# Regex character classes deleted from each text column: square brackets and
# double quotes everywhere, plus every comma in 'directions'.
# Raw strings keep `\[` / `\]` as regex escapes without triggering Python's
# invalid-escape-sequence warning; the pattern values themselves are unchanged.
MARKUP_PATTERNS = {
    'directions': r'[\[\],"]',
    'NER': r'[\[\]"]',
    'ingredients': r'[\[\]"]',
}


def strip_list_markup(df):
    """Return a new DataFrame with markup characters removed from the text columns.

    For every column named in ``MARKUP_PATTERNS`` the characters matched by its
    regex are deleted. ``DataFrame.assign`` builds a new frame, so ``df`` itself
    is not modified and no SettingWithCopyWarning is raised when ``df`` is a
    filtered slice of another frame.
    """
    return df.assign(**{
        column: df[column].str.replace(pattern, '', regex=True)
        for column, pattern in MARKUP_PATTERNS.items()
    })


# Clean the train dataset
df_train = strip_list_markup(df_train)

# Clean the test dataset
df_test = strip_list_markup(df_test)

In [4]:
def build_merged_text(df):
    """Return the per-row text combining title, ingredients and directions.

    Each value has the form
    "title: <title> ingredients: <ingredients> directions: <directions>",
    with a single space between sections; no '<section>' separator tokens are
    inserted. A missing value in any of the three columns makes that row's
    result missing too, because concatenating a string with NaN yields NaN.
    """
    return (
        'title: ' + df['title']
        + ' ingredients: ' + df['ingredients']
        + ' directions: ' + df['directions']
    )


# Merge everything into one text column, to be used for the label later.
# assign() returns a new frame, so this is safe even on a filtered slice.
df_train = df_train.assign(merged_text=build_merged_text(df_train))
df_test = df_test.assign(merged_text=build_merged_text(df_test))

In [5]:
# Drop the columns we don't need anymore. errors='ignore' skips any name that
# is absent from a frame, replacing the per-column existence checks.
columns_to_drop = ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source']

df_train = df_train.drop(columns=columns_to_drop, errors='ignore')
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

In [6]:
# Write both cleaned frames to new CSV files, leaving the row index out
df_train.to_csv(path_or_buf='train_recipes.002.clean.csv', index=False)
df_test.to_csv(path_or_buf='test_recipes.002.clean.csv', index=False)

In [7]:
# Sanity check: read the cleaned training CSV back and show (rows, columns)
data = pd.read_csv(filepath_or_buffer='train_recipes.002.clean.csv')
data.shape

(65781, 2)

In [8]:
# Preview the first three rows of the reloaded frame
data.head(n=3)

Unnamed: 0,NER,merged_text
0,"lemon juice, milk, vanilla, graham cracker cru...",title: Cherry Cheese Cake ingredients: 1/3 c. ...
1,white vinegar,title: Homemade (Scented) Fabric Softener ingr...
2,"yeast, shortening, potatoes, sugar, salt, brea...",title: Potato Bread ingredients: 1 1/2 pkg. ye...


In [9]:
# Show the first ten rows of the reloaded frame
data.head(10)

Unnamed: 0,NER,merged_text
0,"lemon juice, milk, vanilla, graham cracker cru...",title: Cherry Cheese Cake ingredients: 1/3 c. ...
1,white vinegar,title: Homemade (Scented) Fabric Softener ingr...
2,"yeast, shortening, potatoes, sugar, salt, brea...",title: Potato Bread ingredients: 1 1/2 pkg. ye...
3,"water, whole cloves, cinnamon, sugar, orange j...","title: Russian Tea ingredients: 1 gal. water, ..."
4,"grape tomatoes, garlic, red onion, carrot, sca...",title: “Light” Bolognese Sauce With Fettuccine...
5,"any white fish, water, cider, salt, butter",title: Poor Mary Lobster ingredients: 2 lb. an...
6,"ground beef, cheddar cheese, onion soup mix, m...",title: Mini Beef Rolls ingredients: 1 pound gr...
7,"lemon cake, sugar, oil, sour cream, eggs",title: Poppy Seed Cake ingredients: 1 pkg. yel...
8,"bread, Mozzarella cheese, butter, garlic chips",title: Cheesy Bread ingredients: 1 loaf French...
9,"mustard, apple cider vinegar, water, sugar, sa...",title: Pickled Eggs ingredients: 2 Tbsp. prepa...
