### Clean the Recipes Dataset

Before we can train our model, we have to clean and prepare the dataset.

In [1]:
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import unicodedata
import codecs

# Download the NLTK stop-word corpus. clean_text below does not currently
# filter stop words, so this is only needed if that step is enabled.
nltk.download('stopwords')

def clean_text(text):
    """Strip markup and unusual characters from one recipe text field.

    Steps, in order:
      1. Drop HTML tags, keeping only the visible text.
      2. Turn literal backslash escapes (for example a backslash-u code for
         the degree sign) into the characters they stand for.
      3. Spell out the degree sign as the word "degrees" and fold accented
         letters and vulgar fractions to plain ASCII.
      4. Remove every character other than ASCII letters, digits,
         whitespace and ``. , : ; / -``.
      5. Trim leading and trailing whitespace.

    Quantities and units such as "2 cups" or "350 degrees" are kept.
    Case is preserved and stop words are not removed.

    Args:
        text: The raw string to clean. Non-string values (for example the
            NaN that pandas uses for a missing cell) are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Missing cells reach .apply() as float NaN; pass them through untouched
    # instead of failing inside the HTML parser.
    if not isinstance(text, str):
        return text

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Replace escape sequences with their corresponding characters.
    # Going through latin-1 with 'backslashreplace' keeps characters that are
    # already non-ASCII intact; decoding the str directly with
    # 'unicode_escape' would reinterpret their UTF-8 bytes as latin-1.
    try:
        text = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape (e.g. a trailing backslash): this step is
        # best-effort, so keep the text as it was.
        pass

    # Put a space between a whole number and a following vulgar fraction so
    # that "one and a half" decomposes to "1 1/2" rather than "11/2".
    text = re.sub(r'(\d)([\u00bc-\u00be\u2150-\u215e])', r'\1 \2', text)

    # Decompose compatibility characters (accents, fractions, ligatures)
    # BEFORE filtering, so their ASCII base characters are kept.
    text = unicodedata.normalize('NFKD', text)

    # Spell out the degree sign and map the fraction slash produced by the
    # decomposition above to a plain '/'.
    text = text.replace('\u00b0', ' degrees ').replace('\u2044', '/')

    # Drop whatever still has no ASCII form (combining marks, emoji, ...).
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove remaining disallowed punctuation. Digits and letters are in the
    # allowed set, so numbers and measurement units are left in place.
    text = re.sub(r'[^a-zA-Z0-9\s.,:;/-]', '', text)

    # Trim last, after every step that can leave whitespace at the edges.
    return text.strip()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datascience/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Keep only the recommended "Gathered" entries, then clean, merge, and save the dataset

In [2]:
import pandas as pd

# Characters left over from the list-style serialisation of the text columns.
# 'directions' additionally loses its commas; 'NER' and 'ingredients' keep them.
LIST_SYNTAX = r'[\[\]"]'
LIST_SYNTAX_AND_COMMAS = r'[\[\],"]'

# Columns that are no longer needed once 'merged_text' has been built.
columns_to_drop = ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source']


def prepare_recipes(csv_path):
    """Load one recipe CSV and reduce it to the columns used for training.

    Keeps only rows whose 'source' is 'Gathered', strips list punctuation
    from 'directions', 'NER' and 'ingredients', builds a single
    'merged_text' column from title, ingredients and directions, and drops
    the columns listed in ``columns_to_drop`` (those that exist).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new DataFrame with the remaining columns (including 'NER' and
        'merged_text').
    """
    recipes = pd.read_csv(csv_path)

    # .copy() makes the filtered frame independent of the one it was sliced
    # from, so the column assignments below do not raise
    # SettingWithCopyWarning.
    recipes = recipes[recipes['source'] == 'Gathered'].copy()

    recipes['directions'] = recipes['directions'].str.replace(LIST_SYNTAX_AND_COMMAS, '', regex=True)
    recipes['NER'] = recipes['NER'].str.replace(LIST_SYNTAX, '', regex=True)
    recipes['ingredients'] = recipes['ingredients'].str.replace(LIST_SYNTAX, '', regex=True)

    # Merge everything into one field to be used for the label later:
    # "title: <title> ingredients: <ingredients> directions: <directions>"
    recipes['merged_text'] = (
        'title: ' + recipes['title']
        + ' ingredients: ' + recipes['ingredients']
        + ' directions: ' + recipes['directions']
    )

    # errors='ignore' skips any listed column that is not present.
    return recipes.drop(columns=columns_to_drop, errors='ignore')


df_train = prepare_recipes('subset/train_recipes_001.csv')
df_test = prepare_recipes('subset/test_recipes_001.csv')

# Save the cleaned DataFrames to new CSV files
df_train.to_csv('finalset/train_recipes.002.clean.csv', index=False)
df_test.to_csv('finalset/test_recipes.002.clean.csv', index=False)

In [3]:
# Read the cleaned training file back in and check its size as (rows, columns).
data = pd.read_csv('finalset/train_recipes.002.clean.csv')
data.shape

(65781, 2)

In [4]:
# Preview the first three rows of the reloaded training data.
data.head(3)

Unnamed: 0,NER,merged_text
0,"lemon juice, milk, vanilla, graham cracker cru...",title: Cherry Cheese Cake ingredients: 1/3 c. ...
1,white vinegar,title: Homemade (Scented) Fabric Softener ingr...
2,"yeast, shortening, potatoes, sugar, salt, brea...",title: Potato Bread ingredients: 1 1/2 pkg. ye...


In [5]:
# Show the first ten rows for a broader look at the cleaned records.
data.head(10)

Unnamed: 0,NER,merged_text
0,"lemon juice, milk, vanilla, graham cracker cru...",title: Cherry Cheese Cake ingredients: 1/3 c. ...
1,white vinegar,title: Homemade (Scented) Fabric Softener ingr...
2,"yeast, shortening, potatoes, sugar, salt, brea...",title: Potato Bread ingredients: 1 1/2 pkg. ye...
3,"water, whole cloves, cinnamon, sugar, orange j...","title: Russian Tea ingredients: 1 gal. water, ..."
4,"grape tomatoes, garlic, red onion, carrot, sca...",title: “Light” Bolognese Sauce With Fettuccine...
5,"any white fish, water, cider, salt, butter",title: Poor Mary Lobster ingredients: 2 lb. an...
6,"ground beef, cheddar cheese, onion soup mix, m...",title: Mini Beef Rolls ingredients: 1 pound gr...
7,"lemon cake, sugar, oil, sour cream, eggs",title: Poppy Seed Cake ingredients: 1 pkg. yel...
8,"bread, Mozzarella cheese, butter, garlic chips",title: Cheesy Bread ingredients: 1 loaf French...
9,"mustard, apple cider vinegar, water, sugar, sa...",title: Pickled Eggs ingredients: 2 Tbsp. prepa...
