In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

# To process accented characters
import unicodedata

# NLTK setup: tokenizer, stop-word list and stemmer used by clean() further down
import nltk
# Download the 'punkt' package before tokenizing (presumably the data word_tokenize relies on - confirm)
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the 'stopwords' corpus that stopwords.words() reads on the next line
nltk.download('stopwords')
# Stored as a set so the per-token membership test in clean() is a hash lookup
stop_words_nltk = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, reused for every token in clean()
stemmer = PorterStemmer()

# Machine learning
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# lemmatizer=WordNetLemmatizer()

# # SPACY
# import spacy
# from spacy.lang.en.stop_words import STOP_WORDS
# spacy_model = spacy.load('en_core_web_sm')
# stopwords_spacy = spacy_model.Defaults.stop_words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Boss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Boss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read in the csv file (the path is relative to the notebook's working directory)
# NOTE(review): the preview output shows an 'Unnamed: 0' column, which suggests the CSV was
# written together with its index; consider index_col=0 if nothing downstream uses that column
df = pd.read_csv("../data/cuisine_full_ingredients.csv")

In [3]:
# Overview of the data set: 10 random rows; random_state is fixed so the same rows are shown on every run
df.sample(10, random_state=2)

Unnamed: 0,cuisine,recipe,full_ingredients
3727,Italian,Spicy fregola with scallops and crispy kale,"['olive oil, for shallow and deep frying', '1 ..."
1923,Chinese,Chunky bacon and cucumber salad,"['1 tbsp groundnut oil', '6 long dried chillie..."
3552,Italian,Polenta pork,"['2 tbsp sunflower oil ', '12 rashers thick ri..."
1146,British,Oat fig stuffing,"['140g/5oz butter', '100g/3½oz jumbo oats', '1..."
3195,Italian,Bolognese with tagliatelle,"['250g/9oz 00 flour', '3-4 medium free-range e..."
1643,British,Textured potato salad,"['675g/1½lb floury potatoes, peeled and cut in..."
3054,Indian,Tandoori lamb wrap,"['150ml/5fl oz Greek-style yoghurt ', '½ small..."
842,British,Gammon and pease pudding,"['300g/10oz dried yellow split peas', '50g/2oz..."
505,British,Braised shin of beef with parsnip purée,"['4kg/9lb beef shin, on the bone', 'sea salt a..."
2224,French,Mary Berry's celeriac remoulade,"['650g/1lb 7oz celeriac, peeled and sliced int..."


## Define text pre-processing function

In [4]:
# Replacement callback for re.sub that strips accents from the matched text
def remove_accented_chars(matchobj):
    """Return the ASCII form of the text matched by ``matchobj``.

    The matched text is NFKD-normalised, which splits an accented letter into
    its base letter plus combining marks. Encoding to ASCII with errors
    ignored then drops every non-ASCII code point, leaving the base letters.
    """
    decomposed = unicodedata.normalize('NFKD', matchobj.group())
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

I'm combining the text cleaning and pre-processing steps as explored and explained in the 00_EDA file.

In [5]:
def clean(doc):
    """Clean and normalise a pandas Series of raw ingredient strings.

    Steps, applied in order to every element:
      1. lower-case the text;
      2. drop every token that contains a digit or a vulgar fraction
         (quantities such as ``350g`` or ``1½lb``);
      3. strip ASCII punctuation, then the extra typographic symbols that
         ``string.punctuation`` does not cover. Punctuation is deleted, not
         replaced by a space, so hyphenated words merge
         (``black-eyed`` becomes ``blackeyed``);
      4. replace the listed accented letters with their ASCII base letter;
      5. tokenise with ``word_tokenize``;
      6. remove the words found in ``stop_words_nltk``;
      7. stem each remaining token with ``stemmer``;
      8. join the stems back into one space-separated string.

    Parameters
    ----------
    doc : pandas.Series
        Series of strings. Elements are assumed to be strings; missing
        values are not handled here.

    Returns
    -------
    pandas.Series
        Series with the same index as ``doc`` holding the processed text.
    """
    doc = doc.str.lower()
    # regex=True is passed explicitly: since pandas 2.0 Series.str.replace defaults to
    # regex=False, which would match these patterns as literal text and remove nothing.
    doc = doc.str.replace(r'\w*[\d¼½¾⅓⅔⅛⅜⅝]\w*', '', regex=True)
    doc = doc.str.translate(str.maketrans('', '', string.punctuation))
    doc = doc.str.replace(r'[£×–‘’“”⁄]', '', regex=True)
    # Only the accented letters listed in the character class are transliterated
    doc = doc.apply(lambda text: re.sub(r'[âãäçèéêîïñóôûüōưấớ]', remove_accented_chars, text))
    doc = doc.apply(lambda text: word_tokenize(text))
    doc = doc.apply(lambda tokens: [word for word in tokens if word not in stop_words_nltk])
    doc = doc.apply(lambda tokens: [stemmer.stem(word) for word in tokens])
    processed_doc = doc.apply(lambda tokens: ' '.join(tokens))

    return processed_doc

In [6]:
# Store the cleaned text in a new column; the raw full_ingredients column is left as it is
df["ingredients_processed"] = clean(df.full_ingredients)

In [7]:
# Preview the first rows to compare the raw and processed ingredients side by side
df.head()

Unnamed: 0,cuisine,recipe,full_ingredients,ingredients_processed
0,African,Moroccan-style roasted veg,"['2 large red or orange peppers, de-seeded and...",larg red orang pepper deseed cut bites chunk s...
1,African,"Beef, black-eyed bean and plantain hotpot","['350g/12oz dried black-eyed beans', '900g/2lb...",dri blackey bean sirloin steak cut cube oz veg...
2,African,Bobotie,"['50ml/2fl oz olive oil', '1 onion, chopped', ...",oz oliv oil onion chop garlic clove crush lamb...
3,African,South African bobotie,"['butter, for greasing', '400ml/14fl oz full-f...",butter greas oz fullfat milk fresh white bread...
4,African,Bunny chow,"['1 tbsp olive oil', '1 onion, thinly sliced',...",tbsp oliv oil onion thinli slice garlic clove ...
