In [14]:
import pandas as pd
import re
import json
import tqdm

from utils.dependency import parent_dir   
from utils.basics import *
from utils.save import save_pickle, load_pickle

In [15]:
# #load file
layer1 = json.load(open('../data/indo_recipe.json'))

In [None]:
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
        u"\u013a"

                      "]+", re.UNICODE)                      

In [None]:
start_with = ['tbsp','pkt','g','tsp','x','cups','oz','mrs',
              'kg', 'pkg','tbsp','lbs','qt','lrg','grams','sm',
              'o','t','gram','c','lg','ml','ounces','ounce','g',
              'sdm','ml','gr','gram','g','l','liter','sdt','kg',
              'lbr','ekor','lembar','bahan ','rebus','cincang',
              'buah','iris','potong','sejumput','butir','kupas',
              'dadu','siung']

remove = ['%s.' %str(i) for i in range(30)]

def clean_line(line):
    '''
    Args:
        line: a string, such as food name, sentences...
    '''
    assert type(line) == str
    
    # all lowercase
    line = line.lower()
    line = line.replace(' .', '.')
    line = line.replace(' !', '!')
    line = line.replace(')', '')
    line = line.replace('*', '')
    line = line.replace('..', '.')
    line = line.replace(' - ', '')
    
    # only reserve number and alphabets
    line = re.sub(r"[^a-z0-9+()-/?&'!.,]", ' ', line)
    
    # replace things in brace
    line = re.sub(r'\([^)]*\)', '', line)
    
    # remove extra spaces
    line = re.sub(' +',' ',line).strip()
    
    #remove emoji
    line = re.sub(emoj,'', line)

    #remove unicode
    line = re.sub(r'(\\u[0-9A-Fa-f]+)', '', line)
    line = re.sub(r'(\\U[0-9A-Fa-f]+)', '', line)
    line = re.sub(r'(\\x[0-9A-Fa-f]+)', '', line)
    line = re.sub(r'(\\u[0-9]+)', '', line)

    return line

def clean_prefix(ingr):
    cleaned = []
    for ans in ingr:
        
        # strip
        ans = re.sub(' +',' ',ans).strip()
        
        # remove number
        ans = re.sub(r'\d+', '', ans)
        
        # remove period
        ans = ans.replace('.', '')
        
        # remove prefixes
        for prefix in start_with:
            ans = re.sub('^'+prefix+'\s', '', ans)
            
        # strip again
        ans = re.sub(' +',' ',ans).strip()

        if ans:
            cleaned.append(ans)
            
    return cleaned

In [67]:
def preprocessing_ingr(text):
    data = []
    for i, recipe in tqdm.tqdm(enumerate(text)):
        processed_recipe = {'id': recipe['id'],
                            'ingredients': []}
        field = 'ingredients'
        for j, line in enumerate(recipe[field]):
            cleaned = clean_line(line)
            if cleaned:
                processed_recipe[field].append(cleaned)  
        data.append(processed_recipe)
    return data


def preprocessing_steps(text):
    data = []
    for i, recipe in tqdm.tqdm(enumerate(text)):
        processed_recipe = {'id': recipe['id'],
                            'steps': ''}
        field = 'steps'
        for j, line in enumerate(recipe[field]):
            if field == 'steps':
                cleaned = clean_line(line)
                if cleaned:
                    processed_recipe[field] += cleaned + ''
        data.append(processed_recipe)
    return data

def preprocessing_title(text):
    data = []
    for i, recipe in tqdm.tqdm(enumerate(text)):
        processed_recipe = {'id': recipe['id'],
                            'title': ''}
        cleaned = clean_line(recipe['title'])
        processed_recipe['title'] += cleaned + ''
        data.append(processed_recipe)
    
    return data
        

In [46]:
layer1_ingr = preprocessing_ingr(layer1)

17509it [00:12, 1433.87it/s]


In [68]:
layer1_steps = preprocessing_steps(layer1)

17509it [00:09, 1823.19it/s]


In [69]:
layer1_title = preprocessing_title(layer1)

17509it [00:01, 11383.86it/s]


In [70]:
#combined data
layer1_combined = []
for i in range(len(layer1_ingr)):
    layer1_combined.append({'id': layer1_ingr[i]['id'],
                            'ingredients': layer1_ingr[i]['ingredients'],
                            'steps': layer1_steps[i]['steps'],
                            'title': layer1_title[i]['title']})

In [72]:
layer1_combined

[{'id': 0,
  'ingredients': ['1 ekor ayam kampung (potong 12',
   '2 buah jeruk nipis',
   '2 sdm garam',
   '3 ruas kunyit',
   '7 bawang merah',
   '7 bawang putih',
   '10 cabe merah',
   '10 cabe rawit merah (sesuai selera',
   '3 butir kemiri',
   '2 batang sereh',
   '2 lembar daun salam',
   '2 ikat daun kemangi',
   'penyedap rasa',
   '1 1/2 gelas air'],
  'steps': 'cuci bersih ayam dan tiriskan. lalu peras jeruk nipis (kalo gak ada jeruk nipis bisa pake cuka dan beri garam. aduk hingga merata dan diamkan selama 5 menit, biar ayam gak bau amis.goreng ayam tersebut setengah matang, lalu tiriskanhaluskan bumbu menggunakan blender. bawang merah, bawang putih, cabe merah, cabe rawit, kemiri dan kunyit. oh iya kasih minyak sedikit yaa biar bisa di blender. untuk sereh nya di geprek aja terus di buat simpul.setelah bumbu di haluskan barulah di tumis. jangan lupa sereh dan daun salamnya juga ikut di tumis. di tumis sampai berubah warna yamasukan ayam yang sudah di goreng setengah mat

In [71]:
#save to pickle
save_pickle(obj = layer1_combined, filename='../data/data_recipe.pickle', overwrite=True)