In [3]:
from collections import defaultdict
import random
import string
import multiprocess as mp
import pandas as pd
import numpy as np

import contractions

import gensim.downloader as api

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from keras.preprocessing.sequence import pad_sequences

# Tokens removed during cleaning: NLTK's English stop words plus two extra
# corpus-specific entries. Kept as a list, exactly as before.
STOP = [*stopwords.words('english'), "yesterday", 'urllink']
# Token -> integer index table of the pretrained 200-d GloVe vectors. Only the
# table is bound to a name; the loaded model object itself is not kept.
ids = api.load('glove-wiki-gigaword-200').key_to_index

In [4]:
def process(df, f, n_workers=8):
    """Apply ``f`` to every element of ``df`` in parallel worker processes.

    ``df`` is cut into ``n_workers`` consecutive pieces with
    ``np.array_split``; each worker runs ``piece.map(f)`` on one piece and the
    mapped pieces are concatenated back together in their original order.

    Parameters
    ----------
    df : object accepted by ``np.array_split`` whose pieces expose ``.map``
        (the notebook passes a pandas Series).
    f : callable
        Function applied to each element.
    n_workers : int, default 8
        Number of worker processes, and also the number of pieces ``df`` is
        split into.

    Returns
    -------
    The result of ``pd.concat`` over the mapped pieces.
    """
    pieces = np.array_split(df, n_workers)
    # The pool used to be created on every call and never closed, so each call
    # left its worker processes behind. The ``with`` block shuts them down as
    # soon as ``map`` has returned all results.
    with mp.Pool(n_workers) as pool:
        # NOTE(review): shipping a lambda to the workers relies on the
        # serializer of the `multiprocess` package imported as `mp`.
        out = pool.map(lambda d: d.map(f), pieces)
    return pd.concat(out)

def strip(s):
    """Turn one piece of text into a list of cleaned tokens.

    The text is lower-cased, passed through ``contractions.fix``, stripped of
    every character in ``string.punctuation``, tokenised with
    ``word_tokenize`` and finally filtered so that no token found in ``STOP``
    remains.
    """
    expanded = contractions.fix(s.lower())
    punctuation_free = expanded.translate(str.maketrans('', '', string.punctuation))
    return [token for token in word_tokenize(punctuation_free) if token not in STOP]

def strip_sent(s, minlen=5):
    """Split ``s`` into sentences and return the cleaned token list of each.

    Parameters
    ----------
    s : str
        Text to split with ``sent_tokenize``.
    minlen : int, default 5
        A sentence is kept only when it has strictly more than ``minlen``
        tokens left after ``strip``.

    Returns
    -------
    list of list
        One token list per kept sentence, in the original sentence order.
    """
    # Clean each sentence exactly once. The earlier version called ``strip``
    # twice per sentence: once for the length test and again for the result.
    cleaned = (strip(sentence) for sentence in sent_tokenize(s))
    return [tokens for tokens in cleaned if len(tokens) > minlen]

class DictDefault(dict):
    """A ``dict`` that answers lookups of absent keys with ``factory()``.

    The produced value is only returned, never stored: the missing key stays
    absent and every failed lookup calls the factory again.
    """

    # Optional; purely a memory optimization.
    __slots__ = ['_factory']

    def __init__(self, factory, *args, **kwargs):
        """Build the dict from ``*args``/``**kwargs`` and remember ``factory``."""
        super().__init__(*args, **kwargs)
        self._factory = factory

    def __missing__(self, key):
        """Hook invoked when ``self[key]`` finds no entry; ``key`` is not used."""
        produced = self._factory()
        return produced

# Every key of the vocabulary table, materialised once so that
# random.choice can pick from it by position.
id_list = list(ids.keys())


def default_v():
    """Return the id stored under a vocabulary key chosen at random."""
    random_key = random.choice(id_list)
    return ids[random_key]


# Rebind `ids` to a DictDefault copy of itself: looking up a token that is not
# in the table now yields the id of a randomly chosen known token.
ids = DictDefault(default_v, ids)

def to_id(s):
    """Look up every token of ``s`` in ``ids`` and return the ids as an array."""
    looked_up = []
    for token in s:
        looked_up.append(ids[token])
    return np.array(looked_up)

In [5]:
# The parsed sentences are written to disk because of memory issues.
# After this cell has run, restart the interpreter and run every cell except this one.

# --- recipes: read the CSV 50000 rows at a time, parse the 'steps' column ---
chunks = []
for chunk_no, frame in enumerate(pd.read_csv('data/recipes.csv', chunksize=50000)):
    print(f'{chunk_no*50000} chunks')
    # NOTE(review): eval() executes whatever expression each CSV cell holds.
    # If this file is not fully trusted, ast.literal_eval is the safe way to
    # parse literal lists.
    parsed = process(
        frame['steps'].map(eval).explode().dropna(),
        strip_sent,
    )
    chunks.append(parsed.explode())

# Number of recipe chunks; bounds how much of the blog file is read below.
n = len(chunks)
pd.DataFrame(pd.concat(chunks, ignore_index=True)).to_feather('recipes-parsed.feather')

# --- blogs: same treatment of the 'text' column ---
chunks = []
for chunk_no, frame in enumerate(pd.read_csv('data/blogs.csv', chunksize=50000)):
    print(f'{chunk_no*50000} chunks')
    chunks.append(process(frame['text'], strip_sent).explode())
    # Stop once one more blog chunk than recipe chunks has been collected.
    if len(chunks) > n:
        break

# ignore_index=True already yields a fresh 0..n-1 index, so no further reset is needed.
pd.DataFrame(pd.concat(chunks, ignore_index=True)).to_feather('blogs-parsed.feather')

0 chunks
50000 chunks
100000 chunks
150000 chunks
200000 chunks
0 chunks
50000 chunks
100000 chunks
150000 chunks
200000 chunks
250000 chunks


In [5]:
# Reload the sentences parsed before the kernel restart. Rows without a
# sentence are dropped and the index is renumbered.
blogs = pd.read_feather('blogs-parsed.feather').dropna().reset_index(drop=True)
recipes = pd.read_feather('recipes-parsed.feather').dropna().reset_index(drop=True)

# This cell is random in two places: the row sample drawn below and the random
# ids that `to_id` returns for tokens missing from `ids`. Neither was seeded,
# so the resulting training data could not be rebuilt. Seed both.
SAMPLE_SEED = 0
random.seed(SAMPLE_SEED)

# Balance the two classes: draw as many blog sentences as there are recipe
# sentences (sampling without replacement, so blogs must have at least that
# many rows). squeeze(axis=1) turns the single-column frame into a Series even
# when it has only one row, where a bare squeeze() would return a scalar.
blogs = (
    blogs.squeeze(axis=1)
    .sample(recipes.shape[0], random_state=SAMPLE_SEED)
    .map(to_id)
)
recipes = recipes.squeeze(axis=1).map(to_id)

In [6]:
# Stack both corpora. `keys` adds an outer index level (0 for blogs, 1 for
# recipes) which reset_index(level=0) turns into the first column.
conc = pd.concat([blogs, recipes], keys=[0, 1]).reset_index(level=0)
conc.columns = ['key', 'sent']

# Run every id sequence through pad_sequences with maxlen=20 and store the
# rows back as plain Python lists.
# NOTE(review): the sample output below shows the padding value is 0; confirm
# that 0 cannot also be a real token id in `ids`.
conc['sent'] = pad_sequences(conc['sent'], maxlen=20).tolist()

# Written with a fresh 0..n-1 index; `conc` itself keeps its current index.
conc.reset_index(drop=True).to_feather('train.feather')

In [8]:
# Rows labelled 1, i.e. the recipe sentences.
conc.loc[conc['key'] == 1]

Unnamed: 0,key,sent
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 434..."
1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19568, 13924, 2..."
2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 3714, 13924, 22763..."
3,1,"[0, 0, 0, 0, 0, 0, 0, 0, 13418, 5745, 3022, 43..."
4,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5604, 629..."
...,...,...
1179894,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3028, 9058, 204..."
1179895,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 289083..."
1179896,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 886, 11641, 3451, ..."
1179897,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9933, 3191, ..."
