In [55]:
import os
import pickle

from tqdm import tqdm

In [59]:
# Directory that holds the input pickle and receives the derived pickles.
# It can be overridden with the REVIEWS_DATA_DIR environment variable so the
# notebook is not tied to one machine; the default keeps the original location.
DATA_DIR=os.environ.get('REVIEWS_DATA_DIR','/home/kvassay/data/z/data')
# Input file: unpickled into (train, dev, test) by the loading cell.
DATASET=os.path.join(DATA_DIR,'reviews_train_test_dev1.pickle')
# Output path template: write() fills '{}' with the dataset variant name.
SAVE_PATH=os.path.join(DATA_DIR,'reviews_train_test_dev1_{}.pickle')

In [5]:
%%time
# Load the pickled object at DATASET and unpack it into the three splits
# train, dev and test.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The recorded run below took roughly 7 minutes of wall time.
with open(DATASET,'rb') as f:
    train,dev,test=pickle.load(f)

CPU times: user 7min 8s, sys: 10.6 s, total: 7min 18s
Wall time: 7min 17s


#### Dataset - extractors and reducers

In [61]:
# spaCy output extractors
def get_sents(sample):
    """Return the sentences of a sample's parsed text as plain strings.

    Iterates ``sample['nlp_text'].sents`` (presumably a spaCy ``Doc`` - confirm
    against the code that builds the dataset), converts each sentence with
    ``str`` and strips surrounding whitespace.

    Returns:
        tuple of str, one entry per sentence, in iteration order.
    """
    parsed_text = sample['nlp_text']
    return tuple(str(sentence).strip() for sentence in parsed_text.sents)

def get_lemmatized_tokens(sample,key):
    """Return the lemma of every token in ``sample['nlp_' + key]``.

    Args:
        sample: mapping that holds the parsed field under ``'nlp_' + key``.
        key: field name without the ``'nlp_'`` prefix (callers in this
            notebook pass ``'summary'`` or ``'text'``).

    Returns:
        tuple of str built from each token's ``lemma_`` attribute, in
        iteration order.
    """
    parsed = sample['nlp_' + key]
    return tuple(str(token.lemma_) for token in parsed)

def get_tokens(sample,key):
    """Return every token in ``sample['nlp_' + key]`` converted with ``str``.

    Args:
        sample: mapping that holds the parsed field under ``'nlp_' + key``.
        key: field name without the ``'nlp_'`` prefix.

    Returns:
        tuple of str, one entry per token, in iteration order.
    """
    parsed = sample['nlp_' + key]
    return tuple(map(str, parsed))

def get_lemmatized(sample,key):
    """Return the lemmas of ``sample['nlp_' + key]`` joined into one string.

    Args:
        sample: mapping that holds the parsed field under ``'nlp_' + key``.
        key: field name without the ``'nlp_'`` prefix.

    Returns:
        str with each token's ``lemma_`` separated by a single space.
    """
    parsed = sample['nlp_' + key]
    lemmas = (str(token.lemma_) for token in parsed)
    return ' '.join(lemmas)

def to_sents(x):
    """Reduce a sample to its id, score, summary and sentence-split text.

    ``review_id``, ``score`` and ``summary`` are copied unchanged; ``text`` is
    replaced by the tuple of sentence strings returned by ``get_sents``.
    """
    reduced = {field: x[field] for field in ('review_id', 'score', 'summary')}
    reduced['text'] = get_sents(x)
    return reduced

# Dataset reducers
def to_lemmatized(x):
    """Reduce a sample to its id, score and lemmatized summary/text strings.

    ``review_id`` and ``score`` are copied unchanged; ``summary`` and ``text``
    are each replaced by the space-joined lemma string from ``get_lemmatized``.
    """
    reduced = {'review_id': x['review_id'], 'score': x['score']}
    for field in ('summary', 'text'):
        reduced[field] = get_lemmatized(x, field)
    return reduced

def to_lemmatized_tokens(x):
    """Reduce a sample to its id, score and lemma tuples for summary/text.

    ``review_id`` and ``score`` are copied unchanged; ``summary`` and ``text``
    are each replaced by the tuple of lemmas from ``get_lemmatized_tokens``.
    """
    reduced = {'review_id': x['review_id'], 'score': x['score']}
    for field in ('summary', 'text'):
        reduced[field] = get_lemmatized_tokens(x, field)
    return reduced

def to_tokens(x):
    """Reduce a sample to its id, score and token tuples for summary/text.

    ``review_id`` and ``score`` are copied unchanged; ``summary`` and ``text``
    are each replaced by the tuple of token strings from ``get_tokens``.
    """
    reduced = {'review_id': x['review_id'], 'score': x['score']}
    for field in ('summary', 'text'):
        reduced[field] = get_tokens(x, field)
    return reduced

def to_raw(x):
    """Reduce a sample to its four plain fields, dropping everything else.

    Returns a new dict holding ``review_id``, ``score``, ``summary`` and
    ``text`` (in that order) copied unchanged from ``x``.
    """
    kept_fields = ('review_id', 'score', 'summary', 'text')
    return {field: x[field] for field in kept_fields}

def write(ds, name):
    """Pickle the first three elements of ``ds`` as one tuple.

    Args:
        ds: indexable container; elements 0, 1 and 2 are written.
        name: value substituted into ``SAVE_PATH`` to build the output path.

    The target file is opened in binary write mode, so an existing file at
    that path is overwritten.
    """
    target_path = SAVE_PATH.format(name)
    with open(target_path, 'wb') as out_file:
        splits = (ds[0], ds[1], ds[2])
        pickle.dump(splits, out_file)

#### Prepare datasets:
- raw
- lemmatized
- tokenized
- lemmatized tokenized
- sentences

In [58]:
%%time
# One container per output variant. Each holds three lists whose positions
# line up with the (train, dev, test) order of `original` below.
raw=([],[],[])      # summary/text copied unchanged
lem=([],[],[])      # summary/text as space-joined lemma strings
tok=([],[],[])      # summary/text as tuples of token strings
lem_tok=([],[],[])  # summary/text as tuples of lemma strings
sent=([],[],[])     # text as a tuple of sentence strings, summary unchanged
original=(train,dev,test)

# Single pass over every sample of every split: apply each reducer and append
# the result to the list at the same split index j, so every variant keeps the
# sample order of the source split.
for j,dataset in enumerate(original):
    for sample in tqdm(dataset):
        raw[j].append(to_raw(sample))
        lem[j].append(to_lemmatized(sample))
        tok[j].append(to_tokens(sample))
        lem_tok[j].append(to_lemmatized_tokens(sample))
        sent[j].append(to_sents(sample))    

100%|██████████| 551399/551399 [04:07<00:00, 2230.99it/s]
100%|██████████| 8527/8527 [00:03<00:00, 2236.99it/s]
100%|██████████| 8527/8527 [00:03<00:00, 2277.08it/s]

CPU times: user 4min 12s, sys: 4.33 s, total: 4min 16s
Wall time: 4min 14s





#### Persist

In [63]:
# (dataset variant, name) pairs to persist. Each name is substituted into
# SAVE_PATH by write() to form that variant's output file path.
WRITE_PAIRS=[
    (raw,'raw'),
    (lem,'lem'),
    (tok,'tok'),
    (lem_tok,'lem_tok'),
    (sent,'sent')
]

In [None]:
%%time
# Pickle every dataset variant to its own file, one file per WRITE_PAIRS entry.
for ds, name in WRITE_PAIRS:
    write(ds,name)