In [1]:
import pickle
from tqdm import tqdm
import spacy 

In [None]:
# Input pickle; it is unpacked into three objects (train, dev, test) when loaded.
# NOTE: absolute, machine-specific path.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1.pickle'
# Output path template; write() fills the `{}` placeholder with a variant name.
SAVE_PATH = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'

In [None]:
%%time
# Unpack the three objects stored in the DATASET pickle into train, dev and test.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# DATASET must point to a trusted file.
with open(DATASET, 'rb') as dataset_file:
    train, dev, test = pickle.load(dataset_file)

In [None]:
# Load the 'en_core_web_lg' pipeline with the "ner" and "tagger" components
# disabled.
nlp = spacy.load('en_core_web_lg', disable=["ner", "tagger"])
# Reuse the tokenizer that belongs to the loaded pipeline.
# `nlp.Defaults.create_tokenizer(nlp)` was removed in spaCy v3 (it raises
# AttributeError there); `nlp.tokenizer` is available in both v2 and v3.
tokenizer = nlp.tokenizer

#### Dataset - sentences

In [None]:
# Spacy output extractors
def tokenize_sent(sent):
    """Run the module-level `tokenizer` over `sent` and return a list with
    each resulting item converted via str()."""
    return list(map(str, tokenizer(sent)))

def get_sents(sample):
    """Return a tuple with the stripped string form of every item in
    sample['nlp_text'].sents."""
    stripped = (str(sentence).strip() for sentence in sample['nlp_text'].sents)
    return tuple(stripped)

def get_sents_tokenized(sample):
    """Return a tuple holding, for every item in sample['nlp_text'].sents,
    the result of tokenize_sent() applied to its stripped string form."""
    tokenized = []
    for sentence in sample['nlp_text'].sents:
        tokenized.append(tokenize_sent(str(sentence).strip()))
    return tuple(tokenized)
    

def get_lemmatized_tokens(sample, key):
    """Return a tuple with str(item.lemma_) for every item in
    sample['nlp_' + key]."""
    items = sample['nlp_' + key]
    return tuple(str(item.lemma_) for item in items)

def get_tokens(sample, key):
    """Return a tuple with the str() form of every item in
    sample['nlp_' + key]."""
    return tuple(map(str, sample['nlp_' + key]))

def get_lemmatized(sample, key):
    """Return one string: str(item.lemma_) for every item in
    sample['nlp_' + key], joined with single spaces."""
    lemmas = (str(item.lemma_) for item in sample['nlp_' + key])
    return ' '.join(lemmas)

def to_sents(x):
    """Build a new dict from sample `x`: 'review_id', 'score' and 'summary'
    are copied as-is, 'text' is replaced by the tuple from get_sents(x)."""
    record = {}
    record['review_id'] = x['review_id']
    record['score'] = x['score']
    record['summary'] = x['summary']
    record['text'] = get_sents(x)
    return record

def to_sents_tokenized(x):
    """Build a new dict from sample `x`: 'review_id' and 'score' are copied,
    'summary' becomes get_tokens(x, 'summary') and 'text' becomes
    get_sents_tokenized(x)."""
    return dict(
        review_id=x['review_id'],
        score=x['score'],
        summary=get_tokens(x, 'summary'),
        text=get_sents_tokenized(x),
    )

# Dataset reducers
def to_lemmatized(x):
    """Build a new dict from sample `x`: 'review_id' and 'score' are copied,
    'summary' and 'text' become the joined-lemma strings returned by
    get_lemmatized()."""
    # Fields are read in the same order as they appear in the result.
    review_id, score = x['review_id'], x['score']
    summary = get_lemmatized(x, 'summary')
    text = get_lemmatized(x, 'text')
    return {'review_id': review_id, 'score': score, 'summary': summary, 'text': text}

def to_lemmatized_tokens(x):
    """Build a new dict from sample `x`: 'review_id' and 'score' are copied,
    'summary' and 'text' become the lemma tuples returned by
    get_lemmatized_tokens()."""
    record = {'review_id': x['review_id'], 'score': x['score']}
    for field in ('summary', 'text'):
        record[field] = get_lemmatized_tokens(x, field)
    return record

def to_tokens(x):
    """Build a new dict from sample `x`: 'review_id' and 'score' are copied,
    'summary' and 'text' become the token tuples returned by get_tokens()."""
    record = {copied: x[copied] for copied in ('review_id', 'score')}
    record['summary'] = get_tokens(x, 'summary')
    record['text'] = get_tokens(x, 'text')
    return record

def to_raw(x):
    """Return a new dict holding only the 'review_id', 'score', 'summary'
    and 'text' entries of sample `x`, unchanged."""
    kept_fields = ('review_id', 'score', 'summary', 'text')
    return {field: x[field] for field in kept_fields}

def write(ds, name):
    """Pickle the tuple (ds[0], ds[1], ds[2]) to the file at
    SAVE_PATH.format(name), opened in binary write mode."""
    with open(SAVE_PATH.format(name), 'wb') as out_file:
        # Only the first three elements of `ds` are persisted.
        payload = (ds[0], ds[1], ds[2])
        pickle.dump(payload, out_file)

#### Prepare datasets:
- raw
- lemmatized
- tokenized
- lemmatized tokenized
- sentences
- sentences tokenized

In [None]:
%%time
# One triple of empty lists per dataset variant; position j in each triple
# receives the samples from original[j].
raw = ([], [], [])
lem = ([], [], [])
tok = ([], [], [])
lem_tok = ([], [], [])
sent = ([], [], [])
sent_tok = ([], [], [])
original = (train, dev, test)

# (output container, reducer) pairs, applied to every sample in this order.
variants = (
    (raw, to_raw),
    (lem, to_lemmatized),
    (tok, to_tokens),
    (lem_tok, to_lemmatized_tokens),
    (sent, to_sents),
    (sent_tok, to_sents_tokenized),
)

for j, dataset in enumerate(original):
    for sample in tqdm(dataset):
        for container, reducer in variants:
            container[j].append(reducer(sample))

#### Persist

In [None]:
# (dataset variant, name) pairs; each name is substituted into SAVE_PATH
# when the variant is passed to write().
WRITE_PAIRS = [
    (raw, 'raw'),
    (lem, 'lem'),
    (tok, 'tok'),
    (lem_tok, 'lem_tok'),
    (sent, 'sent'),
    (sent_tok, 'sent_tok'),
]

In [None]:
%%time
# Pickle every dataset variant listed in WRITE_PAIRS to its own file.
for variant, suffix in WRITE_PAIRS:
    write(variant, suffix)