In [1]:
import pickle
import os
import re
import numpy as np
import spacy
import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datetime import datetime
from sklearn.preprocessing import normalize as scikit_normalize
from evaluation import plot_history
from evaluation import rmse_report
from sampling import UnderSampler3D
from fasttext_embedding import FastTextEmbeddingBag
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

%matplotlib inline
%load_ext tensorboard

In [2]:
# --- Run configuration -------------------------------------------------
# True  -> restore precomputed feature arrays from CHECKPOINT_DIR.
# False -> load fastText, preprocess the texts and recompute the features.
FROM_CHECKPOINT = True

# Path of the saved Keras model file that experiment() loads
# (despite the "_DIR" suffix this is a file, not a directory).
MODEL_SAVE_DIR = '/home/kvassay/data/z/models/E8/keras_cnn.h5'
# Per-epoch checkpoint path template (not referenced by the visible cells).
MODEL_CHECKPOINT_PATH = '/home/kvassay/data/z/models/E8/checkpoint/cp-{epoch:04d}.ckpt'

# Pickled dataset path template; the placeholder is filled with TYPE.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
TYPE = 'tok'
# TensorBoard log directory (not referenced by the visible cells).
TB_LOG_DIR = '/home/kvassay/data/z/log/E8/scalars/'

# Width of a single word vector; also part of the fastText model file name.
VEC_DIM = 100
FASTTEXT = '/home/kvassay/data/z/models/fasttext/cbow_{}_e{}_w{}.bin'.format(VEC_DIM, 50, 5)
# Number of word vectors every sample is padded / truncated to.
SEQ_PADDING = 50
# Directory holding the cached feature arrays for this (VEC_DIM, SEQ_PADDING) pair.
CHECKPOINT_DIR = '/tmp/z/checkpoint_dim{}_pad{}/'.format(VEC_DIM, SEQ_PADDING)
# Special tokens that make a non-alphabetic token worth keeping.
ALLOWED_SPECIAL = (
    '?', '!',
    ':(', ':)', ':D', ':-)', ':-D', ':\'(',
    ':/', ':-/', '<3', ':-P', ':P',
)

## Read data

In [3]:
%%time
# Load the pickled dataset selected by TYPE. The pickle is unpacked into
# three values; the first two are kept as `train` and `dev`, the third is
# discarded.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -
# only point DATASET at files from a trusted source.
with open(DATASET.format(TYPE),'rb') as f:
    train,dev,_=pickle.load(f)

CPU times: user 4.95 s, sys: 1.37 s, total: 6.31 s
Wall time: 6.54 s


## Load models

In [4]:
%%time
# The fastText model is only used to compute features from scratch, so it is
# loaded only when features are NOT restored from a checkpoint. When
# FROM_CHECKPOINT is True the name `fasttext` is never defined.
if not FROM_CHECKPOINT:
    fasttext=FastTextEmbeddingBag(FASTTEXT)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.29 µs


## Preprocess text + extract features
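- lowercase all tokens and filter out EN stop-words (and, or, ...)
- filter out alphabetic tokens shorter than 3 characters
- filter out non-alphabetic tokens unless they contain an allowed special token (we want to keep smileys and !, ?)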

In [5]:
# Regex that matches any of the ALLOWED_SPECIAL tokens as a literal substring
# (each token is escaped, then all are joined into one alternation).
rx_special= re.compile("|".join(re.escape(s) for s in ALLOWED_SPECIAL))

def word_filter(word):
    """Decide whether a single token should be kept.

    A token is rejected when it is a stop-word. Otherwise:
    - a purely alphabetic token is kept only if it has at least 3 characters;
    - any other token is kept only if it contains one of the allowed
      special tokens matched by ``rx_special``.

    Returns:
        bool: True to keep the token, False to drop it.
    """
    if word in STOP_WORDS:
        return False
    if word.isalpha():
        # Plain words: drop very short ones.
        return len(word) >= 3
    # Non-alphabetic tokens survive only when a smiley / '?' / '!' occurs in them.
    return bool(rx_special.search(word))

def preprocess_text(text):
    """Lowercase the tokens of ``text`` and keep those accepted by ``word_filter``.

    Args:
        text: iterable of string tokens.

    Returns:
        list: lowercased tokens that passed the filter, in their original order.
    """
    kept = []
    for token in text:
        lowered = token.lower()
        if word_filter(lowered):
            kept.append(lowered)
    return kept

def preprocess_texts(dataset,text_keys=('summary','text')):
    """Preprocess the text fields of every sample in ``dataset`` in place.

    For each sample and each key in ``text_keys`` the stored token list is
    replaced by ``preprocess_text(sample[key])``.

    Args:
        dataset: iterable of dict-like samples; each must contain every key
            listed in ``text_keys``.
        text_keys: names of the fields to preprocess. The default is an
            immutable tuple so it cannot be modified between calls.

    Returns:
        The same ``dataset`` object that was passed in (samples are mutated,
        not copied).
    """
    for sample in tqdm(dataset):
        for key in text_keys:
            sample[key]=preprocess_text(sample[key])
    return dataset

In [6]:
%%time
if not FROM_CHECKPOINT:
    train=preprocess_texts(train)
    dev=preprocess_texts(dev)

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs


## Extract features
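- concatenate the summary & text tokens of each review into a single token sequence
- transform every token to its fastText vector and normalize each vector
- pad / truncate each sequence to `SEQ_PADDING` vectors, so every review becomes a `SEQ_PADDING` x `VEC_DIM` matrix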

In [7]:
def pad_vecs(vecs_mtx,length):
    """Bring every sequence in ``vecs_mtx`` to exactly ``length`` steps.

    Thin wrapper around Keras ``pad_sequences`` with float32 output and
    'post' mode for both padding and truncating.

    Args:
        vecs_mtx: batch of sequences, passed to ``pad_sequences`` unchanged.
        length: target sequence length (``maxlen``).

    Returns:
        Whatever ``pad_sequences`` returns for these options.
    """
    padding_options = dict(
        maxlen=length,
        dtype='float32',
        padding='post',
        truncating='post',
    )
    return pad_sequences(vecs_mtx, **padding_options)

def extract_features(dataset, fasttext):
    """Turn every sample into a fixed-length sequence of normalized word vectors.

    For each sample the 'summary' and 'text' token lists are concatenated,
    embedded with ``fasttext.forward``, normalized with ``scikit_normalize``
    (default settings) and padded / truncated to ``SEQ_PADDING`` steps via
    ``pad_vecs``. A sample without any tokens is represented by a single
    all-zero vector before padding.

    Args:
        dataset: iterable of samples with 'summary' and 'text' token lists.
        fasttext: embedding object exposing ``forward(list_of_words)``.
            NOTE(review): assumed to return a 2-D array with one row per
            word and ``VEC_DIM`` columns - confirm against FastTextEmbeddingBag.

    Returns:
        Array of shape (n_samples, SEQ_PADDING, vector_dim). For an empty
        ``dataset`` an empty float32 array of shape (0, SEQ_PADDING, VEC_DIM)
        is returned instead of raising IndexError.
    """
    default_vec=np.zeros(VEC_DIM,dtype=np.float32)
    vecs_all=[]
    for sample in tqdm(dataset):
        all_words=sample['summary']+sample['text']
        if all_words:
            vecs=fasttext.forward(list(all_words))
        else:
            # No tokens survived preprocessing: fall back to one zero vector.
            vecs=np.array([default_vec])
        vecs=scikit_normalize(vecs)
        # pad_vecs works on a batch of sequences, so wrap this sample as a
        # batch of one and take the single padded sequence back out.
        padded=pad_vecs(vecs.reshape(1,vecs.shape[0],vecs.shape[1]),SEQ_PADDING)
        vecs_all.append(padded[0])
    if not vecs_all:
        # Nothing to stack; keep the rank callers expect.
        return np.zeros((0,SEQ_PADDING,VEC_DIM),dtype=np.float32)
    return np.stack(vecs_all)

In [8]:
def load_checkpoint():
    """Load the four cached arrays from ``CHECKPOINT_DIR``.

    Reads ``X_train.npy``, ``X_dev.npy``, ``y_train.npy`` and ``y_dev.npy``.
    Paths are built with ``os.path.join`` so the result does not depend on
    ``CHECKPOINT_DIR`` ending with a path separator.

    Returns:
        tuple: (X_train, X_dev, y_train, y_dev) as loaded by ``np.load``.

    Raises:
        FileNotFoundError: if any of the four files is missing.
    """
    arrays=[]
    for name in ('X_train','X_dev','y_train','y_dev'):
        with open(os.path.join(CHECKPOINT_DIR,name+'.npy'),'rb') as f:
            arrays.append(np.load(f))
    return tuple(arrays)
        
def checkpoint(X_train,X_dev,y_train,y_dev):
    """Save the four feature / target arrays under ``CHECKPOINT_DIR``.

    The directory is created when missing (``exist_ok=True`` avoids the
    check-then-create race). Files are written as ``X_train.npy``,
    ``X_dev.npy``, ``y_train.npy`` and ``y_dev.npy``; existing files are
    overwritten. Paths are built with ``os.path.join`` so the files always
    land inside ``CHECKPOINT_DIR``, with or without a trailing separator.

    Args:
        X_train, X_dev: feature arrays for the train / dev split.
        y_train, y_dev: target arrays for the train / dev split.
    """
    os.makedirs(CHECKPOINT_DIR,exist_ok=True)
    named_arrays=(
        ('X_train',X_train),
        ('X_dev',X_dev),
        ('y_train',y_train),
        ('y_dev',y_dev),
    )
    for name,array in named_arrays:
        with open(os.path.join(CHECKPOINT_DIR,name+'.npy'),'wb') as f:
            np.save(f,array)

In [9]:
%%time
# Build (or restore) the model inputs and targets.
# - fresh run: embed train/dev with fastText and use each sample's 'score'
#   as the regression target
# - checkpoint run: read the four arrays previously saved under CHECKPOINT_DIR
#   (no shapes are printed in this branch)
if not FROM_CHECKPOINT:
    X_train=extract_features(train,fasttext)
    X_dev=extract_features(dev,fasttext)
    y_train=np.array([x['score'] for x in train])
    y_dev=np.array([x['score'] for x in dev])
    print('Train samples shape: {}, Dev samples shape: {}'.format(X_train.shape,X_dev.shape))
else:
    X_train,X_dev,y_train,y_dev=load_checkpoint()

CPU times: user 258 ms, sys: 9.56 s, total: 9.82 s
Wall time: 1min 9s


In [10]:
%%time
# Persist freshly computed features so that later runs can set
# FROM_CHECKPOINT=True and skip fastText loading and feature extraction.
if not FROM_CHECKPOINT:
    checkpoint(X_train,X_dev,y_train,y_dev)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 4.53 µs


## Experiment

In [30]:
def penalized_loss(y_true, y_pred):
    """Mean of the squared error divided by the true value.

    Because each squared error is divided by ``y_true``, the same absolute
    error contributes more for small true values than for large ones.
    NOTE(review): ``y_true`` is used as a divisor, so it is presumably never
    zero - confirm against the score range of the dataset.
    """
    backend = K.backend
    squared_error = backend.square(backend.abs(y_true - y_pred))
    return backend.mean(squared_error / y_true)

def experiment():
    """Load the saved model, report its RMSE on the dev set and return it.

    The model is read from ``MODEL_SAVE_DIR`` with ``penalized_loss``
    registered as a custom object, used to predict ``X_dev``, and the
    predictions are passed to ``rmse_report`` together with ``y_dev``.

    Returns:
        The loaded Keras model.
    """
    custom_objects = {'penalized_loss': penalized_loss}
    trained = K.models.load_model(MODEL_SAVE_DIR, custom_objects=custom_objects)
    rmse_report(y_dev, trained.predict(X_dev), title='RMSE report')
    return trained

In [31]:
# Load the saved model and print its RMSE report for the dev set.
model=experiment()

0,1
RMSE (baseline ∀1.0),1.53
RMSE,0.763

0,1
Mean partial RMSE (baseline ∀1.0),2.0
Max partial RMSE (baseline ∀1.0),1.414
St.dev. partial RMSE (baseline ∀1.0),4.0
Mean partial RMSE,0.796
Max partial RMSE,0.101
St.dev. partial RMSE,0.93

0,1
RMSE,0.767
Mean partial RMSE,1.204
Max partial RMSE,3.07

Review Score,RMSE,RMSE baseline (∀1.0),Improvement over baseline
5.0,0.75,0.0,-0.75
4.0,0.664,1.0,0.336
3.0,0.741,2.0,1.259
2.0,0.93,3.0,2.07
1.0,0.896,4.0,3.104


In [32]:
# Show the layer-by-layer summary of the loaded model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 25, 256)           77056     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 8, 256)            0         
_________________________________________________________________
dropout (Dropout)            (None, 8, 256)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 8, 256)            1024      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 256)            196864    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 256)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1, 256)            0