In [1]:
import pickle
import os
import re
import numpy as np
import spacy
import tensorflow as tf
import tensorflow.keras as K
from datetime import datetime
from sklearn.preprocessing import normalize as scikit_normalize
from evaluation import plot_history
from evaluation import rmse_report
from sampling import UnderSampler
from fasttext_embedding import FastTextEmbeddingBag
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [2]:
# Path template of the pickled dataset; the `{}` placeholder is filled with TYPE.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
# Dataset variant substituted into DATASET.
TYPE = 'tok'
# Root directory under which TensorBoard run logs are written.
TB_LOG_DIR = '/home/kvassay/data/z/log/E3/scalars/'
# Dimensionality used for the zero fallback vector and for picking the FastText model file.
VEC_DIM = 100
# FastText model file; the `{}` placeholder is filled with VEC_DIM.
FASTTEXT = '/home/kvassay/data/z/models/fasttext/sent_tok_lc_{}.bin'.format(VEC_DIM)
# Special character sequences that let a non-alphabetic token survive filtering.
ALLOWED_SPECIAL = (
    '?', '!', ':(', ':)', ':D', ':-)', ':-D', ':\'(', ':/', ':-/', '<3', ':-P', ':P',
)

## Read data

In [3]:
%%time
# The pickle holds three objects; only the first two are kept (as `train` and `dev`),
# the third is discarded.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open(DATASET.format(TYPE),'rb') as f:
    train,dev,_=pickle.load(f)

CPU times: user 4.6 s, sys: 1.14 s, total: 5.74 s
Wall time: 5.7 s


## Load models

In [4]:
%%time
# Instantiate the embedding wrapper from the model file configured in FASTTEXT.
fasttext=FastTextEmbeddingBag(FASTTEXT)




CPU times: user 2.28 s, sys: 997 ms, total: 3.28 s
Wall time: 3.02 s


## Preprocess text + extract features
- lower-case every token
- filter out EN stop-words (and, or, ...)
- filter out non-alphabetic tokens unless they contain an allowed special token (we want to keep smileys and `!`, `?`)

In [5]:
# Alternation of the escaped ALLOWED_SPECIAL sequences; matches any one of them literally.
rx_special = re.compile("|".join(map(re.escape, ALLOWED_SPECIAL)))

def word_filter(word):
    """Decide whether a token should be kept.

    A token is rejected when it is an English stop-word. Otherwise it is kept
    if it is purely alphabetic, or if it contains at least one of the allowed
    special sequences anywhere inside it (substring match via ``rx_special``).

    Returns:
        bool: True to keep the token, False to drop it.
    """
    if word in STOP_WORDS:
        return False
    # Alphabetic tokens always pass; anything else needs an allowed special sequence.
    return word.isalpha() or rx_special.search(word) is not None

def preprocess_text(text):
    """Lower-case every token of ``text`` and keep those accepted by ``word_filter``.

    Args:
        text: iterable of string tokens.

    Returns:
        list: the lower-cased tokens that passed the filter, in original order.
    """
    lowered = (token.lower() for token in text)
    return [token for token in lowered if word_filter(token)]

def preprocess_texts(dataset,text_keys=['summary','text']):
    """Apply ``preprocess_text`` to the given fields of every sample.

    Each sample is modified in place: the value stored under every key in
    ``text_keys`` is replaced by its preprocessed token list.

    Args:
        dataset: iterable of samples supporting item get/set by key.
        text_keys: keys of the fields to preprocess.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    for record in tqdm(dataset):
        for field in text_keys:
            record[field] = preprocess_text(record[field])
    return dataset

In [6]:
%%time
# preprocess_texts rewrites the samples in place and returns the same object,
# so after this cell `train` and `dev` hold filtered, lower-cased token lists.
train=preprocess_texts(train)
dev=preprocess_texts(dev)

100%|██████████| 551399/551399 [00:16<00:00, 33657.70it/s]
100%|██████████| 8527/8527 [00:00<00:00, 35343.43it/s]

CPU times: user 16.7 s, sys: 94.3 ms, total: 16.8 s
Wall time: 16.6 s





## Extract features

In [7]:
def extract_features(dataset, fasttext):
    """Turn every sample into one normalised feature row.

    For each sample the tokens of ``'summary'`` and ``'text'`` are embedded
    with ``fasttext.forward``, averaged per field, and the two averages are
    concatenated (summary first). A field with no tokens contributes the
    average of a single zero vector of length ``VEC_DIM``.

    Args:
        dataset: iterable of samples with ``'summary'`` and ``'text'`` token lists.
        fasttext: embedding object exposing ``forward(list_of_tokens)``.
            Assumes it yields one vector of length ``VEC_DIM`` per token - TODO confirm.

    Returns:
        The stacked feature rows after ``scikit_normalize`` with its default settings.
    """
    zero_vec = np.zeros(VEC_DIM, dtype=np.float32)

    def embed(tokens):
        # An empty token list has nothing to embed; fall back to one zero vector
        # so the mean below is still defined.
        if not tokens:
            return [zero_vec]
        return fasttext.forward([token.lower() for token in tokens])

    rows = []
    for sample in tqdm(dataset):
        summary_vecs = embed(sample['summary'])
        text_vecs = embed(sample['text'])
        summary_avg = np.mean(summary_vecs, axis=0)
        text_avg = np.mean(text_vecs, axis=0)
        rows.append(np.concatenate((summary_avg, text_avg), axis=0))
    return scikit_normalize(np.array(rows))

In [8]:
%%time
# Feature matrix for the training samples (one row per sample).
X_train=extract_features(train,fasttext)

100%|██████████| 551399/551399 [06:06<00:00, 1505.84it/s]


CPU times: user 6min 9s, sys: 11.2 s, total: 6min 20s
Wall time: 6min 7s


In [9]:
# Feature matrix for the dev samples (one row per sample).
# NOTE(review): the stored traceback under this cell comes from an earlier version
# that passed `train` instead of `dev`; re-run the cell to refresh the output.
X_dev=extract_features(dev,fasttext)

E0825 11:47:51.054325 139839678383936 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-ceda85e9a079>", line 1, in <module>
    X_dev=extract_features(train,fasttext)
  File "<ipython-input-7-bb5d5f835bbc>", line 10, in extract_features
    vecs_text = fasttext.forward([x.lower() for x in sample['text']])
  File "/home/kvassay/data/z/test/nb/fasttext_embedding.py", line 19, in forward
    _, subinds = self.model.get_subwords(word)
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/fasttext/FastText.py", line 104, in get_subwords
    pair = self.f.getSubwords(word, on_unicode_error)
TypeError: Unable to convert function return value to a Python type! The signature was
	(self: fasttext_pybind.fasttext, arg0: str, arg1: str) -> Tuple[List[str], List[int]]

During handling of the above exception, another exception occurred:

TypeError: Unable to convert function return value to a Python type! The signature was
	(self: fasttext_pybind.fasttext, arg0: str, arg1: str) -> Tuple[List[str], List[int]]

In [None]:
# Regression targets: the 'score' field of every sample, in dataset order.
y_train = np.array([sample['score'] for sample in train])
y_dev = np.array([sample['score'] for sample in dev])
# Shape check of the feature matrices.
shape_message = 'Train samples shape: {}, Dev samples shape: {}'
print(shape_message.format(X_train.shape, X_dev.shape))

## Train

In [None]:
def get_tb_callback():
    """Create a TensorBoard callback that logs to a fresh, timestamped run directory.

    The directory is ``TB_LOG_DIR`` joined with the current local time formatted
    as ``YYYYmmdd-HHMMSS``.

    Returns:
        A ``K.callbacks.TensorBoard`` instance configured with that log directory.
    """
    run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    return K.callbacks.TensorBoard(log_dir=os.path.join(TB_LOG_DIR, run_stamp))

def train_model(batch_size,learning_rate, epochs, sampler_cls=UnderSampler):
    """Build and fit a one-hidden-layer regression network on the global training data.

    The network is Dense(100, relu) -> Dense(1, linear), trained with Adam
    (AMSGrad enabled, decay = learning_rate / epochs) on mean squared error.
    Batches come from ``sampler_cls(X_train, y_train, batch_size=batch_size)``;
    the global ``X_dev`` / ``y_dev`` are used as validation data, and a
    TensorBoard callback from ``get_tb_callback`` records the run.

    Args:
        batch_size: batch size handed to the sampler.
        learning_rate: initial Adam learning rate.
        epochs: number of training epochs.
        sampler_cls: class used to build the batch source.
            Assumes it is accepted by ``fit_generator`` - TODO confirm.

    Returns:
        The fitted Keras model.
    """
    tensorboard_callback = get_tb_callback()
    model = K.models.Sequential([
        K.layers.Dense(100, activation='relu', input_shape=(X_train.shape[1],)),
        K.layers.Dense(1, activation='linear'),
    ])
    opt = K.optimizers.Adam(lr=learning_rate, decay=learning_rate/epochs, amsgrad=True)
    model.compile(optimizer=opt, loss='mean_squared_error')
    sampler = sampler_cls(X_train, y_train, batch_size=batch_size)
    # X_dev is the dense array produced by extract_features, so it is passed as is;
    # the former X_dev.todense() call is a sparse-matrix method that a NumPy array
    # does not have.
    model.fit_generator(sampler,
                        shuffle=False,
                        epochs=epochs,
                        validation_data=(X_dev, y_dev),
                        callbacks=[tensorboard_callback])
    return model

## Experiment

In [None]:
def experiment(sampling_cls,learning_rate,epochs,batch_size,name):
    """Train a model, then report its dev-set error and plot its history.

    Args:
        sampling_cls: sampler class forwarded to ``train_model`` as ``sampler_cls``.
        learning_rate: learning rate forwarded to ``train_model``.
        epochs: number of epochs forwarded to ``train_model``.
        batch_size: batch size forwarded to ``train_model``.
        name: label prepended to the report and plot titles.

    Returns:
        The trained model.
    """
    trained = train_model(
        batch_size=batch_size,
        learning_rate=learning_rate,
        epochs=epochs,
        sampler_cls=sampling_cls,
    )
    dev_predictions = trained.predict(X_dev)
    report_title = '{} - RMSE report'.format(name)
    rmse_report(y_dev, dev_predictions, title=report_title)
    history_title = '{} - Train/Dev MSE'.format(name)
    plot_history(trained, title=history_title)
    return trained

In [None]:
# The features are averaged FastText embeddings (see extract_features), so the
# report and plot titles are labelled accordingly instead of 'TF-IDF model'.
model=experiment(sampling_cls=UnderSampler,learning_rate=0.05, epochs=10,batch_size=256,name='FastText averaged-embeddings model')

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()