In [1]:
#!pip install --upgrade pip imbalanced-learn
#!pip install tensorflow-gpu==2.0.0-rc0 

#tensorboard --logdir /home/kvassay/data/z/log/E2/scalars/

In [2]:
import pickle
import os
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
from scipy.sparse import hstack
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as scikit_normalize

from evaluation import plot_history
from evaluation import rmse_report

from sampling import UnderSampler

%matplotlib inline
%load_ext tensorboard

In [3]:
# Path template of the pickled dataset split; the `{}` slot is filled with TYPE.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
# Dataset variant to load (presumably lemmatized tokens -- confirm against the preprocessing step).
TYPE = 'lem_tok'
# Root directory under which each training run writes its own TensorBoard log folder.
TB_LOG_DIR = '/home/kvassay/data/z/log/E2/scalars/'

## Load data

In [4]:
%%time
# Load the pickled split; the file holds three parts and the third one is not used here.
# NOTE: pickle.load can execute arbitrary code -- only point DATASET at trusted files.
with open(DATASET.format(TYPE), mode='rb') as dataset_file:
    train, dev, _ = pickle.load(dataset_file)

CPU times: user 5.02 s, sys: 1.09 s, total: 6.11 s
Wall time: 6.07 s


## Train TF-IDF model

In [5]:
def tf_train(dataset,key, **scikit_kwargs):
    """Fit a TfidfVectorizer on one token field of the dataset.

    Args:
        dataset: iterable of mapping-like samples.
        key: name of the field holding the sample's sequence of string tokens.
        **scikit_kwargs: forwarded unchanged to the TfidfVectorizer constructor.

    Returns:
        The fitted vectorizer.
    """
    model = TfidfVectorizer(**scikit_kwargs)
    # The vectorizer works on raw strings, so tokens are re-joined with single spaces.
    corpus = [' '.join(sample[key]) for sample in dataset]
    model.fit(corpus)
    return model

def tf_predict(vectorizer,dataset,key):
    """Vectorize one token field of every sample with an already fitted vectorizer.

    Args:
        vectorizer: object exposing `transform(list_of_strings)`.
        dataset: iterable of mapping-like samples.
        key: name of the field holding the sample's sequence of string tokens.

    Returns:
        Whatever `vectorizer.transform` returns for the space-joined documents.
    """
    documents = [' '.join(sample[key]) for sample in dataset]
    return vectorizer.transform(documents)

def extract_features(dataset,vectorizer_summary,vectorizer_text):
    """Build the combined feature matrix for a dataset.

    The 'summary' and 'text' fields are vectorized separately, stacked
    side by side (summary columns first) into one CSR matrix, and then
    passed through scikit-learn's `normalize` with its default settings.

    Args:
        dataset: iterable of samples with 'summary' and 'text' token fields.
        vectorizer_summary: fitted vectorizer for the 'summary' field.
        vectorizer_text: fitted vectorizer for the 'text' field.

    Returns:
        The normalized sparse feature matrix.
    """
    feature_blocks = [
        tf_predict(vectorizer_summary, dataset, 'summary'),
        tf_predict(vectorizer_text, dataset, 'text'),
    ]
    stacked = hstack(feature_blocks, format='csr')
    return scikit_normalize(stacked)

In [6]:
%%time
# Both vectorizers use uni- and bigrams and, because use_idf=False, plain
# term-frequency weights (no inverse-document-frequency reweighting).
vectorizer_text = tf_train(
    train,
    'text',
    max_features=35000,
    ngram_range=(1, 2),
    max_df=0.99,
    lowercase=True,
    use_idf=False,
)
vectorizer_summary = tf_train(
    train,
    'summary',
    max_features=20000,
    ngram_range=(1, 2),
    max_df=0.99,
    lowercase=True,
    use_idf=False,
)

CPU times: user 1min 14s, sys: 1.71 s, total: 1min 16s
Wall time: 1min 16s


## Extract features

In [7]:
%%time
# Feature matrices for both splits, built with the vectorizers fitted on `train` only.
X_train = extract_features(train, vectorizer_summary, vectorizer_text)
X_dev = extract_features(dev, vectorizer_summary, vectorizer_text)

# Regression targets: the 'score' field of every sample.
y_train = np.array([sample['score'] for sample in train])
y_dev = np.array([sample['score'] for sample in dev])

shape_report = 'Train samples shape: {}, Dev samples shape: {}'.format(X_train.shape,X_dev.shape)
print(shape_report)

Train samples shape: (551399, 40000), Dev samples shape: (8527, 40000)
CPU times: user 1min 1s, sys: 1.56 s, total: 1min 2s
Wall time: 1min 2s


## Training

In [8]:
def get_tb_callback():
    """Create a TensorBoard callback that logs into a fresh, timestamped run directory.

    The directory is TB_LOG_DIR/<YYYYmmdd-HHMMSS>, stamped with the local
    time at which this function is called.

    Returns:
        A `K.callbacks.TensorBoard` instance configured with that log directory.
    """
    run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(TB_LOG_DIR, run_stamp)
    return K.callbacks.TensorBoard(log_dir=run_dir)

def train_model(batch_size,learning_rate, epochs, sampler_cls=UnderSampler):
    """Build and fit a single-hidden-layer regression network on the training features.

    Reads the module-level `X_train`, `y_train`, `X_dev` and `y_dev`.

    Args:
        batch_size: batch size handed to the sampler.
        learning_rate: initial Adam learning rate; also used to derive the decay.
        epochs: number of training epochs.
        sampler_cls: class instantiated as
            `sampler_cls(X_train, y_train, batch_size=batch_size)` and used as
            the source of training batches.

    Returns:
        The fitted Keras model.
    """
    tensorboard_callback = get_tb_callback()
    model = K.models.Sequential([
        K.layers.Dense(200,activation='relu', input_shape=(X_train.shape[1],)),
        K.layers.Dense(1,activation='linear'),
    ])
    # `learning_rate` is the canonical keyword; `lr` is only kept by Keras as a
    # deprecated alias.
    opt=K.optimizers.Adam(learning_rate=learning_rate,
                          decay=learning_rate/epochs,
                          amsgrad=True)
    model.compile(optimizer=opt, loss='mean_squared_error')
    sampler=sampler_cls(X_train,y_train,batch_size=batch_size)
    # Validation data must be dense. toarray() yields a regular ndarray, whereas
    # todense() returns the discouraged np.matrix type.
    X_dev_dense=X_dev.toarray()
    model.fit_generator(sampler,
                        shuffle=False,
                        epochs=epochs,
                        validation_data=(X_dev_dense,y_dev),
                        callbacks=[tensorboard_callback])
    return model

## Experiment

In [9]:
def experiment(sampling_cls,learning_rate,epochs,batch_size,name):
    """Train a model, then report its dev-set RMSE and plot its training history.

    Reads the module-level `X_dev` and `y_dev`.

    Args:
        sampling_cls: sampler class forwarded to `train_model`.
        learning_rate: initial learning rate forwarded to `train_model`.
        epochs: number of epochs forwarded to `train_model`.
        batch_size: batch size forwarded to `train_model`.
        name: label used as the prefix of the report and plot titles.

    Returns:
        The trained model.
    """
    model=train_model(sampler_cls=sampling_cls,epochs=epochs,batch_size=batch_size,learning_rate=learning_rate)
    # Predict on a dense array, matching how train_model feeds the dev set to
    # Keras for validation, instead of handing over the SciPy sparse matrix.
    y_pred_dev=model.predict(X_dev.toarray())
    # NOTE(review): the model ends in Dense(1), so predictions carry a trailing
    # axis of size 1 -- confirm rmse_report copes with that against y_dev.
    rmse_report(y_dev,y_pred_dev,title='{} - RMSE report'.format(name))
    plot_history(model,title='{} - Train/Dev MSE'.format(name))
    return model

In [10]:
# Run the experiment with under-sampling; the trained model is kept for the cells below.
model = experiment(
    sampling_cls=UnderSampler,
    learning_rate=0.05,
    epochs=20,
    batch_size=256,
    name='TF-IDF model',
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 83/564 [===>..........................] - ETA: 1:23 - loss: 0.3374

E0825 10:53:41.215569 140257521530688 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-3504531ba297>", line 1, in <module>
    model=experiment(sampling_cls=UnderSampler,learning_rate=0.05, epochs=20,batch_size=256,name='TF-IDF model')
  File "<ipython-input-9-ff2fd67314ed>", line 2, in experiment
    model=train_model(sampler_cls=sampling_cls,epochs=epochs,batch_size=batch_size,learning_rate=learning_rate)
  File "<ipython-input-8-74854d50f6e3>", line 19, in train_model
    callbacks=[tensorboard_callback])
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py", line 1303, in fit_generator
    steps_name='steps_per_epoch')
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_generator.py", line 265, in model_itera

KeyboardInterrupt: 

In [None]:
# Print the architecture of `model`; requires the experiment cell above to have finished.
model.summary()

## Persist

In [None]:
# Pickle both fitted vectorizers, then store the Keras model in HDF5 format.
for pickle_path, fitted_vectorizer in (
    ('/home/kvassay/data/z/models/E2/vectorizer_summary.pickle', vectorizer_summary),
    ('/home/kvassay/data/z/models/E2/vectorizer_text.pickle', vectorizer_text),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(fitted_vectorizer, out_file)

model.save('/home/kvassay/data/z/models/E2/keras_regressor.h5')