In [4]:
#!pip install --upgrade pip imbalanced-learn
#!pip install tensorflow-gpu==2.0.0-rc0 

#tensorboard --logdir /home/kvassay/data/z/log/E2/scalars/

In [5]:
import pickle
import os
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
from scipy.sparse import hstack
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as scikit_normalize

from evaluation import plot_history
from evaluation import rmse_report

from sampling import UnderSampler

%matplotlib inline
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [6]:
# Path template of the pickled dataset; `{}` is filled in with TYPE when loading.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
# Dataset variant substituted into DATASET (presumably lemmatized tokens -- TODO confirm).
TYPE = 'lem_tok'
# Parent directory for TensorBoard logs; every run writes into its own timestamped subfolder.
TB_LOG_DIR = '/home/kvassay/data/z/log/E2/scalars/'

## Load data

In [7]:
%%time
# NOTE: pickle.load can execute arbitrary code -- only load dataset files you produced yourself.
dataset_path = DATASET.format(TYPE)
with open(dataset_path, 'rb') as dataset_file:
    # The pickle holds a 3-tuple that is unpacked into the train / dev / test splits.
    train, dev, test = pickle.load(dataset_file)

CPU times: user 4.73 s, sys: 964 ms, total: 5.69 s
Wall time: 5.65 s


## Train TF-IDF model

In [8]:
def tf_train(dataset, key, **scikit_kwargs):
    """Fit a ``TfidfVectorizer`` on one field of every record in ``dataset``.

    Args:
        dataset: iterable of records indexable by ``key``; ``record[key]`` must be
            an iterable of strings (it is joined with single spaces into one document).
        key: name of the record field to build the corpus from.
        **scikit_kwargs: forwarded unchanged to the ``TfidfVectorizer`` constructor.

    Returns:
        The fitted vectorizer.
    """
    fitted = TfidfVectorizer(**scikit_kwargs)
    corpus = []
    for record in dataset:
        corpus.append(' '.join(record[key]))
    fitted.fit(corpus)
    return fitted

def tf_predict(vectorizer, dataset, key, apply_norm=True):
    """Vectorize one field of every record with an already fitted vectorizer.

    Args:
        vectorizer: fitted object exposing ``transform``.
        dataset: iterable of records; ``record[key]`` is joined with single spaces.
        key: name of the record field to vectorize.
        apply_norm: normalization is applied only when this is exactly ``True``
            (identity check, so truthy values such as ``1`` do not enable it).

    Returns:
        The transformed feature matrix, passed through ``scikit_normalize`` (called
        with its default arguments) when ``apply_norm`` is ``True``.
    """
    documents = [' '.join(record[key]) for record in dataset]
    matrix = vectorizer.transform(documents)
    if apply_norm is not True:
        return matrix
    return scikit_normalize(matrix)

def extract_features(dataset, vectorizer_summary, vectorizer_text):
    """Build one sparse feature matrix from the 'summary' and 'text' fields.

    Args:
        dataset: iterable of records containing 'summary' and 'text' entries.
        vectorizer_summary: fitted vectorizer applied to the 'summary' field.
        vectorizer_text: fitted vectorizer applied to the 'text' field.

    Returns:
        CSR matrix whose columns are the summary features followed by the text features.
    """
    parts = [
        tf_predict(vectorizer_summary, dataset, 'summary'),
        tf_predict(vectorizer_text, dataset, 'text'),
    ]
    # Column-wise concatenation: summary block first, text block second.
    return hstack(parts, format='csr')

In [9]:
%%time
# One vectorizer per field, both fitted on the training split only.
# Uni- and bigrams are used and IDF re-weighting is switched off (use_idf=False).
vectorizer_text = tf_train(
    train,
    'text',
    max_features=35000,
    ngram_range=(1, 2),
    max_df=0.99,
    lowercase=True,
    use_idf=False,
)
vectorizer_summary = tf_train(
    train,
    'summary',
    max_features=5000,
    ngram_range=(1, 2),
    max_df=0.99,
    lowercase=True,
    use_idf=False,
)

E0825 09:42:19.411099 139811937101632 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/IPython/core/magics/execution.py", line 1271, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 1, in <module>
  File "<ipython-input-8-7dd74c2cc02b>", line 3, in tf_train
    vectorizer.fit([' '.join(x[key]) for x in dataset])
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 1631, in fit
    X = super().fit_transform(raw_documents)
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 970, in _count_vocab
    for feature in analyze(doc):
  File "/home/kvassay/.virtualenvs/main/lib/python3.6/site-packages/sklearn/feature_extraction/text.py", line 352, in <lambda>
    tokenize(preprocess(self.dec

KeyboardInterrupt: 

## Extract features

In [10]:
%%time
# Sparse feature matrices for the train and dev splits (summary columns + text columns).
X_train = extract_features(train, vectorizer_summary, vectorizer_text)
X_dev = extract_features(dev, vectorizer_summary, vectorizer_text)
# Regression targets: the 'score' entry of every record.
y_train = np.array([sample['score'] for sample in train])
y_dev = np.array([sample['score'] for sample in dev])
print(
    'Train samples shape: {}, Dev samples shape: {}'.format(
        X_train.shape,
        X_dev.shape,
    )
)

NameError: name 'vectorizer_summary' is not defined

## Training

In [12]:
def get_tb_callback():
    """Create a TensorBoard callback that logs into a fresh, timestamped directory.

    The log directory is ``TB_LOG_DIR/<YYYYmmdd-HHMMSS>`` based on the current local
    time, so successive runs do not overwrite each other's logs.

    Returns:
        A ``K.callbacks.TensorBoard`` instance bound to that directory.
    """
    suffix = datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join(TB_LOG_DIR, suffix)
    # log_dir is already a complete path; no further os.path.join is needed.
    return K.callbacks.TensorBoard(log_dir=log_dir)

def train_model(batch_size,learning_rate, epochs, sampler_cls=UnderSampler):
    """Train a small feed-forward regressor on the notebook-level training data.

    Reads the globals ``X_train``, ``y_train``, ``X_dev`` and ``y_dev`` created in the
    feature-extraction cell.

    Args:
        batch_size: forwarded to ``sampler_cls`` as ``batch_size``.
        learning_rate: initial Adam learning rate; also used to derive the decay.
        epochs: number of training epochs (must be non-zero, it divides the decay).
        sampler_cls: callable invoked as ``sampler_cls(X_train, y_train, batch_size=...)``;
            its result is handed to Keras as the batch source (presumably a
            ``keras.utils.Sequence``/generator yielding dense batches -- confirm in
            ``sampling.py``).

    Returns:
        The fitted ``Sequential`` model.
    """
    tensorboard_callback = get_tb_callback()
    n_features = X_train.shape[1]
    model = K.models.Sequential([
        K.layers.Dense(100, activation='relu', input_shape=(n_features,)),
        K.layers.Dropout(0.2),
        # Only the first layer needs input_shape; later layers infer their input size.
        K.layers.Dense(20, activation='relu'),
        # Single linear unit: the network regresses the review score.
        K.layers.Dense(1, activation='linear'),
    ])
    # `learning_rate` is the current keyword (`lr` is a legacy alias).
    # The decay is scaled so that it is spread over the planned number of epochs.
    opt = K.optimizers.Adam(learning_rate=learning_rate,
                            decay=learning_rate / epochs,
                            amsgrad=True)
    model.compile(optimizer=opt, loss='mean_squared_error')
    sampler = sampler_cls(X_train, y_train, batch_size=batch_size)
    # NOTE(review): fit_generator is deprecated in later TensorFlow releases in favour of
    # Model.fit; it is kept here because the notebook targets tensorflow 2.0.0-rc0.
    model.fit_generator(sampler,
                        shuffle=False,
                        epochs=epochs,
                        # toarray() yields a plain ndarray (todense() returns np.matrix).
                        validation_data=(X_dev.toarray(), y_dev),
                        callbacks=[tensorboard_callback])
    return model

## Experiment

In [13]:
def experiment(sampling_cls,learning_rate,epochs,batch_size,name):
    """Train a model, then report and plot its performance on the dev split.

    Args:
        sampling_cls: sampler class forwarded to ``train_model`` as ``sampler_cls``.
        learning_rate: forwarded to ``train_model``.
        epochs: forwarded to ``train_model``.
        batch_size: forwarded to ``train_model``.
        name: label interpolated into the titles of the RMSE report and history plot.

    Returns:
        The trained model returned by ``train_model``.
    """
    model = train_model(sampler_cls=sampling_cls,
                        epochs=epochs,
                        batch_size=batch_size,
                        learning_rate=learning_rate)
    # Densify the sparse dev matrix before prediction, consistent with the dense
    # validation data used inside train_model.
    y_pred_dev = model.predict(X_dev.toarray())
    # NOTE(review): predictions have shape (n, 1) while y_dev is (n,); confirm that
    # rmse_report handles this without unintended broadcasting.
    rmse_report(y_dev, y_pred_dev, title='{} - RMSE report'.format(name))
    plot_history(model, title='{} - Train/Dev MSE'.format(name))
    return model

In [14]:
# Single training run with under-sampled batches; the result is kept in `model`
# for the summary and persistence cells below.
model = experiment(
    sampling_cls=UnderSampler,
    learning_rate=0.05,
    epochs=25,
    batch_size=300,
    name='TF-IDF model',
)

NameError: name 'X_train' is not defined

In [None]:
# Show the Keras summary of the model returned by the experiment run.
model.summary()

## Persist

In [None]:
# Store both fitted vectorizers as pickles so inference can reuse the exact vocabularies.
with open('/home/kvassay/data/z/models/E2/vectorizer_summary.pickle', 'wb') as summary_out:
    pickle.dump(vectorizer_summary, summary_out)
with open('/home/kvassay/data/z/models/E2/vectorizer_text.pickle', 'wb') as text_out:
    pickle.dump(vectorizer_text, text_out)
# Store the trained Keras regressor next to the vectorizers.
model.save('/home/kvassay/data/z/models/E2/keras_regressor.h5')