In [35]:
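# Dependency note: this notebook was set up with tensorflow-gpu==2.0.0-rc0.
# To install it into the kernel's environment, run in its own cell: %pip install tensorflow-gpu==2.0.0-rc0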

In [36]:
import pickle
import os
import numpy as np
import tensorflow.keras as K
from scipy.sparse import hstack
from sklearn.preprocessing import normalize as scikit_normalize
from evaluation import rmse_report
%matplotlib inline

In [37]:
# Notebook configuration. Every value can be overridden through an environment
# variable so the notebook runs on machines other than the original author's;
# when a variable is unset, the original hard-coded value is used unchanged.

# Path template of the pickled dataset; '{}' is filled in with TYPE.
DATASET=os.environ.get('SENTIMENT_DATASET',
                       '/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle')
# Dataset variant substituted into the DATASET template.
TYPE=os.environ.get('SENTIMENT_DATASET_TYPE','lem_tok')
# Folder holding the pickled vectorizers and the saved Keras model.
MODEL_FOLDER=os.environ.get('SENTIMENT_MODEL_FOLDER',
                            '/home/kvassay/data/z/models/best_model/')

## Load data

In [38]:
%%time
with open(DATASET.format(TYPE),'rb') as f:
    _,_,test_ds=pickle.load(f)

CPU times: user 7.98 s, sys: 1.22 s, total: 9.2 s
Wall time: 8.96 s


## Load model

In [39]:
def penalized_loss(y_true, y_pred):
    """Mean squared error in which each squared residual is divided by its true value.

    Args:
        y_true: tensor of target values. Used as the divisor, so this assumes
            the targets are never zero -- TODO confirm against the training data.
        y_pred: tensor of model predictions, compatible with ``y_true``.

    Returns:
        The backend mean of ``|y_true - y_pred|**2 / y_true``.
    """
    backend = K.backend
    residual = y_true - y_pred
    weighted_sq_error = backend.square(backend.abs(residual)) / y_true
    return backend.mean(weighted_sq_error)


class SentimentPredictionModel:
    """Predictor built from two stored vectorizers and a stored Keras model.

    Attributes set by ``__init__``:
        vectorizer_summary: object unpickled from 'vectorizer_summary.pickle';
            must expose ``transform(list_of_str)``.
        vectorizer_text: object unpickled from 'vectorizer_text.pickle';
            must expose ``transform(list_of_str)``.
        model: Keras model loaded from 'keras_regressor.h5'.
    """

    def __init__(self,model_folder):
        """Load both vectorizers and the Keras model from ``model_folder``.

        Args:
            model_folder: directory containing 'vectorizer_summary.pickle',
                'vectorizer_text.pickle' and 'keras_regressor.h5'.
        """
        summary_path = os.path.join(model_folder, 'vectorizer_summary.pickle')
        text_path = os.path.join(model_folder, 'vectorizer_text.pickle')
        regressor_path = os.path.join(model_folder, 'keras_regressor.h5')

        # NOTE: pickle.load can execute arbitrary code; model_folder must be trusted.
        with open(summary_path, 'rb') as summary_file:
            self.vectorizer_summary = pickle.load(summary_file)
        with open(text_path, 'rb') as text_file:
            self.vectorizer_text = pickle.load(text_file)

        # The saved model refers to the custom loss by name, so it has to be
        # supplied again when deserializing.
        self.model = K.models.load_model(
            regressor_path,
            custom_objects={'penalized_loss': penalized_loss},
        )

    @staticmethod
    def _tf_predict(vectorizer,dataset,key):
        """Vectorize one tokenized field of every record in ``dataset``.

        Args:
            vectorizer: object exposing ``transform(list_of_str)``.
            dataset: iterable of mappings; ``record[key]`` is an iterable of
                string tokens.
            key: name of the field to vectorize.

        Returns:
            Whatever ``vectorizer.transform`` returns for the list of
            space-joined token strings (one string per record).
        """
        documents = []
        for record in dataset:
            documents.append(' '.join(record[key]))
        return vectorizer.transform(documents)

    def _extract_features(self,dataset,key_summary,key_text):
        """Build the model input: summary and text features side by side.

        The two feature blocks are stacked horizontally (summary first) into a
        CSR matrix, which is then passed through ``scikit_normalize`` with its
        default settings.
        """
        feature_blocks = [
            self._tf_predict(self.vectorizer_summary, dataset, key_summary),
            self._tf_predict(self.vectorizer_text, dataset, key_text),
        ]
        stacked = hstack(feature_blocks, format='csr')
        return scikit_normalize(stacked)

    def predict(self, dataset_tokenized, key_summary='summary',key_text='text'):
        """Run the loaded model on a tokenized dataset.

        Args:
            dataset_tokenized: records whose ``key_summary`` and ``key_text``
                fields hold token sequences.
            key_summary: field name of the tokenized summary.
            key_text: field name of the tokenized review text.

        Returns:
            The output of ``self.model.predict`` on the extracted features.
        """
        features = self._extract_features(dataset_tokenized, key_summary, key_text)
        return self.model.predict(features)

In [40]:
%%time
sentiment_analyzer=SentimentPredictionModel(MODEL_FOLDER)

CPU times: user 1.34 s, sys: 116 ms, total: 1.45 s
Wall time: 1.43 s


## Evaluation

In [41]:
# Ground-truth scores, one per record and in the same order as test_ds.
y_test = np.array([review['score'] for review in test_ds])
# Show how many records will be evaluated.
y_test.shape[0]

8527

In [42]:
%%time
y_pred=sentiment_analyzer.predict(test_ds)

W0826 14:57:04.537681 140045144872768 training.py:510] Falling back from v2 loop because of error: Failed to find data adapter that can handle input: <class 'scipy.sparse.csr.csr_matrix'>, <class 'NoneType'>


In [45]:
# Compare the true scores with the predictions using the project's `rmse_report`
# helper (imported from the `evaluation` module); its tables are shown as the cell output.
rmse_report(y_test,y_pred)

0,1
RMSE (baseline ∀1.0),1.533
RMSE,0.646

0,1
Mean partial RMSE (baseline ∀1.0),2.0
Max partial RMSE (baseline ∀1.0),1.414
St.dev. partial RMSE (baseline ∀1.0),4.0
Mean partial RMSE,0.683
Max partial RMSE,0.065
St.dev. partial RMSE,0.755

0,1
RMSE,0.887
Mean partial RMSE,1.317
Max partial RMSE,3.245

Review Score,RMSE,RMSE baseline (∀1.0),Improvement over baseline
5.0,0.625,0.0,-0.625
4.0,0.586,1.0,0.414
3.0,0.721,2.0,1.279
2.0,0.727,3.0,2.273
1.0,0.755,4.0,3.245
