In [1]:
#!pip install tensorflow-gpu==2.0.0-rc0 

# Evaluation of selected best model - LEM-TF-SEQ-E7

####  Model notes
- achieved best Mean Partial RMSE (0.678) & was able to reach the lowest training loss (0.0261) => higher learning ability compared to other models.
- TF-IDF was disabled because analysis shows that frequent word are significant for sentiment analysis(words like great, awful etc.). Also IDF doens't account for imbalance in our dataset.
- Model is NN with 1 hidden layer
- Model uses concatenated TF vectors of summary and text (two separate vectorization models)
- All input texts are expected to be tokenized & lemmatized.
- During training balanced undersampling and penalized loss were applied. Penalized loss makes errors in predicting higher ratings less significant.

In [2]:
import pickle
import os
import numpy as np
import tensorflow.keras as K
from scipy.sparse import hstack
from sklearn.preprocessing import normalize as scikit_normalize
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from evaluation import rmse_report
%matplotlib inline

In [3]:
DATASET='/home/kvassay/data/z/data/reviews_train_test_dev1_{}.pickle'
TYPE='lem_tok'
MODEL_FOLDER='/home/kvassay/data/z/models/best_model/'

## Load data

In [4]:
%%time
with open(DATASET.format(TYPE),'rb') as f:
    _,_,test_ds=pickle.load(f)

CPU times: user 5.35 s, sys: 1.14 s, total: 6.48 s
Wall time: 6.47 s


## Load model

In [5]:
def penalized_loss(y_true, y_pred):
    return K.backend.mean(K.backend.square(K.backend.abs(y_true - y_pred))/y_true)


class SentimentPredictionModel:
    def __init__(self,model_folder):
        with open(os.path.join(model_folder, 'vectorizer_summary.pickle'),'rb') as f:
            self.vectorizer_summary=pickle.load(f)
        with open(os.path.join(model_folder,'vectorizer_text.pickle'), 'rb') as f:
            self.vectorizer_text=pickle.load(f)
        self.model = K.models.load_model(os.path.join(model_folder,'keras_regressor.h5'),
                                          custom_objects={'penalized_loss': penalized_loss})
        
    @staticmethod
    def _tf_predict(vectorizer,dataset,key):
        features=vectorizer.transform([' '.join(x[key]) for x in dataset])
        return features

    def _extract_features(self,dataset,key_summary,key_text):
        summ_vecs=self._tf_predict(self.vectorizer_summary,dataset, key_summary)
        text_vecs=self._tf_predict(self.vectorizer_text,dataset, key_text)
        return scikit_normalize(hstack([summ_vecs, text_vecs],format='csr'))

    @staticmethod
    def _fix_ratings_over_limit(y_pred,cast_f=float):
        for i in range(y_pred.shape[0]):
            # fix values over limit (>5, <1)
            if y_pred[i]>5:
                y_pred[i]=cast_f(5)
            if y_pred[i]<1:
                y_pred[i]=cast_f(1)
        return y_pred
    
    def predict(self, dataset_tokenized, key_summary='summary',key_text='text',fix_overlimit=False,
                integer=False):
        X_pred=self._extract_features(dataset_tokenized,key_summary,key_text)
        y_pred=self.model.predict(X_pred.todense())
        if integer:
            y_pred= np.rint(y_pred)
            if fix_overlimit:
                y_pred=self._fix_ratings_over_limit(y_pred,cast_f=int)
        else:
            if fix_overlimit:
                y_pred=self._fix_ratings_over_limit(y_pred,cast_f=float)
        return y_pred

In [6]:
%%time
sentiment_analyzer=SentimentPredictionModel(MODEL_FOLDER)

CPU times: user 1.91 s, sys: 396 ms, total: 2.3 s
Wall time: 2.32 s


# Evaluation

#### Prepare predictions

In [7]:
y_test=np.array([x['score'] for x in test_ds])
y_test_int=np.array([int(x) for x in y_test])
len(y_test)

8527

In [8]:
%%time
y_pred_raw=sentiment_analyzer.predict(test_ds,fix_overlimit=False)
y_pred_float=sentiment_analyzer.predict(test_ds,fix_overlimit=True)
y_pred_int=sentiment_analyzer.predict(test_ds,fix_overlimit=True,integer=True)

CPU times: user 21.6 s, sys: 14.1 s, total: 35.7 s
Wall time: 29.1 s


# A) - Raw output RMSE 
- we calculate RMSE scores from raw outputs of the model
- fair for comparison with development set evaluation scores
- model generalizes very well

In [9]:
rmse_report(y_test,y_pred_raw)

0,1
RMSE (baseline ∀1.0),1.533
RMSE,0.646

0,1
Mean partial RMSE (baseline ∀1.0),2.0
Max partial RMSE (baseline ∀1.0),1.414
St.dev. partial RMSE (baseline ∀1.0),4.0
Mean partial RMSE,0.683
Max partial RMSE,0.065
St.dev. partial RMSE,0.755

0,1
RMSE,0.887
Mean partial RMSE,1.317
Max partial RMSE,3.245

Review Score,RMSE,RMSE baseline (∀1.0),Improvement over baseline
5.0,0.625,0.0,-0.625
4.0,0.586,1.0,0.414
3.0,0.721,2.0,1.279
2.0,0.727,3.0,2.273
1.0,0.755,4.0,3.245


## B) - RMSE scores with overlimit correction
- values >5 and <1 will be replaced by 5 or 1
- evaluation of score for realistic usage

In [10]:
rmse_report(y_test,y_pred_float)

0,1
RMSE (baseline ∀1.0),1.533
RMSE,0.617

0,1
Mean partial RMSE (baseline ∀1.0),2.0
Max partial RMSE (baseline ∀1.0),1.414
St.dev. partial RMSE (baseline ∀1.0),4.0
Mean partial RMSE,0.664
Max partial RMSE,0.077
St.dev. partial RMSE,0.741

0,1
RMSE,0.916
Mean partial RMSE,1.336
Max partial RMSE,3.259

Review Score,RMSE,RMSE baseline (∀1.0),Improvement over baseline
5.0,0.588,0.0,-0.588
4.0,0.553,1.0,0.447
3.0,0.715,2.0,1.285
2.0,0.721,3.0,2.279
1.0,0.741,4.0,3.259


## C) - Evaluation of classification scores
- in case we wanted to use this model as Amazon review classifier
- not working so well
- close categories are easy to miss however, except for 1 and 5
- possibly human agreement also low

In [11]:
print(classification_report(y_test_int, y_pred_int))

              precision    recall  f1-score   support

           1       0.83      0.75      0.79       758
           2       0.47      0.60      0.53       461
           3       0.52      0.62      0.56       635
           4       0.44      0.69      0.54      1215
           5       0.94      0.79      0.86      5458

    accuracy                           0.75      8527
   macro avg       0.64      0.69      0.65      8527
weighted avg       0.80      0.75      0.77      8527



In [19]:
print(confusion_matrix(y_test_int,y_pred_int))

[[ 565  142   37   11    3]
 [  84  277   71   27    2]
 [  20  107  393   94   21]
 [   0   21  111  843  240]
 [  11   37  151  961 4298]]


## D) - Evaluation of classification scores - binary problem
- let's say we try to classify whether customer was satisfied
- rating 1/2 => satisfied, rating 3/4/5 => not satisfied
- sentiment analyser is pretty good at distinguishing between positive/negative
- we can see model is better at detecting negative sentiment (possibly caused by sampling/penalized loss function)

In [12]:
y_test_bin=np.array([1 if x >3 else 0 for x in y_test])
y_pred_bin=np.array([1 if x >3 else 0 for x in y_pred_raw])

In [13]:
print(classification_report(y_test_bin,y_pred_bin))

              precision    recall  f1-score   support

           0       0.90      0.75      0.82      1854
           1       0.93      0.98      0.96      6673

    accuracy                           0.93      8527
   macro avg       0.92      0.86      0.89      8527
weighted avg       0.93      0.93      0.93      8527

