## Sentiment Analyser

In [142]:
import pickle
import os
import numpy as np
import tensorflow.keras as K
from scipy.sparse import hstack
from sklearn.preprocessing import normalize as scikit_normalize
import spacy 
from spacy import displacy
from IPython.display import display,HTML,IFrame,clear_output
from ipywidgets import widgets, interact
%matplotlib inline

In [81]:
# Artifact locations. The defaults are the paths the notebook was authored with;
# set SENTIMENT_MODEL_FOLDER / SENTIMENT_DEMO_DATA in the environment to run the
# notebook elsewhere without editing this cell.
MODEL_FOLDER = os.environ.get('SENTIMENT_MODEL_FOLDER',
                              '/home/kvassay/data/z/models/best_model/')
DEMO_DATA = os.environ.get('SENTIMENT_DEMO_DATA',
                           '/home/kvassay/data/z/data/reviews_demo.pickle')

In [3]:
# Keys under which each review sample stores its two text fields.
KEY_SUMMARY = 'summary'
KEY_CONTENT = 'text'

# File names looked up inside the model folder.
VECTORIZER_SUMMARY_FNAME = 'vectorizer_summary.pickle'
VECTORIZER_CONTENT_FNAME = 'vectorizer_text.pickle'
KERAS_MODEL_FNAME = 'keras_regressor.h5'

# Name under which the custom loss is registered when the Keras model is loaded.
LOSS_F_NAME = 'penalized_loss'

def penalized_loss(y_true, y_pred):
    """Mean of the squared absolute error divided element-wise by ``y_true``.

    Computed entirely with Keras backend ops. Because every squared error is
    divided by its true value, the result is undefined where ``y_true`` is 0.
    """
    backend = K.backend
    abs_error = backend.abs(y_true - y_pred)
    scaled_sq_error = backend.square(abs_error) / y_true
    return backend.mean(scaled_sq_error)


class SentimentPredictionModel:
    """Rating regressor built from two pickled vectorizers and a saved Keras model.

    A dataset is an iterable of mappings; each mapping holds a sequence of
    tokens under ``KEY_SUMMARY`` and another under ``KEY_CONTENT``.
    """

    def __init__(self, model_folder):
        """Load both vectorizers and the Keras model from ``model_folder``.

        NOTE: ``pickle.load`` can execute arbitrary code contained in the file,
        so ``model_folder`` must only ever point at trusted artifacts.
        """
        with open(os.path.join(model_folder, VECTORIZER_SUMMARY_FNAME), 'rb') as f:
            self.vectorizer_summary = pickle.load(f)
        with open(os.path.join(model_folder, VECTORIZER_CONTENT_FNAME), 'rb') as f:
            self.vectorizer_text = pickle.load(f)
        # The saved model references the custom loss by name, so it has to be
        # supplied again at load time.
        self.model = K.models.load_model(
            os.path.join(model_folder, KERAS_MODEL_FNAME),
            custom_objects={LOSS_F_NAME: penalized_loss})

    @staticmethod
    def _tf_predict(vectorizer, dataset, key):
        """Vectorize the ``key`` field of every sample in ``dataset``.

        Each sample's tokens are joined with single spaces before being handed
        to ``vectorizer.transform``; the transform result is returned as is.
        """
        return vectorizer.transform([' '.join(sample[key]) for sample in dataset])

    def _extract_features(self, dataset):
        """Return normalized CSR features: summary columns followed by text columns."""
        summ_vecs = self._tf_predict(self.vectorizer_summary, dataset, KEY_SUMMARY)
        text_vecs = self._tf_predict(self.vectorizer_text, dataset, KEY_CONTENT)
        return scikit_normalize(hstack([summ_vecs, text_vecs], format='csr'))

    @staticmethod
    def _fix_ratings_over_limit(y_pred, cast_f=float):
        """Clamp every prediction to the closed range [1, 5], in place.

        Args:
            y_pred: NumPy array of predictions of any shape; it is modified in
                place and also returned.
            cast_f: Accepted for backward compatibility only. Values written
                into an ndarray always take the array's dtype, so the cast
                never influenced the stored result.

        Returns:
            The same ``y_pred`` object with values below 1 raised to 1 and
            values above 5 lowered to 5.
        """
        # Vectorized clamp; unlike a per-row ``if y_pred[i] > 5`` test this also
        # works when a row holds more than one value.
        np.clip(y_pred, 1, 5, out=y_pred)
        return y_pred

    def predict(self, dataset_tokenized, fix_overlimit=False,
                integer=False):
        """Predict a rating for every sample in ``dataset_tokenized``.

        Args:
            dataset_tokenized: Iterable of tokenized samples (see class docstring).
            fix_overlimit: When true, clamp predictions to [1, 5].
            integer: When true, round predictions to the nearest whole number
                (the array keeps its floating-point dtype).

        Returns:
            Whatever ``self.model.predict`` returns for the feature matrix,
            after the optional rounding and clamping.
        """
        features = self._extract_features(dataset_tokenized)
        # toarray() yields a plain ndarray; todense() would yield np.matrix.
        y_pred = self.model.predict(features.toarray())
        if integer:
            y_pred = np.rint(y_pred)
        if fix_overlimit:
            y_pred = self._fix_ratings_over_limit(
                y_pred, cast_f=int if integer else float)
        return y_pred

## Load models

#### Sentiment prediction model

In [51]:
%%time
# Build the prediction model; its constructor reads the two pickled vectorizers
# and the saved Keras model from MODEL_FOLDER.
model=SentimentPredictionModel(MODEL_FOLDER)

CPU times: user 2.41 s, sys: 255 ms, total: 2.67 s
Wall time: 2.64 s


#### spaCy NLP pipeline

In [52]:
%%time
# Load the 'en_core_web_lg' spaCy pipeline with its "ner" and "tagger" components disabled.
# NOTE(review): SentimentPredictor._preprocess_text reads token.lemma_ from this pipeline;
# confirm the lemmas produced without the tagger match the preprocessing the vectorizers
# were fitted on.
spacy_nlp=spacy.load('en_core_web_lg',disable=["ner","tagger"])

CPU times: user 5.86 s, sys: 545 ms, total: 6.4 s
Wall time: 6.37 s


## Sentiment predictor

In [78]:
class SentimentPredictor:
    """Turns raw review text into a rating via a spaCy pipeline and a rating model."""

    def __init__(self, spacy_nlp, model):
        """Store the rating model and the callable spaCy pipeline."""
        self.model = model
        self.nlp = spacy_nlp

    def _preprocess_text(self, text):
        """Return a tuple with the lemma of every token ``self.nlp`` yields for ``text``."""
        return tuple(str(token.lemma_) for token in self.nlp(text))

    def detect_sentiment(self, review_summary, review_content, binary=False, round_to_closest=True):
        """Predict the rating of a single review.

        Args:
            review_summary: Raw summary text of the review.
            review_content: Raw body text of the review.
            binary: When true, return 1.0 if the (clamped) rating is below 3
                and 0.0 otherwise, instead of the rating itself.
            round_to_closest: When true (default), round the raw model output
                to the nearest whole number before clamping. When false, the
                unrounded value is clamped and returned.

        Returns:
            A Python ``float``: the rating clamped to [1, 5], or the 1.0/0.0
            flag when ``binary`` is set.
        """
        sample = {
            KEY_SUMMARY: self._preprocess_text(review_summary),
            KEY_CONTENT: self._preprocess_text(review_content),
        }
        # The model returns one row per sample; take the single value of the
        # single row and normalise it to a plain float so the return type does
        # not depend on whether clamping happened.
        prediction = float(self.model.predict([sample])[0][0])
        if round_to_closest:
            prediction = float(np.rint(prediction))
        if prediction > 5:
            prediction = 5.0
        if prediction < 1:
            prediction = 1.0
        if binary:
            prediction = 1.0 if prediction < 3 else 0.0
        return prediction

In [98]:
# Combine the loaded spaCy pipeline and the rating model into the predictor used by the demo.
sentiment_predictor=SentimentPredictor(spacy_nlp,model)

## Demo

#### Read demo dataset

In [352]:
# Deserialize the demo reviews; predict_display looks entries up in `data` by review id.
# Unpickling can execute code embedded in the file, so DEMO_DATA must be a trusted file.
with open(DEMO_DATA, mode='rb') as demo_file:
    data = pickle.load(demo_file)

In [360]:
def to_spacy_format(sample, prediction):
    """Build a manual-mode displaCy ``ent`` document for one review.

    The document text consists of four consecutive segments: the predicted
    rating line, the true rating line, the review summary and the review text.
    Each segment becomes one entity span.

    Args:
        sample: Mapping with the keys ``'score'``, ``'summary'`` and ``'text'``.
        prediction: Predicted rating; anything ``int()`` accepts.

    Returns:
        ``{'text': <full text>, 'ents': [...]}`` where every entity is a dict
        with ``'start'``, ``'end'`` and ``'label'``. The two rating spans are
        labelled with the same whole number that is shown in their text; the
        summary span is labelled ``'S'`` and the text span ``'C'``.
    """
    # Labels are derived exactly like the displayed numbers, so any value the
    # title formatting accepts also gets a label (no separate lookup table that
    # can miss a key).
    predicted_stars = str(int(prediction))
    true_stars = str(int(sample['score']))

    segments = [
        ('Predicted rating: {}/5'.format(predicted_stars), predicted_stars),
        ('\n\nTrue rating: {}/5'.format(true_stars), true_stars),
        ('\n\n' + sample['summary'], 'S'),
        ('\n\n' + sample['text'], 'C'),
    ]

    ents = []
    offset = 0
    for segment, label in segments:
        end = offset + len(segment)
        ents.append({'start': offset, 'end': end, 'label': label})
        offset = end

    return {'text': ''.join(segment for segment, _ in segments), 'ents': ents}

## UI

In [361]:
def predict_display(review_id):
    """Predict the rating of one demo review and render it inline with displaCy.

    Looks the review up in the module-level ``data`` by ``review_id``, runs
    ``sentiment_predictor`` on its summary and text, converts the result with
    ``to_spacy_format`` and displays the rendered HTML. Returns ``None``.
    """
    # Background colour per entity label: rating labels '1'..'5' plus the
    # summary ('S') and content ('C') spans.
    entity_colors = {
        '5': '#3ADF00',
        '4': '#4B8A08',
        '3': '#FFFF00',
        '2': '#FAAC58',
        '1': '#FF0000',
        'S': '#F5F5F5',
        'C': '#F5F5F5',
    }
    review = data[review_id]
    predicted_rating = sentiment_predictor.detect_sentiment(review['summary'], review['text'])
    doc = to_spacy_format(review, predicted_rating)
    markup = displacy.render(doc,
                             manual=True,
                             style='ent',
                             options={'colors': entity_colors})
    display(HTML(markup))

In [370]:
# Demo: predict and render the review stored under id 55 in the demo data.
predict_display(55)

<IPython.core.display.HTML object>