In [1]:
import pickle as pickle
import lightgbm as lgb
import emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.sparse import hstack, csr_matrix, vstack
import numpy as np
# Shared VADER analyzer; sentiment_emojis() calls its polarity_scores() on each emoji character.
analyzer_emoji = SentimentIntensityAnalyzer()

### Load trained model

In [2]:
print('Loading tfidf model...')
# NOTE(security): unpickling can execute arbitrary code, so only load a
# vectorizer file that comes from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open("./vectorizer.pk", "rb") as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)

Loading tfidf model...


In [3]:
# Restore the trained LightGBM booster from its saved model file.
print("Loading model sentiment to predict...")
sentiment_model = lgb.Booster(model_file="./model_gbm_sentiment.txt")

Loading model sentiment to predict...


### Utilities for extracting emoji sentiment

In [4]:
def extract_emojis(str):
    """Return the emoji characters found in ``str``, in order of appearance.

    The text is scanned one character at a time, so an emoji made of several
    code points is never returned as a single item.

    The lookup adapts to the installed ``emoji`` package:

    * ``emoji.is_emoji`` is used when the package provides it;
    * otherwise ``emoji.UNICODE_EMOJI`` is used. When that table is keyed by
      language code (it then has an ``'en'`` entry holding a dict), the English
      sub-table is used; a plain ``in`` test against the outer table would
      never match a single character.

    Args:
        str: Text to scan. The name shadows the built-in ``str`` and is kept
            only so existing keyword callers keep working.

    Returns:
        list: The matching characters, duplicates included.
    """
    if hasattr(emoji, 'is_emoji'):
        return [c for c in str if emoji.is_emoji(c)]

    known = emoji.UNICODE_EMOJI
    english = known.get('en')
    if isinstance(english, dict):
        # Per-language layout: the actual emoji table is one level down.
        known = english
    return [c for c in str if c in known]
def sentiment_emojis(sentence):
    """Average the VADER polarity scores of every emoji in ``sentence``.

    Args:
        sentence: Text whose emojis are scored.

    Returns:
        list: ``[neg, neu, pos, compound]`` averaged over all emojis returned
        by ``extract_emojis``; ``[0, 0, 0, 0]`` when there are none.
    """
    score_keys = ('neg', 'neu', 'pos', 'compound')
    found = extract_emojis(sentence)
    totals = [0, 0, 0, 0]
    if not found:
        return totals

    for symbol in found:
        scores = analyzer_emoji.polarity_scores(symbol)
        # Running per-key totals, added one emoji at a time.
        for idx, key in enumerate(score_keys):
            totals[idx] = totals[idx] + scores[key]

    count = len(found)
    return [total / count for total in totals]
def sentiment_emojis_row(row):
    """Attach the emoji sentiment of ``row['comment']`` to ``row``.

    The row is modified in place: the keys ``emoji_neg``, ``emoji_neu``,
    ``emoji_pos`` and ``emoji_compound`` are set from the four values
    returned by ``sentiment_emojis``.

    Args:
        row: Mapping-like object with a ``'comment'`` entry that supports
            item assignment.

    Returns:
        The same ``row`` object, after the four keys have been written.
    """
    scores = sentiment_emojis(row['comment'])
    columns = ('emoji_neg', 'emoji_neu', 'emoji_pos', 'emoji_compound')
    for idx, column in enumerate(columns):
        row[column] = scores[idx]
    return row

### Clean input

In [5]:
def clean_input(input_str):
    """Lower-case ``input_str``, substituting a single space for empty input.

    Args:
        input_str: Raw text.

    Returns:
        str: ``' '`` when ``input_str`` has length zero, otherwise the
        lower-cased text.
    """
    text = ' ' if len(input_str) == 0 else input_str
    return text.lower()

### Compute statistical features

In [6]:
def get_statistic_feature(input_str):
    """Build the hand-crafted feature vector for one comment.

    Features, in order: number of words, number of unique words, unique words
    as a percentage of all words, then the four averaged emoji sentiment
    scores (neg, neu, pos, compound) from ``sentiment_emojis``.

    Args:
        input_str: Comment text; words are separated by whitespace.

    Returns:
        numpy.ndarray: 1-D array with the seven features.
    """
    words = input_str.split()
    # Number of words in the comment
    num_words = len(words)
    # Number of distinct words in the comment
    num_unique_words = len(set(words))
    # Share of unique words, in percent. A comment with no words (empty or
    # whitespace-only) would divide by zero, so the ratio falls back to 0.
    # NOTE(review): 0 is a chosen fallback - confirm it matches how such
    # comments were handled when the model was trained.
    if num_words:
        words_vs_unique = num_unique_words / num_words * 100
    else:
        words_vs_unique = 0.0
    # Emoji sentiment features
    emoji_neg, emoji_neu, emoji_pos, emoji_compound = sentiment_emojis(input_str)
    return np.array([num_words, num_unique_words, words_vs_unique, emoji_neg, emoji_neu, emoji_pos, emoji_compound])

### TF-IDF vector

In [7]:
def sent2vec(input_str):
    """Vectorize one sentence with the loaded ``tfidf`` model.

    The sentence is wrapped in a one-element list for ``tfidf.transform`` and
    the first entry of the result is returned.
    """
    transformed = tfidf.transform([input_str])
    return transformed[0]

### Get sentence features

In [8]:
def sent2features(input_str):
    """Assemble the full feature row for one sentence.

    The output of ``sent2vec`` is placed first and the statistic features from
    ``get_statistic_feature`` are appended to its right; the stacked result is
    returned in CSR format.
    """
    text_part = sent2vec(input_str)
    stats_part = csr_matrix(get_statistic_feature(input_str))
    stacked = hstack([text_part, stats_part])
    return stacked.tocsr()

### Run inference on sample sentences

In [9]:
# Sample comments (Vietnamese) used to smoke-test the loaded model.
list_input = [
    '😀 tốt quá',
    'thấy không được',
    'quá tệ',
    'tạm ổn',
]
# One predict() call per sentence; each entry of `result` is whatever
# sentiment_model.predict returns for that single feature row.
result = list(
    map(lambda sen: sentiment_model.predict(sent2features(sen)), list_input)
)

In [11]:
# Print one line per sentence: a marker, the score, and the text.
# Scores below 0.5 get '=))'; everything else gets '=(('.
for str_input, sentiment in zip(list_input, result):
    # Each prediction is made on a single row, so take its single value
    # explicitly instead of relying on implicit conversion of a one-element
    # array to a scalar, which NumPy has deprecated.
    score = float(np.ravel(sentiment)[0])
    print('=))' if score < 0.5 else '=((', "%.2f" % score, str_input)

=)) 0.05 😀 tốt quá
=(( 0.96 thấy không được
=(( 0.97 quá tệ
=)) 0.26 tạm ổn
