In [1]:
from working_with_files import *
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')

## Данные 1: новости

In [12]:
data = pd.read_json('train.json', encoding='utf-8')

In [13]:
set(data.sentiment)

{'negative', 'neutral', 'positive'}

### Simple lexicon-based approach

Загрузка и предобработка тонального словаря

In [14]:
# source: https://github.com/dkulagin/kartaslov/tree/master/dataset/emo_dict
lexicon = pd.read_csv('emo_dict.csv', encoding='utf-8', sep=';')

In [15]:
# sentiment dict with exact values for pos & neg
d = {}
for idx, row in lexicon.iterrows():
    if row['tag'] != 'NEUT':
        d[row['term']] = row['value']

In [16]:
len(d)

11324

Лемматизация текстов корпуса

In [2]:
from nltk import word_tokenize
import pymorphy2
morph_analyzer = pymorphy2.MorphAnalyzer()

def lemmatize(text):
    try:
        words = word_tokenize(text)
        lemmas = [morph_analyzer.parse(word)[0].normal_form for word in words] 
    except TypeError:
        return ''
    return ' '.join(lemmas)

In [18]:
data['text_lemmatized'] = data['text'].apply(lemmatize)

Функция для подсчета тональности текста на основе тональности отдельных слов 

In [3]:
# function for detecting text's sentiment based on input sentiment dict
def detect_sentiment(text, sentiment_dict):
    words = text.split()
    score = 0
    for word in words:
        if word in sentiment_dict:
            score += sentiment_dict[word]
    if score > 1:
        return 'positive'
    elif score < -1:
        return 'negative'
    else:
        return 'neutral'

NB: Алгоритм работает более точно, если пограничными значениями для опредления сентимента являются -1 и 1, а не 0

In [20]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d)

In [4]:
from sklearn.metrics import classification_report

In [22]:
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.46      0.15      0.22      1434
     neutral       0.49      0.41      0.45      4034
    positive       0.40      0.63      0.49      2795

    accuracy                           0.44      8263
   macro avg       0.45      0.40      0.39      8263
weighted avg       0.45      0.44      0.42      8263



Подсчет сентмента текста с учетом нейтральных слов из словаря

In [23]:
# include neutral values in sentiment dict
d1 = {}
for idx, row in lexicon.iterrows():
    d1[row['term']] = row['value']

In [24]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d1)

In [25]:
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.54      0.04      0.07      1434
     neutral       0.45      0.14      0.21      4034
    positive       0.36      0.88      0.51      2795

    accuracy                           0.37      8263
   macro avg       0.45      0.35      0.26      8263
weighted avg       0.43      0.37      0.29      8263



Нейтральные слова лучше не учитывать

Замена точных значений сентимента в словаре на -1 и 1

In [26]:
# try sentiment dict w plain +/-1 values
d2 = {}
for idx, row in lexicon.iterrows():
    if row['tag'] == 'NGTV':
        d2[row['term']] = -1
    if row['tag'] == 'PSTV':
        d2[row['term']] = 1

In [27]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d2)

In [28]:
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.45      0.14      0.21      1434
     neutral       0.50      0.44      0.46      4034
    positive       0.40      0.62      0.49      2795

    accuracy                           0.45      8263
   macro avg       0.45      0.40      0.39      8263
weighted avg       0.46      0.45      0.43      8263



Использование значений -1 и 1 вместо точных значений дает лучший результат

Загрузка словаря SentiLex

In [5]:
# try ru_senti_lex sentiment dict (plain +/-1 values)
# source: http://www.labinform.ru/pub/rusentilex/rusentilex_2017.txt
d3 = {}
with open('ru_senti_lex.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    columns = line.split(',')
    if ' ' not in columns[0].strip():
        if columns[3].strip() == 'negative':
            d3[columns[0].strip()] = -1
        if columns[3].strip() == 'positive':
            d3[columns[0].strip()] = 1

In [30]:
len(d3)

10542

In [31]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d3)

In [32]:
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.40      0.31      0.35      1434
     neutral       0.49      0.64      0.56      4034
    positive       0.47      0.32      0.38      2795

    accuracy                           0.48      8263
   macro avg       0.45      0.42      0.43      8263
weighted avg       0.47      0.48      0.46      8263



Словарь SentiLex лучше первого

Загрузка и предобработка словаря из исследования http://www.dialog-21.ru/media/3402/kotelnikovevetal.pdf

In [6]:
# try sentiment dict from here: http://www.dialog-21.ru/media/3402/kotelnikovevetal.pdf
# dicts: https://drive.google.com/drive/u/0/folders/0B38i30_htmrTSm1mVXRZNnljNlk
# n annotators = 4, pos/neg only

dicts = []
domains = ['cars', 'movies', 'cameras', 'books', 'restaurants']
for domain in domains:
    d = {}
    with open(domain + '.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            columns = line.split('\t')
            if columns[1] == '4':
                d[columns[0]] = 'positive'
            if columns[2] == '4':
                d[columns[0]] = 'negative'
    dicts.append(d)

In [7]:
d4 = {}
all_words = []
for d in dicts:
    all_words.extend(list(d.keys()))
all_words = set(all_words)

for word in all_words:
    values = []
    for d in dicts:
        if word in d.keys():
            values.append(d[word])
    if len(set(values)) == 1:
        d4[word] = list(set(values))[0]
        
for word, sentiment in d4.items():
    if sentiment == 'positive':
        d4[word] = 1
    else:
        d4[word] = -1

In [35]:
len(d4)

1114

In [36]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d4)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.45      0.06      0.11      1434
     neutral       0.48      0.91      0.63      4034
    positive       0.39      0.06      0.11      2795

    accuracy                           0.48      8263
   macro avg       0.44      0.35      0.28      8263
weighted avg       0.44      0.48      0.36      8263



При простом словарном подходе лучший результат показывает словарь SentiLex (d3): accuracy -- 48, F1 -- 46

### Lexicon-based approach w/custom lexicon formed using word2vec

In [8]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
from preprocessing import *
from collections import Counter
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
# build vocabulary and train word2vec model
def build_w2v_model(documents, min_count):
    w2v_model = gensim.models.Word2Vec(documents, size=300, window=5, min_count=min_count)
    return w2v_model

# tokenize text into list of lists for building w2v model
def tokenize_text(text):
    stopwords = set(get_lexicon_from_file('ru_stopwords_extended.txt'))
    sentences = sent_tokenize(text)
    sentences_tokenized = []
    for sentence in sentences:
        words = [token for token in word_tokenize(sentence) if token.isalpha() and token.isascii() == False 
                 and len(token) > 2 and token not in stopwords]
        if words != []:
            sentences_tokenized.extend(words)
    return sentences_tokenized

# check if word in w2v-vocab
def check_if_word_in_vocab(model, word):
    return word in model.wv.vocab

# get semantic associates for a word in a list of w2v models       
def get_most_similar(word, topn, model):
    try:
        print(model.wv.most_similar(positive=word, topn=topn))
    except KeyError:
        print('\nNo such word in vocabulary\n')
        
def get_most_similar_list(model, word, topn):
    return list(dict(model.wv.most_similar(positive=word, topn=topn)).keys())

In [39]:
corpus = []
for idx, row in data.iterrows():
    corpus.append(tokenize_text(row['text_lemmatized']))
    
len(corpus)

8263

Построение word2vec-модели на текстах корпуса

In [40]:
w2v_model = build_w2v_model(corpus, 10)

Формирование расширенного тонального словаря на основе семантических ассоциатов каждого слова в изначальном словаре

Референсное исследование: https://cyberleninka.ru/article/n/analiz-primeneniya-distributivno-semanticheskih-modeley-dlya-popolneniya-slovarya-otsenochnoy-leksiki/viewer

Изначальный словарь -- самый маленький из использованных выше (1114 слов)

In [41]:
d5 = d4.copy()

for word in d4.keys():
    try: 
        associates = get_most_similar_list(w2v_model, word, 10)
        for associate in associates:
            d5[associate] = d5[word]
    except KeyError:
        pass   
    
len(d5) # +1638 new words

2752

Сентимент-анализ с использованием расширенного словаря

In [42]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d5)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.41      0.37      0.39      1434
     neutral       0.51      0.29      0.37      4034
    positive       0.42      0.69      0.52      2795

    accuracy                           0.44      8263
   macro avg       0.44      0.45      0.43      8263
weighted avg       0.46      0.44      0.42      8263



По сравнению с простым словарным подходом результат не стал лучше. F-мера для того же словаря повысилась (42 против 36), но по простой SentiLex все равно показывает более качественный результат -- 46

Расширение словаря не 10, а 5 ближайшими семантическими ассоциатами каждого слова

In [43]:
# 5 closest associates

d6 = d4.copy()
for word in d4.keys():
    try: 
        associates = get_most_similar_list(w2v_model, word, 5)
        for associate in associates:
            d6[associate] = d6[word]
    except KeyError:
        pass    
    
len(d6)

2030

In [44]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d6)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.39      0.31      0.34      1434
     neutral       0.51      0.41      0.45      4034
    positive       0.41      0.57      0.47      2795

    accuracy                           0.44      8263
   macro avg       0.43      0.43      0.42      8263
weighted avg       0.45      0.44      0.44      8263



Улучшить результат снова не удалось

Попробуем взять словарь SentiLex и расширить его

In [49]:
d7 = d3.copy()

# 10 closest associates
for word in d3.keys():
    try: 
        associates = get_most_similar_list(w2v_model, word, 10)
        for associate in associates:
            d7[associate] = d7[word]
    except KeyError:
        pass   
    
len(d7)

15626

In [50]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d7)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.34      0.57      0.43      1434
     neutral       0.48      0.09      0.16      4034
    positive       0.41      0.75      0.53      2795

    accuracy                           0.40      8263
   macro avg       0.41      0.47      0.37      8263
weighted avg       0.43      0.40      0.33      8263



F-мера упала, лучше брать словарь поменьше

Попробуем использовать word2vec-модель, обученную не на самих классифицируемых текстах, а на более крупном корпусе новостей (источник: RusVectores)

In [47]:
w2v_model1 = KeyedVectors.load_word2vec_format('russian_news_model_rusvectores.bin', binary=True)

Составление тонального словаря с предварительной очисткой слов от pos-тегов

In [48]:
pos_tags = []
for word in w2v_model1.wv.vocab:
    pos_tag = word.split('_')[1]
    pos_tags.append(pos_tag)
pos_tags = list(set(pos_tags))
pos_tags

['INTJ', 'VERB', 'PROPN', 'SYM', 'NUM', 'NOUN', 'ADV', 'ADJ', 'X']

In [51]:
d8 = d4.copy()

for word in d4.keys():
    for pos_tag in pos_tags:
        try: 
            associates = get_most_similar_list(w2v_model1, word + '_' + pos_tag, 10)
            for associate in associates:
                associate_clean = associate.split('_')[0]
                if associate_clean.isascii() == False and associate_clean.isalpha():
                    d8[associate_clean] = d8[word]
            break
        except KeyError:
            pass   
        
len(d8)

5644

In [53]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d8)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.24      0.24      0.24      1434
     neutral       0.48      0.69      0.56      4034
    positive       0.38      0.13      0.20      2795

    accuracy                           0.42      8263
   macro avg       0.37      0.35      0.33      8263
weighted avg       0.40      0.42      0.38      8263



Ощутимого улучшения результата нет (как и при использовании большего или меньшего количества семантических ассоциатов)

## Данные 2: отзывы о кино

In [14]:
data = pd.read_csv('reviews_lemmatized.csv')

In [15]:
len(data) # Корпус гораздо больше первого

46501

In [16]:
set(data.label)

{-1, 0, 1}

Преобразование числовых значений сентимента в строковые

In [17]:
def sentiment_num_to_word(sentiment):
    if sentiment == -1:
        return 'negative'
    if sentiment == 0:
        return 'neutral'
    if sentiment == 1:
        return 'positive'
    
data['sentiment'] = data['label'].apply(sentiment_num_to_word)

### Simple lexicon-based approach

Посмотрим на результаты сентимент-анализа с использованием разных словарей

SentiLex

In [59]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d3)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.19      0.35      0.25      4376
     neutral       0.14      0.37      0.20      5645
    positive       0.85      0.54      0.66     36480

    accuracy                           0.50     46501
   macro avg       0.39      0.42      0.37     46501
weighted avg       0.70      0.50      0.57     46501



Маленький словарь

In [60]:
data['sentiment_by_lex'] = data['text'].apply(detect_sentiment, sentiment_dict=d4)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.41      0.22      0.29      4376
     neutral       0.14      0.57      0.23      5645
    positive       0.87      0.51      0.64     36480

    accuracy                           0.49     46501
   macro avg       0.47      0.43      0.39     46501
weighted avg       0.74      0.49      0.56     46501



Результаты примерно одинаковые, у SentiLex чуть выше: accuracy -- 50, F-мера -- 57

### Lexicon-based approach w/custom lexicon

Обучение word2vec-модели на самом корпусе

In [30]:
corpus = []
for idx, row in data.iterrows():
    corpus.append(tokenize_text(row['text_lemmatized']))
    
len(corpus)

46501

In [20]:
w2v_model2 = build_w2v_model(corpus, 10)

Расширение тонального словаря, 10 семантических ассоциатов

In [22]:
d9 = d4.copy()
for word in d4.keys():
    try: 
        associates = get_most_similar_list(w2v_model2, word, 10)
        for associate in associates:
            d9[associate] = d9[word]
    except KeyError:
        pass   
len(d9)

5447

In [29]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d9)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.27      0.39      0.32      4376
     neutral       0.16      0.14      0.15      5645
    positive       0.84      0.81      0.83     36480

    accuracy                           0.69     46501
   macro avg       0.42      0.45      0.43     46501
weighted avg       0.70      0.69      0.70     46501



Результат заметно лучше. По сравнению с маленьким словарем F-мера выросла с 56 до 70, точность -- с 49 до 69 (у простого SentiLex F-мера была 57, точность -- 50)

Попробуем взять 5 семантических ассоциатов вместо 10

In [33]:
d10 = d4.copy()
for word in d4.keys():
    try: 
        associates = get_most_similar_list(w2v_model2, word, 5)
        for associate in associates:
            d10[associate] = d10[word]
    except KeyError:
        pass   
len(d10)

3556

In [34]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d10)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.29      0.41      0.34      4376
     neutral       0.16      0.18      0.17      5645
    positive       0.85      0.79      0.82     36480

    accuracy                           0.68     46501
   macro avg       0.43      0.46      0.44     46501
weighted avg       0.71      0.68      0.70     46501



С 10 результат чуть лучше

In [35]:
d11 = d4.copy()
for word in d4.keys():
    try: 
        associates = get_most_similar_list(w2v_model2, word, 15)
        for associate in associates:
            d11[associate] = d11[word]
    except KeyError:
        pass   
len(d11)

7019

15 ассоциатов

In [36]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d11)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.25      0.44      0.32      4376
     neutral       0.16      0.14      0.15      5645
    positive       0.84      0.79      0.82     36480

    accuracy                           0.68     46501
   macro avg       0.42      0.46      0.43     46501
weighted avg       0.71      0.68      0.69     46501



Оптимальное число ассоциатов -- 10

Возьмем модель из похожего домена с RusVectores -- web_mystem_skipgram_500_2_2015

Источник: https://rusvectores.org/ru/models/

In [37]:
w2v_model3 = KeyedVectors.load_word2vec_format('web.bin', binary=True)

In [41]:
pos_tags = []
for word in w2v_model3.wv.vocab:
    pos_tag = word.split('_')[1]
    pos_tags.append(pos_tag)
pos_tags = list(set(pos_tags))

d12 = d4.copy()

for word in d4.keys():
    for pos_tag in pos_tags:
        try: 
            associates = get_most_similar_list(w2v_model3, word + '_' + pos_tag, 10)
            for associate in associates:
                associate_clean = associate.split('_')[0]
                if associate_clean.isascii() == False and associate_clean.isalpha():
                    d12[associate_clean] = d12[word]
            break
        except KeyError:
            pass   
        
len(d12)

5496

In [42]:
data['sentiment_by_lex'] = data['text_lemmatized'].apply(detect_sentiment, sentiment_dict=d12)
print(classification_report(data.sentiment, data.sentiment_by_lex))

              precision    recall  f1-score   support

    negative       0.18      0.62      0.28      4376
     neutral       0.13      0.14      0.14      5645
    positive       0.87      0.60      0.71     36480

    accuracy                           0.55     46501
   macro avg       0.39      0.45      0.37     46501
weighted avg       0.71      0.55      0.60     46501



Обучение модели на самом корпусе дает лучшие результаты