In [1]:
import conllu, os, re, json
import xml.etree.ElementTree as et
import pandas as pd

from ufal.udpipe import Model, Pipeline
from gensim.models import KeyedVectors
from collections import defaultdict

In [138]:
def dump_json(obj, f):
    """Write ``obj`` as JSON to the file at path ``f``.

    The file is opened for writing as UTF-8 and non-ASCII characters are
    stored verbatim (``ensure_ascii=False``) rather than as ``\\uXXXX`` escapes.
    """
    with open(f, mode='w', encoding='utf-8') as json_file:
        json.dump(obj, json_file, ensure_ascii=False)

## 1. Extracting data from the corpus

In [3]:
def get_tonal_markup(wordlist1, bigram_list1, trigram_list1, wordlist2, bigram_list2, trigram_list2, aspects, conllu_folder):
    """Auto-annotate every CoNLL-U file in ``conllu_folder`` using two aspect lexicons.

    For each token position the lemma trigram, bigram and unigram starting at
    that token are looked up, in that order, first in the lexicons of
    ``aspects[0]`` and then in the lexicons of ``aspects[1]``. Only the first
    hit per position is recorded, as a tab-separated line::

        sentence_id <TAB> token_id[,last_token_id] <TAB> aspect <TAB> score

    Sentence and token ids are 1-based. One output file per input file is
    written to ``<conllu_folder>_auto_processed/<name>_auto_processed.tsv``,
    even when nothing matched.

    Args:
        wordlist1, bigram_list1, trigram_list1: mappings n-gram -> score for ``aspects[0]``.
        wordlist2, bigram_list2, trigram_list2: mappings n-gram -> score for ``aspects[1]``.
        aspects: sequence of aspect labels; an index is only read when a
            lexicon of that aspect matches.
        conllu_folder: path of the folder containing the CoNLL-U files.
    """
    new_path = conllu_folder + '_auto_processed'
    # exist_ok avoids the check-then-create race and keeps the call re-runnable.
    os.makedirs(new_path, exist_ok=True)

    for file in os.listdir(conllu_folder):
        path = os.path.join(conllu_folder, file)
        if not os.path.isfile(path):
            # Sub-folders cannot be parsed as CoNLL-U.
            continue

        # Context manager closes the handle (the original leaked it).
        with open(path, 'r', encoding='utf-8') as conllu_file:
            parsed = conllu.parse(conllu_file.read())

        outp = []
        for sent_id, sent in enumerate(parsed, start=1):
            lemmas = [token['lemma'] for token in sent]
            n_tokens = len(lemmas)
            for t_id, lemma in enumerate(lemmas, start=1):
                # Past the end of the sentence the n-gram is padded with empty strings.
                second = lemmas[t_id] if t_id < n_tokens else ''
                third = lemmas[t_id + 1] if t_id + 1 < n_tokens else ''
                bigram = lemma + ' ' + second
                trigram = bigram + ' ' + third

                # Order matters: the first lexicon containing its key wins.
                candidates = (
                    (trigram, trigram_list1, 2, 0),
                    (bigram, bigram_list1, 1, 0),
                    (lemma, wordlist1, 0, 0),
                    (trigram, trigram_list2, 2, 1),
                    (bigram, bigram_list2, 1, 1),
                    (lemma, wordlist2, 0, 1),
                )
                for key, lexicon, extra_tokens, aspect_idx in candidates:
                    if key in lexicon:
                        position = str(t_id)
                        if extra_tokens:
                            position += ',' + str(t_id + extra_tokens)
                        outp.append('\t'.join((str(sent_id), position,
                                               aspects[aspect_idx], str(lexicon[key]))))
                        break

        # splitext keeps names without a dot intact; slicing at rfind('.') == -1
        # used to drop their last character.
        out_name = os.path.splitext(file)[0] + "_auto_processed.tsv"
        with open(os.path.join(new_path, out_name), 'w', encoding='utf-8') as file_to_write:
            file_to_write.writelines(line + '\n' for line in outp)

In [140]:
# Read development/SentiRuEval_rest_train.xml into a DataFrame indexed by review id.
columns = ['id', 'food', 'service', 'text']
xtree = et.parse("development/SentiRuEval_rest_train.xml")
root = xtree.getroot()

data = []
for review in root:
    text_id = int(review.attrib['id'])
    scores = review.find('scores')
    # The <food> and <service> children of <scores> are read as integers.
    food, service = (int(scores.find(tag).text) for tag in ('food', 'service'))
    text = review.find('text').text
    data.append(dict(zip(columns, (text_id, food, service, text))))

df = pd.DataFrame(data, columns=columns).set_index('id')

In [141]:
# Load the UDPipe model from a path relative to the current working directory.
# NOTE(review): ufal.udpipe's Model.load is documented to return None when the file
# cannot be loaded — nothing here checks for that; confirm the path before running.
udpipe_model = Model.load('udpipe_models/russian-syntagrus-ud-2.0-170801.udpipe')
# Pipeline configured with the 'tokenize' input format, two DEFAULT option slots
# (presumably tagger and parser — confirm against the UDPipe docs) and 'conllu' output.
pipeline = Pipeline(udpipe_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

In [142]:
def udpipe_lemmatize(text, ud_pipeline=pipeline):
    """Lemmatize ``text`` and return one list of lemmas per sentence.

    Args:
        text: raw text passed to ``ud_pipeline.process``.
        ud_pipeline: object whose ``process(text)`` returns CoNLL-U output;
            defaults to the module-level ``pipeline``.

    Returns:
        A list of sentences, each a list of token lemmas, with tokens whose
        ``upostag`` is ``'PUNCT'`` removed.
    """
    # Use the pipeline that was passed in; the original ignored the argument
    # and always called the global ``pipeline``.
    processed = conllu.parse(ud_pipeline.process(text))
    return [
        [token['lemma'] for token in sent if token['upostag'] != 'PUNCT']
        for sent in processed
    ]

In [9]:
def _read_word_scores(path, sep):
    """Read an ``aspect<sep>word<sep>score`` file into a ``{word: score}`` dict.

    Scores are kept as the strings found in the file (no numeric conversion).
    Blank lines are skipped; previously a trailing empty line broke the
    three-way unpacking with a ValueError.
    """
    scores = dict()
    with open(path, 'r', encoding='utf-8') as inp:
        for line in inp:
            line = line.strip()
            if not line:
                continue
            _aspect, word, score = line.split(sep)
            scores[word] = score
    return scores


# NOTE(review): the Food file is split on eight literal spaces while the Service
# file is split on a tab — possibly a tab expanded during export; confirm against the data.
food_words = _read_word_scores('development/Food_words.txt', '        ')
service_words = _read_word_scores('development/Service_words.txt', '\t')

In [10]:
# Build aspect lexicons from the manual markup: each .tsv line points at a token
# span (sentence id, token ids, aspect, mark) inside the CoNLL-U file of the same name.
new_food_words = dict()
new_service_words = dict()

for file in os.listdir('разметка_финал'):
    if not file.endswith('.tsv'):
        continue
    print(file)

    with open(os.path.join('разметка_финал', file), 'r', encoding='utf-8') as inp:
        lines = inp.readlines()

    conllu_path = os.path.join('conllu_data', file)
    with open(conllu_path, 'r', encoding='utf-8') as inp:
        conll = conllu.parse(inp.read())

    for line in lines:
        line = line.strip()
        if not line:
            # readlines() keeps '\n', so the old `if line:` never filtered blank lines.
            continue
        sent_id, token_ids, aspect, mark = line.split('\t')
        mark = int(mark)
        sent_id = int(sent_id) - 1  # markup ids are 1-based
        token_ids = [int(i) for i in token_ids.split(',')]
        start_id, end_id = token_ids[0] - 1, token_ids[-1]
        try:
            tokens = ' '.join(token['lemma'] for token in conll[sent_id][start_id:end_id])
        except Exception:
            # Report the bad reference and skip it; the old bare except fell
            # through and used an undefined or stale `tokens`.
            print(sent_id, start_id, end_id)
            continue
        aspect = aspect.strip()
        # The first mark seen for an n-gram is kept.
        if aspect == 'Service' and tokens not in new_service_words:
            new_service_words[tokens] = mark
        elif aspect == 'Food' and tokens not in new_food_words:
            new_food_words[tokens] = mark

12943.tsv
13823.tsv
20086.tsv
28083.tsv
32840.tsv
32856.tsv
33591.tsv
33693.tsv
35486.tsv
5648.tsv


Посмотрим на пересечения выделенных нами словарей:

In [10]:
set(food_words) & set(new_food_words), set(service_words) & set(new_service_words),

({'большой',
  'вкусный',
  'невкусный',
  'отличный',
  'понравиться',
  'прекрасный',
  'сытный'},
 {'вежливый', 'ненавязчивый', 'приятный', 'хамоватый', 'хороший'})

И на их объединение:

In [11]:
set(food_words)|set(new_food_words), set(service_words)|set(new_service_words)

({'10 балл',
  'большой',
  'великолепный',
  'весь остыть',
  'вкусно',
  'вкусный',
  'впечатлять',
  'высокий все похвасть',
  'горячий',
  'достойный',
  'единственный',
  'интересный',
  'не впечатлять',
  'не очень дорого',
  'невкусный',
  'нежный',
  'необычный',
  'отличный',
  'очень большой',
  'очень вкусный',
  'плохой',
  'понравиться',
  'посредственно',
  'прекрасный',
  'приятный',
  'различный',
  'разнообразный',
  'разнообразный вкусный',
  'расстроить',
  'свежий',
  'совершенно отвратительный',
  'странный',
  'сытный',
  'хороший'},
 {'вежливый',
  'веселый',
  'внимательный',
  'вполне приемлимый',
  'высокий качество',
  'гостеприимный',
  'доброжелательный',
  'дружелюбный',
  'душевный',
  'качественный',
  'красивый',
  'милый',
  'не слишком вежливо',
  'недолгий',
  'ненавязчивый',
  'оперативность',
  'оперативный',
  'отзывчивый',
  'отличный',
  'очень аккуратно',
  'очень приветливый',
  'плохо знать меню',
  'понравиться',
  'приветливый',
  'приятный

Извлечём 1-, 2- и 3-граммы из development-корпуса — получим неразмеченные (unlabeled) 1-, 2- и 3-граммы:

In [143]:
def extract_123grams(texts, min_freq=5, process=udpipe_lemmatize):
    """Count lemma uni-, bi- and trigrams over ``texts``.

    Args:
        texts: iterable of raw texts; entries that are not non-empty strings are skipped.
        min_freq: n-grams are kept only when their count is strictly greater
            than this value (so ``min_freq=0`` keeps everything).
        process: callable turning one text into a list of sentences, each a
            list of lemmas. The original accepted this argument but always
            called ``udpipe_lemmatize``; it is now honoured.

    Returns:
        ``(unigrams, bigrams, trigrams)`` as plain ``{ngram: count}`` dicts, with
        n-gram parts joined by a single space. If ``process`` raises, the
        offending text is printed and ``(None, None, None)`` is returned.

    N-grams never cross sentence boundaries.
    """
    ugram_freq_dict, bigram_freq_dict, trigram_freq_dict = defaultdict(int), defaultdict(int), defaultdict(int)
    for text in texts:
        if not (isinstance(text, str) and text):
            continue
        try:
            sentences = process(text)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts a long run.
            print(text)
            return None, None, None

        for sent in sentences:
            last = len(sent) - 1
            for pos, token in enumerate(sent):
                ugram_freq_dict[token] += 1
                if pos < last:
                    bigram_freq_dict[token + ' ' + sent[pos + 1]] += 1
                if pos < last - 1:
                    trigram_freq_dict[token + ' ' + sent[pos + 1] + ' ' + sent[pos + 2]] += 1

    ugram_freq_dict = {k: v for k, v in ugram_freq_dict.items() if v > min_freq}
    bigram_freq_dict = {k: v for k, v in bigram_freq_dict.items() if v > min_freq}
    trigram_freq_dict = {k: v for k, v in trigram_freq_dict.items() if v > min_freq}

    return ugram_freq_dict, bigram_freq_dict, trigram_freq_dict

In [None]:
ugrams, bigrams, trigrams = extract_123grams(df['text'])

In [0]:
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# on every run after the first.
os.makedirs('Частотные словари development корпус', exist_ok=True)

In [0]:
# Save the three frequency dictionaries as UTF-8 JSON in the current working directory.
# NOTE(review): a later cell reads these files from 'Частотные словари development корпус',
# so the working directory is presumably expected to be that folder here — confirm.
for freq_dict, file_name in (
    (ugrams, 'ugrams.json'),
    (bigrams, 'bigrams.json'),
    (trigrams, 'trigrams.json'),
):
    with open(file_name, mode='w', encoding='utf-8') as json_outp:
        json.dump(freq_dict, json_outp, ensure_ascii=False)

In [None]:
# Move the working directory one level up.
# NOTE(review): no matching chdir *into* a sub-folder is visible in this notebook, so
# this cell depends on hidden kernel state and shifts every relative path on re-run — confirm.
os.chdir('..')

In [11]:
# Merge the word-list lexicons with the ones from the manual markup.
# When an n-gram is in both, the value from the word-list file wins.
all_food = {
    gram: food_words[gram] if gram in food_words else new_food_words[gram]
    for gram in set(food_words) | set(new_food_words)
}

all_service = {
    gram: service_words[gram] if gram in service_words else new_service_words[gram]
    for gram in set(service_words) | set(new_service_words)
}

In [12]:
def _ngrams_of_order(lexicon, order):
    """Return the entries of ``lexicon`` whose key has exactly ``order`` space-separated parts."""
    return {gram: score for gram, score in lexicon.items() if len(gram.split(' ')) == order}


# Split each aspect lexicon by n-gram length (1, 2 or 3 words).
food_unigrams = _ngrams_of_order(all_food, 1)
food_bigrams = _ngrams_of_order(all_food, 2)
food_trigrams = _ngrams_of_order(all_food, 3)

service_unigrams = _ngrams_of_order(all_service, 1)
service_bigrams = _ngrams_of_order(all_service, 2)
service_trigrams = _ngrams_of_order(all_service, 3)

In [0]:
def extract_123grams(texts, min_freq=5, conllu_folder=False, process=None):
    """Count lemma uni-, bi- and trigrams from raw texts or from a CoNLL-U folder.

    Args:
        texts: with ``conllu_folder=False``, an iterable of raw texts (entries
            that are not non-empty strings are skipped); with
            ``conllu_folder=True``, the path of a folder whose files are all
            parsed as CoNLL-U.
        min_freq: n-grams are kept only when their count is strictly greater
            than this value (so ``min_freq=0`` keeps everything).
        conllu_folder: selects the input mode described above.
        process: callable turning one raw text into a list of sentences, each a
            list of lemmas. ``None`` (default) means ``udpipe_lemmatize``,
            looked up at call time. Unused in CoNLL-U mode.

    Returns:
        ``(unigrams, bigrams, trigrams)`` as plain ``{ngram: count}`` dicts, with
        n-gram parts joined by a single space. If ``process`` raises, the
        offending text is printed and ``(None, None, None)`` is returned.

    N-grams never cross sentence boundaries; in CoNLL-U mode tokens whose
    ``upostag`` is ``'PUNCT'`` are dropped before counting.
    """
    ugram_freq_dict, bigram_freq_dict, trigram_freq_dict = defaultdict(int), defaultdict(int), defaultdict(int)

    if conllu_folder:
        documents = []
        for file_name in os.listdir(texts):
            # Context manager closes each file (the original comprehension leaked handles).
            with open(os.path.join(texts, file_name), 'r', encoding='utf-8') as inp:
                parsed = conllu.parse(inp.read())
            documents.append([
                [token['lemma'] for token in sent if token['upostag'] != 'PUNCT']
                for sent in parsed
            ])
    else:
        documents = texts
        if process is None:
            process = udpipe_lemmatize

    for text in documents:
        if not conllu_folder:
            if not (isinstance(text, str) and text):
                continue
            try:
                text = process(text)
            except Exception:
                # Exception (not a bare except) so Ctrl-C still interrupts a long run.
                print(text)
                return None, None, None

        for sent in text:
            last = len(sent) - 1
            for pos, token in enumerate(sent):
                ugram_freq_dict[token] += 1
                if pos < last:
                    bigram_freq_dict[token + ' ' + sent[pos + 1]] += 1
                if pos < last - 1:
                    trigram_freq_dict[token + ' ' + sent[pos + 1] + ' ' + sent[pos + 2]] += 1

    ugram_freq_dict = {k: v for k, v in ugram_freq_dict.items() if v > min_freq}
    bigram_freq_dict = {k: v for k, v in bigram_freq_dict.items() if v > min_freq}
    trigram_freq_dict = {k: v for k, v in trigram_freq_dict.items() if v > min_freq}

    return ugram_freq_dict, bigram_freq_dict, trigram_freq_dict

In [0]:
train_unigrams, train_bigrams, train_trigrams = extract_123grams('conllu_data', conllu_folder=True, min_freq=0)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
vec = TfidfVectorizer(tokenizer=lambda x: [j for i in udpipe_lemmatize(x) for j in i]).fit([i for i in df['text'] if type(i)==str])

In [1]:
# Map every vocabulary term of the fitted vectorizer to its IDF weight
# (vocabulary_ gives the term's index into idf_).
idf_vocab = {term: vec.idf_[index] for term, index in vec.vocabulary_.items()}

NameError: name 'vec' is not defined

In [0]:
# Save the term -> IDF mapping as UTF-8 JSON in the current working directory,
# keeping non-ASCII terms unescaped.
with open('idf_vocab.json', 'w', encoding='utf-8') as outp:
  json.dump(idf_vocab, outp, ensure_ascii=False)

## 2. Classifying word tonality

In [34]:
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
base_dir = 'Частотные словари development корпус'


def _read_json(path):
    """Load and return the JSON document stored at ``path`` (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Reload the frequency dictionaries saved earlier.
unigrams = _read_json(os.path.join(base_dir, 'ugrams.json'))
bigrams = _read_json(os.path.join(base_dir, 'bigrams.json'))
trigrams = _read_json(os.path.join(base_dir, 'trigrams.json'))

In [3]:
# Load the term -> IDF mapping from idf_vocab.json (relative to the working directory);
# `vectorize` reads it through the global name `idf_dict`.
with open('idf_vocab.json', 'r', encoding='utf-8') as inp:
    idf_dict = json.load(inp)

In [4]:
tonal_df = pd.read_excel("linis_dictionary/words_all_full_rating.xlsx")

In [6]:
tonal_df.head()

Unnamed: 0,Words,mean,dispersion,average rate
0,аборигенный,-0.25,0.433013,0
1,аборт,-1.0,0.816497,-1
2,абрамович,0.0,0.0,0
3,абсолютный,0.333333,0.471405,0
4,абстрактный,-0.111111,0.87489,0


In [7]:
# Location of the fastText model. Set the FT_MODEL_PATH environment variable to use
# another location; the fallback is the original machine-specific path.
ft_model_path = os.environ.get(
    'FT_MODEL_PATH',
    'C:\\Users\\k1l77\\infosearch\\infosearch_assignments\\final_project\\models\\fasttext\\model.model',
)

In [8]:
ft_model = KeyedVectors.load(ft_model_path)

In [121]:
# Seed n-grams per aspect and polarity: score 1 is positive, score 0 is negative.
# int() is applied because the stored scores are not all of the same type.
food_pos = [gram for gram, score in all_food.items() if int(score) == 1]
food_neg = [gram for gram, score in all_food.items() if int(score) == 0]
service_pos = [gram for gram, score in all_service.items() if int(score) == 1]
service_neg = [gram for gram, score in all_service.items() if int(score) == 0]

In [122]:
len(all_food), len(all_service)

(34, 34)

In [123]:
len(food_pos), len(food_neg), len(service_pos), len(service_neg)

(27, 7, 29, 5)

In [124]:
def vectorize(ngram, ft_model=ft_model):
    """Embed a space-separated n-gram as the IDF-weighted sum of its word vectors.

    Args:
        ngram: one or more words separated by single spaces.
        ft_model: mapping-like embedding model indexed by word.

    Returns:
        The sum of ``ft_model[word]`` over the words, each scaled by its IDF
        from the global ``idf_dict`` after normalizing the weights to sum to 1.
        If any word is missing from ``idf_dict``, all words get equal weight.

    The original computed the normalized IDF weights and then returned the
    unweighted sum, leaving them unused. For single-word input the direction
    of the result is unchanged.
    """
    words = ngram.split(' ')
    try:
        idfs = [idf_dict[word] for word in words]
    except KeyError:
        idfs = [1 for _ in words]
    idf_norm = sum(idfs)
    if idf_norm == 0:
        # Degenerate weights: fall back to a uniform average instead of dividing by zero.
        idfs = [1 for _ in words]
        idf_norm = len(words)
    weights = [idf / idf_norm for idf in idfs]
    return np.sum([weight * ft_model[word] for weight, word in zip(weights, words)], axis=0)

In [125]:
def avg(x):
    """Return the arithmetic mean of the sized collection ``x``."""
    total = sum(x)
    return total / len(x)

In [126]:
tonal_df.loc[tonal_df['average rate']!=0].shape

(2681, 8)

In [127]:
def _nearest_seed_distance(word, seed_vectors):
    """Smallest cosine distance from ``word`` to any vector in ``seed_vectors``.

    Returns None when ``word`` is not in ``ft_model`` or when there are no
    seed vectors (the original raised ValueError from ``min([])`` in that case).
    """
    if word not in ft_model or not seed_vectors:
        return None
    word_vector = ft_model[word]
    return min(cosine(word_vector, seed_vector) for seed_vector in seed_vectors)


# One distance column per (aspect, polarity) seed list. Seed vectors are computed
# once per list; the original rebuilt all of them for every dictionary row.
for _column, _seeds in (
    ('dist_food_pos', food_pos),
    ('dist_food_neg', food_neg),
    ('dist_service_pos', service_pos),
    ('dist_service_neg', service_neg),
):
    _seed_vectors = [vectorize(seed) for seed in _seeds if seed in ft_model]
    tonal_df[_column] = tonal_df['Words'].apply(_nearest_seed_distance, args=(_seed_vectors,))

In [128]:
# Cosine-distance cut-offs below which a dictionary word counts as close to a seed.
FOOD_MAX_DISTANCE = 0.33
SERVICE_MAX_DISTANCE = 0.35

# A word is taken only when its dictionary polarity agrees with the seed polarity.
rated_positive = tonal_df['average rate'] > 0
rated_negative = tonal_df['average rate'] < 0

positive_food_words = tonal_df.loc[rated_positive & (tonal_df['dist_food_pos'] < FOOD_MAX_DISTANCE)]
negative_food_words = tonal_df.loc[rated_negative & (tonal_df['dist_food_neg'] < FOOD_MAX_DISTANCE)]
positive_service_words = tonal_df.loc[rated_positive & (tonal_df['dist_service_pos'] < SERVICE_MAX_DISTANCE)]
negative_service_words = tonal_df.loc[rated_negative & (tonal_df['dist_service_neg'] < SERVICE_MAX_DISTANCE)]

Таким способом у нас выделяется очень мало тональных слов:

In [129]:
positive_food_words

Unnamed: 0,Words,mean,dispersion,average rate,dist_food_pos,dist_food_neg,dist_service_pos,dist_service_neg
172,аппетитный,1.0,0.0,1,0.301613,0.322828,0.562092,0.566851
765,вкусный,1.333333,0.471405,1,0.0,0.118757,0.514137,0.529298
908,впечатлять,1.0,0.816497,1,0.0,0.175848,0.573486,0.690705
909,впечатляющий,1.2,0.4,1,0.128095,0.284354,0.524866,0.617993
1552,достойный,0.666667,0.471405,1,0.0,0.70087,0.586399,0.720102
1827,занятный,0.666667,0.942809,1,0.309227,0.555894,0.536328,0.604198
2152,интересный,1.333333,0.471405,1,0.0,0.475116,0.480831,0.592428
3312,небезынтересный,0.666667,0.471405,1,0.192283,0.497975,0.558973,0.583303
3426,нежность,1.333333,0.471405,1,0.291765,0.654892,0.475583,0.678658
3552,необыкновенный,0.666667,0.471405,1,0.321173,0.591423,0.547207,0.495839


In [130]:
negative_food_words

Unnamed: 0,Words,mean,dispersion,average rate,dist_food_pos,dist_food_neg,dist_service_pos,dist_service_neg
302,безвкусный,-1.333333,0.471405,-1,0.352997,0.250005,0.515652,0.500159
1591,дурной,-1.0,0.0,-1,0.404308,0.318719,0.404308,0.506686
3341,невкусный,-1.0,0.666667,-1,0.118757,0.0,0.561238,0.524946
3785,несъедобный,-1.0,0.816497,-1,0.355387,0.268345,0.582637,0.575463
4316,отвратительный,-2.0,0.0,-2,0.474315,0.234001,0.55176,0.288479
4660,плохо,-1.333333,0.471405,-1,0.51487,0.319332,0.375858,0.281194
4661,плохой,-1.333333,0.471405,-1,0.264011,0.0,0.264011,0.441741
4947,посредственный,-0.666667,0.471405,-1,0.594043,0.165137,0.543464,0.696751
5636,расстраиваться,-1.25,0.433013,-1,0.53001,0.194153,0.53001,0.597681
5639,расстройство,-0.666667,0.471405,-1,0.697115,0.262955,0.564006,0.729128


In [131]:
positive_service_words

Unnamed: 0,Words,mean,dispersion,average rate,dist_food_pos,dist_food_neg,dist_service_pos,dist_service_neg
73,аккуратный,1.0,0.0,1,0.639506,0.666374,0.301067,0.577808
384,бережливый,1.0,0.0,1,0.58984,0.591716,0.301188,0.578068
467,благодарить,1.333333,0.471405,1,0.631592,0.721053,0.346802,0.623102
471,благожелательный,0.833333,0.372678,1,0.473942,0.640529,0.1006,0.577233
650,вежливость,1.222222,0.628539,1,0.654785,0.743174,0.286305,0.471309
651,вежливый,1.0,0.816497,1,0.549789,0.642324,0.0,0.425045
795,внимательный,1.0,0.0,1,0.535346,0.698112,0.0,0.643679
1044,высококачественный,1.666667,0.471405,2,0.456346,0.557,0.223513,0.65457
1478,доброжелательный,1.25,0.433013,1,0.534899,0.627329,0.0,0.590633
1479,доброкачественный,1.333333,0.471405,1,0.457946,0.51757,0.201559,0.667699


In [132]:
negative_service_words

Unnamed: 0,Words,mean,dispersion,average rate,dist_food_pos,dist_food_neg,dist_service_pos,dist_service_neg
104,аляповатый,-1.0,0.0,-1,0.536405,0.53969,0.469332,0.28477
1180,глуповатый,-1.0,0.0,-1,0.508597,0.550245,0.433894,0.318916
1279,грубоватый,-0.666667,0.471405,-1,0.506048,0.608854,0.452018,0.32701
1700,жуликоватый,-1.5,0.5,-2,0.613951,0.601603,0.451664,0.296812
1707,жуткий,-1.25,0.433013,-1,0.431019,0.508271,0.595861,0.304386
1708,жутковатый,-1.0,0.0,-1,0.449695,0.526484,0.496054,0.349554
2540,кошмарный,-1.333333,0.471405,-1,0.478076,0.504385,0.59329,0.326208
3075,мрачноватый,-1.0,0.0,-1,0.52037,0.618466,0.456256,0.316012
4316,отвратительный,-2.0,0.0,-2,0.474315,0.234001,0.55176,0.288479
4660,плохо,-1.333333,0.471405,-1,0.51487,0.319332,0.375858,0.281194


Но это лучше, чем ничего.

In [133]:
# Extend each seed collection with the dictionary words selected above.
# The names are rebound from lists to sets here.
food_neg = set(food_neg).union(negative_food_words['Words'])
food_pos = set(food_pos).union(positive_food_words['Words'])
service_neg = set(service_neg).union(negative_service_words['Words'])
service_pos = set(service_pos).union(positive_service_words['Words'])

In [134]:
len(food_neg), len(food_pos), len(service_neg), len(service_pos)

(15, 40, 20, 54)

In [109]:
len(all_food)

34

In [110]:
len(all_service)

34

In [135]:
# Rebuild the aspect lexicons from the expanded seed sets (1 = positive, 0 = negative).
# For a word present in both polarities the later mapping wins: negative for
# service, positive for food — the same precedence as the original loops.
all_service = {**dict.fromkeys(service_pos, 1), **dict.fromkeys(service_neg, 0)}
all_food = {**dict.fromkeys(food_neg, 0), **dict.fromkeys(food_pos, 1)}

In [139]:
dump_json(all_service, 'all_service.json')
dump_json(all_food, 'all_food.json')

Попробуем также выделить тональные слова, биграммы и триграммы, используя размеченные по тональности тексты из большого корпуса:

In [144]:
df.head()

Unnamed: 0_level_0,food,service,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17600,8,8,И пускай на меня не обижается наш прославленны...
23518,9,10,"- Здравствуйте. Виа Д’Арженто! - Добрый вечер,..."
27221,9,1,"Советую вам уволить Вашего метродотеля Елену, ..."
29097,8,9,отличный средне вековый интеръер. Приятное обс...
23065,10,8,Ужинали в ресторане Баден-Баден 6 марта . Импо...


In [145]:
# Binarize the food score: 1 when the raw score is above 5, else 0.
# The raw score is kept in 'food_raw' so re-running this cell gives the same result;
# the original re-thresholded the already-binarized column, turning every 1 into 0.
if 'food_raw' not in df.columns:
    df['food_raw'] = df['food']
df['food'] = df['food_raw'].apply(lambda x: 1 if x > 5 else 0)

In [146]:
# Binarize the service score: 1 when the raw score is above 5, else 0.
# The raw score is kept in 'service_raw' so re-running this cell gives the same result;
# the original re-thresholded the already-binarized column, turning every 1 into 0.
if 'service_raw' not in df.columns:
    df['service_raw'] = df['service']
df['service'] = df['service_raw'].apply(lambda x: 1 if x > 5 else 0)

In [150]:
# Review texts split by binarized score, per aspect (1 = positive, 0 = negative).
# Rows and column are selected in a single .loc call instead of chained indexing.
food_pos_corpus = df.loc[df['food'] == 1, 'text']
food_neg_corpus = df.loc[df['food'] == 0, 'text']
service_pos_corpus = df.loc[df['service'] == 1, 'text']
service_neg_corpus = df.loc[df['service'] == 0, 'text']

In [151]:
fpc_ugrams, fpc_bigrams, fpc_trigrams = extract_123grams(food_pos_corpus)

In [153]:
fnc_ugrams, fnc_bigrams, fnc_trigrams = extract_123grams(food_neg_corpus)

In [None]:
snc_ugrams, snc_bigrams, s