In [2]:
import conllu, os, re, json
import xml.etree.ElementTree as et
import pandas as pd

from ufal.udpipe import Model, Pipeline
from gensim.models import KeyedVectors
from collections import defaultdict

In [3]:
def get_tonal_markup(wordlist1, bigram_list1, trigram_list1, wordlist2, bigram_list2, trigram_list2, aspects, conllu_folder):
  """Write automatic tonal markup for every CoNLL-U file in ``conllu_folder``.

  For each token position the lemma trigram, bigram and unigram starting
  there are looked up, first in the dictionaries of the first aspect
  (``trigram_list1``, ``bigram_list1``, ``wordlist1``) and then in those of
  the second aspect.  Only the first hit is recorded for a position.

  Each hit becomes one tab-separated line:
  ``sentence_id <TAB> token_span <TAB> aspect <TAB> mark`` where sentence and
  token ids are 1-based and ``token_span`` is either ``start`` or
  ``start,end``.

  Args:
    wordlist1, bigram_list1, trigram_list1: mappings from a space-joined
      lemma n-gram to its mark for ``aspects[0]``.
    wordlist2, bigram_list2, trigram_list2: the same for ``aspects[1]``.
    aspects: sequence with the two aspect labels (strings).
    conllu_folder: folder with the CoNLL-U files to process.

  Results are written to ``<conllu_folder>_auto_processed/`` as
  ``<file name without extension>_auto_processed.tsv``.
  """
  new_path = conllu_folder + '_auto_processed'
  # exist_ok makes re-running the function on the same folder safe
  os.makedirs(new_path, exist_ok=True)

  # Probe order matters: first aspect before second, longest n-gram first.
  # The aspect label is indexed lazily, only when a hit is actually found.
  lookups = (
    (0, trigram_list1, bigram_list1, wordlist1),
    (1, trigram_list2, bigram_list2, wordlist2),
  )

  for file in os.listdir(conllu_folder):
    path = os.path.join(conllu_folder, file)
    # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing on open()
    if not os.path.isfile(path):
      continue
    with open(path, 'r', encoding='utf-8') as conllu_file:
      sentences = conllu.parse(conllu_file.read())

    outp = []
    for sent_id, sent in enumerate(sentences, start=1):
      lemmas = [token['lemma'] for token in sent]
      # Two empty strings of padding so the last tokens still form (never matching) n-grams
      padded = lemmas + ['', '']
      for t_id1, token1 in enumerate(lemmas, start=1):
        # t_id1 is 1-based, so padded[t_id1] is the lemma that follows token1
        bigram = token1 + ' ' + padded[t_id1]
        trigram = bigram + ' ' + padded[t_id1 + 1]
        for aspect_idx, trigram_list, bigram_list, wordlist in lookups:
          if trigram in trigram_list:
            span, mark = str(t_id1) + ',' + str(t_id1 + 2), trigram_list[trigram]
          elif bigram in bigram_list:
            span, mark = str(t_id1) + ',' + str(t_id1 + 1), bigram_list[bigram]
          elif token1 in wordlist:
            span, mark = str(t_id1), wordlist[token1]
          else:
            continue
          outp.append(str(sent_id) + '\t' + span + '\t' + aspects[aspect_idx] + '\t' + str(mark))
          break

    # splitext keeps extension-less names intact (rfind('.') == -1 used to cut the last character)
    out_name = os.path.splitext(file)[0] + '_auto_processed.tsv'
    with open(os.path.join(new_path, out_name), 'w', encoding='utf-8') as file_to_write:
      file_to_write.writelines(line + '\n' for line in outp)

In [4]:
# Read the review XML and collect id, food/service scores and text of every
# review into a DataFrame indexed by review id.
columns = ['id', 'food', 'service', 'text']
xtree = et.parse("development/SentiRuEval_rest_train.xml")
root = xtree.getroot()


def _review_to_row(review):
    """Convert one review XML element into a row dict keyed like ``columns``."""
    scores = review.find('scores')
    return {
        'id': int(review.attrib['id']),
        'food': int(scores.find('food').text),
        'service': int(scores.find('service').text),
        'text': review.find('text').text,
    }


data = [_review_to_row(review) for review in root]

df = pd.DataFrame(data, columns=columns).set_index('id')

In [5]:
# Load the UDPipe model and build a pipeline that tokenizes raw text and
# emits CoNLL-U (default tagger and parser options).
UDPIPE_MODEL_PATH = 'udpipe_models/russian-syntagrus-ud-2.0-170801.udpipe'
udpipe_model = Model.load(UDPIPE_MODEL_PATH)
# Model.load reports failure by returning None instead of raising, so fail
# loudly here rather than inside Pipeline with an obscure error.
if udpipe_model is None:
    raise FileNotFoundError('Cannot load UDPipe model from ' + UDPIPE_MODEL_PATH)
pipeline = Pipeline(udpipe_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

In [6]:
def udpipe_lemmatize(text, ud_pipeline=pipeline):
    """Lemmatize ``text`` and return its lemmas grouped by sentence.

    Args:
        text: raw text to process.
        ud_pipeline: object whose ``process(text)`` returns CoNLL-U output;
            defaults to the module-level ``pipeline``.

    Returns:
        A list with one list of lemmas per sentence; tokens whose
        ``upostag`` is ``'PUNCT'`` are left out.
    """
    # Use the pipeline that was passed in (the argument used to be ignored
    # in favour of the global one).
    processed = conllu.parse(ud_pipeline.process(text))
    return [[token['lemma'] for token in sent if token['upostag'] != 'PUNCT'] for sent in processed]

In [7]:
# Quick check of the pipeline on a short sample sentence; the parsed result is kept in `t`
sample_conllu = pipeline.process('Маша ела кашу, мама мыла раму')
t = conllu.parse(sample_conllu)

In [8]:
def _load_aspect_words(path):
    """Read an ``aspect / word / score`` file into a ``{word: score}`` dict.

    The three columns may be separated by tabs or by a run of two or more
    spaces, so both word-list files are parsed by the same rule (a single
    space is kept as part of a multi-word entry).  Blank lines are skipped.
    Scores are stored as the strings found in the file.
    """
    words = dict()
    with open(path, 'r', encoding='utf-8') as inp:
        for line in inp:
            line = line.strip()
            if not line:
                continue
            aspect, word, score = re.split(r'\t+| {2,}', line)
            words[word] = score
    return words


food_words = _load_aspect_words('development/Food_words.txt')
service_words = _load_aspect_words('development/Service_words.txt')

In [9]:
# Build aspect dictionaries from the manual annotation: every annotated token
# span is turned into a space-joined lemma string mapped to its mark.
# The first mark seen for a given lemma string wins.
new_food_words = dict()
new_service_words = dict()

for file in os.listdir('разметка_финал'):
    if not file.endswith('.tsv'):
        continue
    print(file)

    with open(os.path.join('разметка_финал', file), 'r', encoding='utf-8') as inp:
        lines = inp.readlines()

    # The parsed text is looked up under the same file name in conllu_data
    conllu_path = os.path.join('conllu_data', file)
    with open(conllu_path, 'r', encoding='utf-8') as inp:
        conll = conllu.parse(inp.read())

    for line in lines:
        line = line.strip()
        # readlines() keeps the newline, so blank lines must be detected after stripping
        if not line:
            continue
        sent_id, token_ids, aspect, mark = line.split('\t')
        mark = int(mark)
        # Annotation ids are 1-based; convert to a 0-based sentence index and slice bounds
        sent_id = int(sent_id) - 1
        token_ids = [int(i) for i in token_ids.split(',')]
        start_id, end_id = token_ids[0] - 1, token_ids[-1]
        try:
            tokens = ' '.join(token['lemma'] for token in conll[sent_id][start_id:end_id])
        except (IndexError, KeyError, TypeError):
            # Report the unusable annotation and skip it; previously execution
            # fell through and reused the lemma string of the previous line.
            print(sent_id, start_id, end_id)
            continue
        aspect = aspect.strip()
        if aspect == 'Service' and tokens not in new_service_words:
            new_service_words[tokens] = mark
        elif aspect == 'Food' and tokens not in new_food_words:
            new_food_words[tokens] = mark

12943.tsv
13823.tsv
20086.tsv
28083.tsv
32840.tsv
32856.tsv
33591.tsv
33693.tsv
35486.tsv
5648.tsv


Посмотрим на пересечения выделенных нами словарей:

In [10]:
# Entries shared by the original and the newly annotated dictionary, per aspect
food_words.keys() & new_food_words.keys(), service_words.keys() & new_service_words.keys()

({'большой',
  'вкусный',
  'невкусный',
  'отличный',
  'понравиться',
  'прекрасный',
  'сытный'},
 {'вежливый', 'ненавязчивый', 'приятный', 'хамоватый', 'хороший'})

И на их объединение:

In [11]:
# All entries found in either the original or the newly annotated dictionary, per aspect
food_words.keys() | new_food_words.keys(), service_words.keys() | new_service_words.keys()

({'10 балл',
  'большой',
  'великолепный',
  'весь остыть',
  'вкусно',
  'вкусный',
  'впечатлять',
  'высокий все похвасть',
  'горячий',
  'достойный',
  'единственный',
  'интересный',
  'не впечатлять',
  'не очень дорого',
  'невкусный',
  'нежный',
  'необычный',
  'отличный',
  'очень большой',
  'очень вкусный',
  'плохой',
  'понравиться',
  'посредственно',
  'прекрасный',
  'приятный',
  'различный',
  'разнообразный',
  'разнообразный вкусный',
  'расстроить',
  'свежий',
  'совершенно отвратительный',
  'странный',
  'сытный',
  'хороший'},
 {'вежливый',
  'веселый',
  'внимательный',
  'вполне приемлимый',
  'высокий качество',
  'гостеприимный',
  'доброжелательный',
  'дружелюбный',
  'душевный',
  'качественный',
  'красивый',
  'милый',
  'не слишком вежливо',
  'недолгий',
  'ненавязчивый',
  'оперативность',
  'оперативный',
  'отзывчивый',
  'отличный',
  'очень аккуратно',
  'очень приветливый',
  'плохо знать меню',
  'понравиться',
  'приветливый',
  'приятный

Извлечём 1,2,3-граммы из development-корпуса - получим Unlabeled 1,2,3-граммы:

In [0]:
def extract_123grams(texts, min_freq=5, process=None):
    """Count lemma unigrams, bigrams and trigrams over ``texts``.

    Args:
        texts: iterable of raw texts; items that are not non-empty strings
            are skipped.
        min_freq: n-grams are kept only when their count is strictly
            greater than this value.
        process: callable turning a text into a list of sentences, each a
            list of lemmas.  Defaults to ``udpipe_lemmatize``, resolved at
            call time.

    Returns:
        ``(unigrams, bigrams, trigrams)`` dicts mapping a space-joined
        n-gram to its count.  If ``process`` fails on a text, that text is
        printed and ``(None, None, None)`` is returned.
    """
    if process is None:
        process = udpipe_lemmatize
    ugram_freq_dict, bigram_freq_dict, trigram_freq_dict = defaultdict(int), defaultdict(int), defaultdict(int)
    for text in texts:
        if not (isinstance(text, str) and text):
            continue
        try:
            sentences = process(text)
        except Exception:
            print(text)
            return None, None, None
        # n-grams never cross sentence boundaries
        for sent in sentences:
            last = len(sent) - 1
            for pos, token in enumerate(sent):
                ugram_freq_dict[token] += 1
                if pos < last:
                    bigram_freq_dict[token + ' ' + sent[pos + 1]] += 1
                if pos < last - 1:
                    trigram_freq_dict[token + ' ' + sent[pos + 1] + ' ' + sent[pos + 2]] += 1

    ugram_freq_dict = {k: v for k, v in ugram_freq_dict.items() if v > min_freq}
    bigram_freq_dict = {k: v for k, v in bigram_freq_dict.items() if v > min_freq}
    trigram_freq_dict = {k: v for k, v in trigram_freq_dict.items() if v > min_freq}

    return ugram_freq_dict, bigram_freq_dict, trigram_freq_dict

In [0]:
# Frequency dictionaries of lemma 1-, 2- and 3-grams over the review texts
# (default min_freq, so only n-grams seen more than 5 times are kept)
ugrams,bigrams,trigrams  = extract_123grams(df['text'])

In [0]:
# Number of distinct 1-, 2- and 3-grams that passed the frequency filter
tuple(len(freqs) for freqs in (ugrams, bigrams, trigrams))

(17823, 57040, 24075)

In [0]:
# Move one level up. NOTE: relative to the current working directory, so the
# result depends on which cells were run before (re-running moves up again).
os.chdir('..')

In [0]:
# Colab-only: mount Google Drive under /content/drive (asks for an authorization code)
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Switch to the shared project folder on the mounted drive.
# NOTE: the path is relative, so this only works from the directory that contains `drive`.
os.chdir('drive/My Drive/АвтОбрЕЯ SuperPowerTeam3000')

In [0]:
# Folder for the frequency dictionaries; exist_ok avoids FileExistsError when the cell is re-run
os.makedirs('Частотные словари development корпус', exist_ok=True)

In [0]:
# Make the frequency-dictionary folder the working directory, so the files
# written by later cells with relative names end up here.
os.chdir('Частотные словари development корпус')

In [0]:
# Save each frequency dictionary as UTF-8 JSON in the current working directory
for json_name, freq_dict in (
  ('ugrams.json', ugrams),
  ('bigrams.json', bigrams),
  ('trigrams.json', trigrams),
):
  with open(json_name, 'w', encoding='utf-8') as gram_outp:
    json.dump(freq_dict, gram_outp, ensure_ascii=False)

In [0]:
# Merge both dictionaries of every aspect. Later unpacking overrides earlier,
# so a mark from the original word list wins over a newly annotated one.
all_food = {**new_food_words, **food_words}
all_service = {**new_service_words, **service_words}

In [0]:
def _grams_of_length(scored_grams, n_words):
  """Keep the entries whose key is made of exactly ``n_words`` space-separated parts."""
  return {gram: mark for gram, mark in scored_grams.items() if len(gram.split(' ')) == n_words}


# Split the merged dictionaries by n-gram length (1, 2 or 3 words)
food_unigrams, food_bigrams, food_trigrams = (_grams_of_length(all_food, size) for size in (1, 2, 3))

service_unigrams, service_bigrams, service_trigrams = (_grams_of_length(all_service, size) for size in (1, 2, 3))

In [0]:
def _read_conllu_lemmas(folder):
    """Parse every file in ``folder`` as CoNLL-U and return, per file, a list of sentences of non-punctuation lemmas."""
    documents = []
    for file_name in os.listdir(folder):
        file_path = os.path.join(folder, file_name)
        # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing on open()
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as conllu_file:
            parsed = conllu.parse(conllu_file.read())
        # Build fresh lists rather than overwriting the parsed sentences in place
        documents.append([[tok['lemma'] for tok in sent if tok['upostag'] != 'PUNCT'] for sent in parsed])
    return documents


def extract_123grams(texts, min_freq=5, conllu_folder=False):
    """Count lemma unigrams, bigrams and trigrams.

    Args:
        texts: an iterable of raw texts, or - when ``conllu_folder`` is
            truthy - the path of a folder with CoNLL-U files.
        min_freq: n-grams are kept only when their count is strictly
            greater than this value.
        conllu_folder: if truthy, read already parsed files from the folder
            ``texts`` instead of lemmatizing raw texts with
            ``udpipe_lemmatize``.

    Returns:
        ``(unigrams, bigrams, trigrams)`` dicts mapping a space-joined
        n-gram to its count.  If lemmatizing a raw text fails, the text is
        printed and ``(None, None, None)`` is returned.
    """
    ugram_freq_dict, bigram_freq_dict, trigram_freq_dict = defaultdict(int), defaultdict(int), defaultdict(int)
    documents = _read_conllu_lemmas(texts) if conllu_folder else texts

    for text in documents:
        if not conllu_folder:
            # Raw mode: skip anything that is not a non-empty string (e.g. NaN)
            if not (isinstance(text, str) and text):
                continue
            try:
                text = udpipe_lemmatize(text)
            except Exception:
                print(text)
                return None, None, None
        # n-grams never cross sentence boundaries
        for sent in text:
            last = len(sent) - 1
            for pos, token in enumerate(sent):
                ugram_freq_dict[token] += 1
                if pos < last:
                    bigram_freq_dict[token + ' ' + sent[pos + 1]] += 1
                if pos < last - 1:
                    trigram_freq_dict[token + ' ' + sent[pos + 1] + ' ' + sent[pos + 2]] += 1

    ugram_freq_dict = {k: v for k, v in ugram_freq_dict.items() if v > min_freq}
    bigram_freq_dict = {k: v for k, v in bigram_freq_dict.items() if v > min_freq}
    trigram_freq_dict = {k: v for k, v in trigram_freq_dict.items() if v > min_freq}

    return ugram_freq_dict, bigram_freq_dict, trigram_freq_dict

In [0]:
# N-gram counts over the parsed files in conllu_data; min_freq=0 keeps every
# n-gram that occurs at least once (the filter is "count > min_freq").
train_unigrams, train_bigrams, train_trigrams = extract_123grams('conllu_data', conllu_folder=True, min_freq=0)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def _flat_lemmas(text):
  """Lemmatize ``text`` and flatten the per-sentence lemma lists into one list."""
  return [lemma for sentence in udpipe_lemmatize(text) for lemma in sentence]


# Fit TF-IDF on the review texts (string entries only), using lemmas as tokens
review_texts = [i for i in df['text'] if type(i)==str]
vec = TfidfVectorizer(tokenizer=_flat_lemmas)
vec.fit(review_texts)

In [1]:
# Map every vocabulary entry of the fitted vectorizer to its IDF weight
idf_vocab = {term: vec.idf_[column] for term, column in vec.vocabulary_.items()}

NameError: name 'vec' is not defined

In [0]:
# Store the IDF table as UTF-8 JSON in the current working directory
with open('idf_vocab.json', 'w', encoding='utf-8') as idf_file:
  idf_file.write(json.dumps(idf_vocab, ensure_ascii=False))

In [0]:
# Load the tonal dictionary spreadsheet (the displayed columns are
# Words, mean, dispersion, average rate)
tonal_df = pd.read_excel("linis_dictionary/words_all_full_rating.xlsx")

In [169]:
# Display the loaded dictionary (the notebook shows a truncated view of the frame)
tonal_df

Unnamed: 0,Words,mean,dispersion,average rate
0,аборигенный,-0.250000,0.433013,0
1,аборт,-1.000000,0.816497,-1
2,абрамович,0.000000,0.000000,0
3,абсолютный,0.333333,0.471405,0
4,абстрактный,-0.111111,0.874890,0
...,...,...,...,...
7540,ярый,-0.333333,0.942809,0
7541,ясно,0.000000,0.000000,0
7542,ясность,0.666667,0.471405,1
7543,ясный,0.666667,0.471405,1


In [0]:
from gensim.models import KeyedVectors

In [173]:
# Download the 182.zip archive from the NLPL vectors repository (about 608 MB).
# NOTE(review): the recorded run was interrupted (^C), so the archive may be incomplete.
!wget http://vectors.nlpl.eu/repository/11/182.zip

--2019-12-23 19:39:05--  http://vectors.nlpl.eu/repository/11/182.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 637613765 (608M) [application/zip]
Saving to: ‘182.zip’

182.zip               4%[                    ]  27.40M   162KB/s    eta 27m 24s^C
