In [596]:
import pandas as pd
import json
from collections import Counter

In [597]:
# Columns of the raw pool export that the rest of the notebook works with.
pool_columns = [
    'INPUT:text1', 'INPUT:text2',
    'INPUT:source1', 'INPUT:source2',
    'INPUT:summ_text1', 'INPUT:summ_text2',
    'INPUT:summ_with_documents_text1', 'INPUT:summ_with_documents_text2',
    'OUTPUT:result', 'OUTPUT:comment', 'OUTPUT:main_result',
    'OUTPUT:type_problems_article_1', 'OUTPUT:type_problems_article_2',
    'OUTPUT:comment_which_problems_article_1',
    'OUTPUT:comment_which_problems_article_2',
    'ASSIGNMENT:link', 'ASSIGNMENT:task_id',
    'ASSIGNMENT:assignment_id', 'ASSIGNMENT:worker_id',
]

pool = (
    pd.read_csv('ukr_ru_pool_last_batch.tsv', sep='\t')[pool_columns]
    .copy()
    # Keep only the part after the "INPUT:" / "OUTPUT:" / "ASSIGNMENT:" prefix.
    .rename(columns=lambda name: name.split(':')[1])
    # Give the "what" answer and its free-text comment explicit names.
    .rename(columns={'main_result': 'what_same', 'comment': 'what_comment'})
)

In [598]:
# Rows whose `type_problems_article_1` is not null.
pool.loc[pool["type_problems_article_1"].notna()]

Unnamed: 0,text1,text2,source1,source2,summ_text1,summ_text2,summ_with_documents_text1,summ_with_documents_text2,result,what_comment,what_same,type_problems_article_1,type_problems_article_2,comment_which_problems_article_1,comment_which_problems_article_2,link,task_id,assignment_id,worker_id


In [601]:
# Rows whose `type_problems_article_2` is not null.
pool.loc[pool["type_problems_article_2"].notna()]

Unnamed: 0,text1,text2,source1,source2,summ_text1,summ_text2,summ_with_documents_text1,summ_with_documents_text2,result,what_comment,what_same,type_problems_article_1,type_problems_article_2,comment_which_problems_article_1,comment_which_problems_article_2,link,task_id,assignment_id,worker_id


# WHAT/WHO/WHERE/WHEN majority vote (MV)

In [610]:
def parse_columns(result):
    """Extract the who/where/when answers from one JSON-encoded `result` cell.

    Args:
        result: JSON string holding at least the keys 'who_same',
            'where_same' and 'when_same'.

    Returns:
        A pandas Series indexed by those three keys, in that order.
    """
    wanted_keys = ['who_same', 'where_same', 'when_same']
    answers = json.loads(result)
    return pd.Series([answers[key] for key in wanted_keys], index=wanted_keys)

# Expand each assignment's JSON answer into separate who/where/when columns.
answer_columns = ['who_same', 'where_same', 'when_same']
pool[answer_columns] = pool['result'].apply(parse_columns)

In [611]:
def MV(list_for_MV):
    """Majority vote over an iterable of labels.

    Args:
        list_for_MV: Iterable of hashable labels (e.g. the answers that the
            annotators of one task gave to one question).

    Returns:
        The label that occurs more than once AND strictly more often than any
        other label; otherwise the string 'disagreement'. Empty input and a
        tie for first place (e.g. 2 vs 2) both count as disagreement, so a
        tie is never reported as if it were a majority.
    """
    # Only the two most frequent labels are needed to detect a tie at the top.
    ranked = Counter(list_for_MV).most_common(2)
    if not ranked:
        return 'disagreement'

    top_label, top_votes = ranked[0]
    tied_for_first = len(ranked) > 1 and ranked[1][1] == top_votes
    if top_votes > 1 and not tied_for_first:
        return top_label
    return 'disagreement'


# One row per task: the majority answer (MV) of its assignments for each question.
questions = ['who', 'where', 'when', 'what']
grouped = (
    pool.groupby('task_id')
    .agg({f'{question}_same': MV for question in questions})
    .rename(columns={f'{question}_same': f'aggregated_{question}' for question in questions})
)

# Attach the per-task majority answers to every individual assignment row.
pool = pool.merge(grouped, on='task_id', how='left')

In [612]:
# Flag, per question, the assignments whose answer differs from the task's aggregated answer.
for question in ("who", "when", "where", "what"):
    pool[f"different_{question}"] = pool[f"aggregated_{question}"] != pool[f"{question}_same"]

# LLM

In [613]:
import requests
import configparser
import cohere
import time
import re

# API keys are read from the [credentials] section of keys.config.
config = configparser.ConfigParser()
config.read("keys.config")

credentials = config['credentials']
hf_key = credentials['hf_key']
cohere_key = credentials['cohere']

# Cohere client used by query_command_r.
co = cohere.Client(cohere_key)

# HTTP headers used by query_mistral.
headers = {
    'Content-type': 'application/json',
    "Authorization": f"Bearer {hf_key}",
}

def query_mistral(prompt,
                  max_new_tokens=3,
                  do_sample=False,
                  temperature=0.01,
                  top_p=0.99,
                  return_full_text=False,
                  timeout=120):
    """Send `prompt` to the hosted Mistral-7B-Instruct-v0.2 endpoint and return its completion.

    The request is authenticated with the module-level `headers` and sent
    with response caching disabled ('use_cache': False).

    Args:
        prompt: Full prompt text sent as the 'inputs' field.
        max_new_tokens: Forwarded as the 'max_new_tokens' generation parameter.
        do_sample: Forwarded as the 'do_sample' generation parameter
            (previously ignored: the payload always sent True).
        temperature: Forwarded as the 'temperature' generation parameter.
        top_p: Forwarded as the 'top_p' generation parameter.
        return_full_text: Forwarded as the 'return_full_text' generation parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The 'generated_text' of the first returned item, or the string
        'Model error' when the request fails or the response does not have
        the expected shape.
    """
    parameters = {'max_new_tokens': max_new_tokens,
                  'return_full_text': return_full_text,
                  'do_sample': do_sample,
                  'top_p': top_p,
                  'temperature': temperature}
    payload = {'inputs': prompt,
               'parameters': parameters,
               'options': {'use_cache': False}}
    try:
        response = requests.post(
            "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
            headers=headers,
            data=json.dumps(payload),
            timeout=timeout)
    except requests.RequestException as e:
        # Network failure or timeout: report it and keep the batch going.
        print(e)
        return 'Model error'

    try:
        body = json.loads(response.content.decode("utf-8"))
    except ValueError:
        # Body is not valid UTF-8 JSON (both decode errors are ValueError subclasses);
        # show the raw text instead of re-parsing and crashing.
        print(response.text)
        return 'Model error'

    print(body)
    try:
        return body[0]['generated_text']
    except (KeyError, IndexError, TypeError):
        # Valid JSON that is not a list of generations (e.g. an error object).
        return 'Model error'

def query_command_r(prompt, pause_seconds=7):
    """Send `prompt` to Cohere's "command-r" chat model and return the reply text.

    Uses the module-level Cohere client `co` with prompt truncation turned
    off, and pauses after every successful call.

    Args:
        prompt: Message text sent to the model.
        pause_seconds: Seconds to sleep after a successful call
            (presumably to stay under the API rate limit; confirm against the plan in use).

    Returns:
        The reply text, or the string 'Model error' if the call raised.
        A string sentinel is returned rather than the exception object so that
        downstream string parsing (`.lower()`, regex search) keeps working;
        the exception text itself is not returned because it could contain
        the words "yes" or "no" and be misread as a model answer.
    """
    try:
        response = co.chat(
            model="command-r",
            message=prompt,
            prompt_truncation="OFF"
        )
        time.sleep(pause_seconds)
        print(response.text)
        return response.text
    except Exception as e:
        print(e)
        return 'Model error'

# Different topics

In [614]:
def prompt_different_mistral(what_comment):
    """Build the few-shot Mistral prompt that judges a "different news" comment.

    The prompt is wrapped in <s>[INST] ... [/INST] tags: the first turn holds
    the instruction and worked examples, ending with one example whose answer
    ("No") is given as the model's reply; the second turn holds the comment
    to judge.

    Args:
        what_comment: Annotator's comment; interpolated verbatim between
            double quotes.

    Returns:
        The prompt string. The expected reply is Yes (the comment summarizes
        both news items) or No (it only says they are different).
    """
    return f'''
        <s>[INST]
        You are provided a comment, explaining why news in a pair of news are different.
        Your goal is to judge if out of this comment one can understand what these two news are about (Yes) or comment is just saying that the news are different (No).
        For instance, out of the following comments: 
        - "Иск от журналистов и суд над полицейским"
        Yes, both news are briefly summarized.
        - "Поединок с единоборств в США;обзор пятерки старых бойцов единоборств"
        Yes, both news are briefly summarized.
        - "Перша стаття про те, що в Дагестані налагоджено випуск кисневих вентилів для медобладнання у зв'язку з попитом, що виріс через пандемію. Друга стаття про проведення "контртерористичної операції" у Махачкалі та Каспійську."
        Yes, both news are briefly summarized.
        - "Перша-ковид в Украине,друга-парад в России"
        Yes, both news are briefly summarized.
        - "Нападение на пенсионера и погода"
        Yes, both news are briefly summarized.
        - "події різні"
        No, it just says that news are different.
        - "зовсім різні події"
        No, it just says that news are different.
        - "події ніяк не зв'язані між собою"
        [/INST]
        No
        </s>
        [INST]
        - "{what_comment}"
        [/INST]'''

def prompt_different_command_r(what_comment):
    """Build the few-shot Command R prompt that judges a "different news" comment.

    The prompt has three sections ("## Instruction", "## Examples",
    "## Input"); the comment to judge is placed in the last one.

    Args:
        what_comment: Annotator's comment; interpolated verbatim between
            double quotes.

    Returns:
        The prompt string. The expected reply is Yes (the comment summarizes
        both news items) or No (it only says they are different).
    """
    return f'''
        ## Instruction
        You are provided a comment, explaining why news in a pair of news are different.
        Your goal is to judge if out of this comment one can understand what these two news are about (Yes) or comment is just saying that the news are different (No).
        
        ## Examples
        For instance, out of the following comments: 
        - "Иск от журналистов и суд над полицейским"
        Yes, both news are briefly summarized.
        - "Поединок с единоборств в США;обзор пятерки старых бойцов единоборств"
        Yes, both news are briefly summarized.
        - "Перша стаття про те, що в Дагестані налагоджено випуск кисневих вентилів для медобладнання у зв'язку з попитом, що виріс через пандемію. Друга стаття про проведення "контртерористичної операції" у Махачкалі та Каспійську."
        Yes, both news are briefly summarized.
        - "Перша-ковид в Украине,друга-парад в России"
        Yes, both news are briefly summarized.
        - "Нападение на пенсионера и погода"
        Yes, both news are briefly summarized.
        - "події різні"
        No, it just says that news are different.
        - "зовсім різні події"
        No, it just says that news are different.
        - "події ніяк не зв'язані між собою"
        No
        
        ## Input
        - "{what_comment}"'''

In [None]:
# Assignments whose own "what" answer is "different".
pool_different = pool.loc[pool["what_same"] == "different"].copy()

# For each model, in order: build the prompt from the annotator's comment, then query the model.
for model_name, build_prompt, ask_model in (
    ("mistral", prompt_different_mistral, query_mistral),
    ("command_r", prompt_different_command_r, query_command_r),
):
    pool_different[f"{model_name}_prompt"] = pool_different["what_comment"].apply(build_prompt)
    pool_different[f"{model_name}_answer"] = pool_different[f"{model_name}_prompt"].apply(ask_model)

In [617]:
def parse_response_different(input_string):
    """Reduce a raw model answer to "Yes", "No" or "Maybe".

    Args:
        input_string: Raw answer returned by a model query. Anything that is
            not a string (an exception object, NaN, None) is treated as an
            unusable answer instead of raising AttributeError.

    Returns:
        "Yes" if the answer contains the whole word "yes" but not "no",
        "No" if it contains the whole word "no" but not "yes"
        (both case-insensitive), and "Maybe" in every other case.
    """
    if not isinstance(input_string, str):
        return "Maybe"

    lowered = input_string.lower()
    contains_yes = re.search(r'\byes\b', lowered) is not None
    contains_no = re.search(r'\bno\b', lowered) is not None

    # Both words or neither word: the answer is ambiguous.
    if contains_yes == contains_no:
        return "Maybe"
    return "Yes" if contains_yes else "No"

In [618]:
# Label every Mistral answer, then show the comments that were not labelled "Yes".
pool_different["Good_comment_mistral"] = pool_different["mistral_answer"].apply(parse_response_different)
pool_different.loc[pool_different["Good_comment_mistral"] != "Yes", "what_comment"]

Series([], Name: what_comment, dtype: object)

In [619]:
# Label every Command R answer, then show the comments that were not labelled "Yes".
pool_different["Good_comment_command_r"] = pool_different["command_r_answer"].apply(parse_response_different)
pool_different.loc[pool_different["Good_comment_command_r"] != "Yes", "what_comment"]

5    1 стаття: Туркменістан Аірвейс призупинив поль...
Name: what_comment, dtype: object

### Reject/accept

In [622]:
# Only the Command R label decides the verdict: "Yes" is accepted, anything else is rejected.
# NOTE(review): "Maybe" (ambiguous answer or model error) is rejected exactly like "No" — confirm this is intended.
different_comment_ok = pool_different["Good_comment_command_r"] == "Yes"
pool_different['ACCEPT:verdict'] = different_comment_ok.map({True: "+", False: "-"})
pool_different['ACCEPT:comment'] = different_comment_ok.map({
    True: "",
    False: "У навчанні було показано, що якщо новини різні, правильним коментарем є коротке пояснення, про що одна і друга новина. Інформації, що новини різні, недостатньо.",
})

In [624]:
# Keep the pool columns plus the verdict; the prompt/answer/label helper columns are left out.
different_columns = [
    'text1', 'text2', 'source1', 'source2',
    'summ_text1', 'summ_text2',
    'summ_with_documents_text1', 'summ_with_documents_text2',
    'result', 'what_comment', 'what_same',
    'type_problems_article_1', 'type_problems_article_2',
    'comment_which_problems_article_1', 'comment_which_problems_article_2',
    'link', 'task_id', 'assignment_id', 'worker_id',
    'who_same', 'where_same', 'when_same',
    'aggregated_who', 'aggregated_where', 'aggregated_when', 'aggregated_what',
    'different_who', 'different_when', 'different_where', 'different_what',
    'ACCEPT:verdict', 'ACCEPT:comment',
]
pool_different = pool_different.loc[:, different_columns]

# Somewhat related topics

In [625]:
def prompt_somewhat_related_mistral(what_comment):
    """Build the few-shot Mistral prompt that judges a "common specific topic" comment.

    The prompt is wrapped in <s>[INST] ... [/INST] tags: the first turn holds
    the instruction and worked examples, ending with one example whose answer
    is given as the model's reply; the second turn holds the comment to judge.

    Args:
        what_comment: Annotator's comment; interpolated verbatim between
            double quotes.

    Returns:
        The prompt string. The expected reply is Yes (the comment names a
        specific topic) or No (it only names a broad topic such as
        covid-19/elections/death, or no topic at all).
    """
    return f'''
        <s>[INST]
        You are provided a comment written by a labeler, explaining why news in a pair of news are both on a one specific topic.
        However, some labelers did not read the instruction, where it says that covid-19/elections/death in itself is too broad of a topic, to be common for news.
        Also, if they did not read instruction, they won't specify the specific topic, just write smth like "загальна тема".
        If news are both a specific subtopic of covid-19 e.g. death statistics, recent anti-covid measures, new vaccine, new stamm; a specific subtopic of elections e.g. elections in USA 2024, elections in Russia 2024, elections in covid; a specific subtopic of death e.g. death from war, death from murder on a street then they are both on a one specific topic, if they are both just about covid-19/elections/death it does not mean at all that they both are on a one specific topic.
        Your goal is to judge if labelers did read the intruction (Yes), or they did not (No) and put a comment just saying that the common topic of news is covid-19.
        ## Examples
        - "В обоих статьях затронута тема пандемии коронавируса"
        No, labeler did not read instruction, it's just a comment about covid-19.
        - "Общая тема ковида и постковида"
        No, labeler did not read instruction, it's just a comment about covid-19.
        - "Пандемия"
         No, labeler did not read instruction, it's just a comment about covid-19.
        - "Новый штамм короновируса и короновирус"
        Yes, it's not only about covid-19, but about new stamm of it
        - "загальна тема"
        No, it does not say anything about the topic.
        - "Дослідження COVID-19"
        Yes, it's not only about covid-19, but about research around it
        - "обе темы касаются паники по поводу ковида"
        Yes, it's not only about covid-19, but also about panic around it
        - "общая тема выборы"
        No, labeler did not read instruction, it's just a comment about elections
        - "Экстренная посадка самолёта"
        Yes, it's not about covid-19/elections/death
        - "обе темы касаются паники по поводу ковида"
        [/INST]
        Yes, it's not only about covid-19, but also about panic around it
        </s>
        [INST]
        - "{what_comment}"
        [/INST]'''

def prompt_somewhat_related_command_r(what_comment):
    """Build the few-shot Command R prompt that judges a "common specific topic" comment.

    The prompt has three sections ("## Instruction", "## Examples",
    "## Input"); the comment to judge is placed in the last one.

    Args:
        what_comment: Annotator's comment; interpolated verbatim between
            double quotes.

    Returns:
        The prompt string. The expected reply is Yes (the comment names a
        specific topic) or No (it only names a broad topic such as
        covid-19/elections/death, or no topic at all).
    """
    return f'''
        ## Instruction
        You are provided a comment written by a labeler, explaining why news in a pair of news are both on a one specific topic.
        However, some labelers did not read the instruction, where it says that covid-19/elections/death in itself is too broad of a topic, to be common for news.
        Also, if they did not read instruction, they won't specify the specific topic, just write smth like "загальна тема".
        If news are both a specific subtopic of covid-19 e.g. death statistics, recent anti-covid measures, new vaccine, new stamm; a specific subtopic of elections e.g. elections in USA 2024, elections in Russia 2024, elections in covid; a specific subtopic of death e.g. death from war, death from murder on a street then they are both on a one specific topic, if they are both just about covid-19/elections/death it does not mean at all that they both are on a one specific topic.
        Your goal is to judge if labelers did read the intruction (Yes), or they did not (No) and put a comment just saying that the common topic of news is covid-19.
        
        ## Examples
        - "В обоих статьях затронута тема пандемии коронавируса"
        No, labeler did not read instruction, it's just a comment about covid-19.
        - "Общая тема ковида и постковида"
        No, labeler did not read instruction, it's just a comment about covid-19.
        - "Пандемия"
         No, labeler did not read instruction, it's just a comment about covid-19.
        - "загальна тема"
        No, it does not say anything about the topic.
        - "Новый штамм короновируса и короновирус"
        Yes, it's not only about covid-19, but about new stamm of it
        - "общая тема выборы"
        No, labeler did not read instruction, it's just a comment about elections
        - "Дослідження COVID-19"
        Yes, it's not only about covid-19, but about research around it
        - "обе темы касаются паники по поводу ковида"
        Yes, it's not only about covid-19, but also about panic around it
        - "Экстренная посадка самолёта"
        Yes, it's not even about covid-19/elections/death
        - "обе темы касаются паники по поводу ковида"
        Yes, it's not only about covid-19, but also about panic around it
        ## Input
        - "{what_comment}"'''

In [None]:
# Assignments whose own "what" answer is "somewhat_related".
pool_somewhat_related = pool.loc[pool["what_same"] == "somewhat_related"].copy()

# For each model, in order: build the prompt from the annotator's comment, then query the model.
for model_name, build_prompt, ask_model in (
    ("mistral", prompt_somewhat_related_mistral, query_mistral),
    ("command_r", prompt_somewhat_related_command_r, query_command_r),
):
    pool_somewhat_related[f"{model_name}_prompt"] = pool_somewhat_related["what_comment"].apply(build_prompt)
    pool_somewhat_related[f"{model_name}_answer"] = pool_somewhat_related[f"{model_name}_prompt"].apply(ask_model)

In [629]:
def parse_response_somewhat_related(input_string):
    """Reduce a raw model answer to "Yes", "No" or "Maybe".

    Args:
        input_string: Raw answer returned by a model query. Anything that is
            not a string (an exception object, NaN, None) is treated as an
            unusable answer instead of raising AttributeError.

    Returns:
        "Yes" if the answer contains the whole word "yes" but not "no",
        "No" if it contains the whole word "no" but not "yes"
        (both case-insensitive), and "Maybe" in every other case.
    """
    if not isinstance(input_string, str):
        return "Maybe"

    lowered = input_string.lower()
    contains_yes = re.search(r'\byes\b', lowered) is not None
    contains_no = re.search(r'\bno\b', lowered) is not None

    # Both words or neither word: the answer is ambiguous.
    if contains_yes == contains_no:
        return "Maybe"
    return "Yes" if contains_yes else "No"

In [630]:
# Label every Mistral answer, then show the comments that were not labelled "Yes".
pool_somewhat_related["Good_comment_mistral"] = pool_somewhat_related["mistral_answer"].apply(parse_response_somewhat_related)
pool_somewhat_related.loc[pool_somewhat_related["Good_comment_mistral"] != "Yes", "what_comment"]

Series([], Name: what_comment, dtype: object)

In [631]:
# Label every Command R answer, then show the comments that were not labelled "Yes".
pool_somewhat_related["Good_comment_command_r"] = pool_somewhat_related["command_r_answer"].apply(parse_response_somewhat_related)
pool_somewhat_related.loc[pool_somewhat_related["Good_comment_command_r"] != "Yes", "what_comment"]

10    1 стаття: ринок нафти в Росії продовжує падінн...
Name: what_comment, dtype: object

### Reject/accept

In [633]:
# Only the Command R label decides the verdict: "Yes" is accepted, anything else is rejected.
# NOTE(review): "Maybe" (ambiguous answer or model error) is rejected exactly like "No" — confirm this is intended.
related_comment_ok = pool_somewhat_related["Good_comment_command_r"] == "Yes"
pool_somewhat_related['ACCEPT:verdict'] = related_comment_ok.map({True: "+", False: "-"})
pool_somewhat_related['ACCEPT:comment'] = related_comment_ok.map({
    True: "",
    False: "В інструкції та прикладах було показано, що сам по собі covid-19/вибори/зглавтування/смерть/\"загальна тема\" не є достатньо конкретною загальною темою для новин, тому що величезна кількість абсолютно різних новин була написана на цю тему. Перечитайте, будь ласка, інструкцію.",
})

In [635]:
# Keep the pool columns plus the verdict; the prompt/answer/label helper columns are left out.
somewhat_related_columns = [
    'text1', 'text2', 'source1', 'source2',
    'summ_text1', 'summ_text2',
    'summ_with_documents_text1', 'summ_with_documents_text2',
    'result', 'what_comment', 'what_same',
    'type_problems_article_1', 'type_problems_article_2',
    'comment_which_problems_article_1', 'comment_which_problems_article_2',
    'link', 'task_id', 'assignment_id', 'worker_id',
    'who_same', 'where_same', 'when_same',
    'aggregated_who', 'aggregated_where', 'aggregated_when', 'aggregated_what',
    'different_who', 'different_when', 'different_where', 'different_what',
    'ACCEPT:verdict', 'ACCEPT:comment',
]
pool_somewhat_related = pool_somewhat_related.loc[:, somewhat_related_columns]

# Same topics

In [636]:
# Assignments whose own "what" answer is "similar".
pool_similar = pool[pool.what_same == "similar"].copy()

# "+" only when who, when and where were all answered "yes"; otherwise "?".
# Computed column-wise rather than with a row-wise apply(axis=1): on a frame
# with no rows, apply returns a DataFrame instead of a Series, so the column
# assignment would break for a batch that has no "similar" answers.
all_three_yes = (pool_similar[["who_same", "when_same", "where_same"]] == "yes").all(axis=1)
pool_similar['ACCEPT:verdict'] = all_three_yes.map({True: "+", False: "?"})
pool_similar['ACCEPT:comment'] = ""

# WHO/WHEN/WHERE/WHAT majority-vote (MV) rejection

In [637]:
# Stack the three per-answer subsets back into one frame with a fresh 0..n-1 index.
verdict_frames = [pool_similar, pool_somewhat_related, pool_different]
pool = pd.concat(verdict_frames, ignore_index=True)

In [638]:
def who_comment_ukr(aggregated_who):  # + appeal
    """Build the Ukrainian rejection message for a WHO answer that differs from the aggregated one.

    Args:
        aggregated_who: Aggregated answer for the task; one of "yes", "no",
            "partly" or "incomparable".

    Returns:
        The rejection text, quoting the question and the aggregated answer
        and inviting the worker to appeal.

    Raises:
        ValueError: If `aggregated_who` is not one of the four known labels
            (previously this surfaced as an UnboundLocalError on `answer`).
    """
    question = "Чи збігаються головні дійові особи новин? (ХТО)"
    # Label -> answer wording quoted in the message.
    answers = {
        "yes": "У точностi",
        "no": "Нi",
        "partly": "Частково (є перетини)",
        "incomparable": "У хоча б однієї статті немає дійових осіб",
    }
    if aggregated_who not in answers:
        raise ValueError(f"Unknown aggregated WHO label: {aggregated_who!r}")
    answer = answers[aggregated_who]
    return f'''Ми відхиляємо Ваше завдання, оскільки дві інші людини, які розмістили цю пару новин, на запитання: "{question}" обидві відповіли: "{answer}". Якщо Ви вважаєте, що саме Ваша відповідь правильна, а не відповідь більшості, будь ласка, апелюйте відхилення, ми уважно перевіримо завдання ще раз.'''
    
def when_comment_ukr(aggregated_when):
    """Build the Ukrainian rejection message for a WHEN answer that differs from the aggregated one.

    Args:
        aggregated_when: Aggregated answer for the task; one of "yes", "no",
            "partly" or "incomparable".

    Returns:
        The rejection text, quoting the question and the aggregated answer
        and inviting the worker to appeal.

    Raises:
        ValueError: If `aggregated_when` is not one of the four known labels
            (previously this surfaced as an UnboundLocalError on `answer`).
    """
    question = "Чи збігається час подій, що відбуваються в новинах? (КОЛИ)"
    # Label -> answer wording quoted in the message.
    answers = {
        "yes": "Одна й та сама подія за часом",
        "no": "Різниця, більше ніж місяць",
        "partly": "У межах одного місяця",
        "incomparable": "З хоча б однієї статті неможливо встановити час події",
    }
    if aggregated_when not in answers:
        raise ValueError(f"Unknown aggregated WHEN label: {aggregated_when!r}")
    answer = answers[aggregated_when]
    return f'''Ми відхиляємо Ваше завдання, оскільки дві інші людини, які розмістили цю пару новин, на запитання: "{question}" обидві відповіли: "{answer}". Якщо Ви вважаєте, що саме Ваша відповідь правильна, а не відповідь більшості, будь ласка, апелюйте відхилення, ми уважно перевіримо завдання ще раз.'''
    
def where_comment_ukr(aggregated_where):
    """Build the Ukrainian rejection message for a WHERE answer that differs from the aggregated one.

    Args:
        aggregated_where: Aggregated answer for the task; one of "yes", "no",
            "partly" or "incomparable".

    Returns:
        The rejection text, quoting the question and the aggregated answer
        and inviting the worker to appeal.

    Raises:
        ValueError: If `aggregated_where` is not one of the four known labels
            (previously this surfaced as an UnboundLocalError on `answer`).
    """
    question = "Чи збігаються місця, в яких відбуваються події з новин? (ДЕ)"
    # Label -> answer wording quoted in the message.
    answers = {
        "yes": "У точностi",
        "no": "Нi",
        "partly": "Частково (є перетини)",
        "incomparable": "Хоча б в одній статті місце подій зовсім не вказано",
    }
    if aggregated_where not in answers:
        raise ValueError(f"Unknown aggregated WHERE label: {aggregated_where!r}")
    answer = answers[aggregated_where]
    return f'''Ми відхиляємо Ваше завдання, оскільки дві інші людини, які розмістили цю пару новин, на запитання: "{question}" обидві відповіли: "{answer}". Якщо Ви вважаєте, що саме Ваша відповідь правильна, а не відповідь більшості, будь ласка, апелюйте відхилення, ми уважно перевіримо завдання ще раз.'''
    
def what_comment_ukr(aggregated_what):
    """Build the Ukrainian rejection message for a WHAT answer that differs from the aggregated one.

    Args:
        aggregated_what: Aggregated answer for the task; one of "similar",
            "different" or "somewhat_related".

    Returns:
        The rejection text, quoting the question and the aggregated answer
        and inviting the worker to appeal.

    Raises:
        ValueError: If `aggregated_what` is not one of the three known labels
            (previously this surfaced as an UnboundLocalError on `answer`).
    """
    question = "Як співвідносяться теми/події новин? (ЩО)"
    # Label -> answer wording quoted in the message.
    answers = {
        "similar": "Однакові",
        "different": "Різні",
        "somewhat_related": "Загальна конкретна тема",
    }
    if aggregated_what not in answers:
        raise ValueError(f"Unknown aggregated WHAT label: {aggregated_what!r}")
    answer = answers[aggregated_what]
    return f'''Ми відхиляємо Ваше завдання, оскільки дві інші людини, які розмістили цю пару новин, на запитання: "{question}" обидві відповіли: "{answer}". Якщо Ви вважаєте, що саме Ваша відповідь правильна, а не відповідь більшості, будь ласка, апелюйте відхилення, ми уважно перевіримо завдання ще раз.'''
    

def reject_accept(row):
    """Decide the final verdict and comment for one assignment row.

    Checks, in order:
      1. A row already rejected ('-') keeps its verdict and comment.
      2. The first question (who, when, what, where — in that order) on which
         the row differs from a non-'disagreement' aggregated answer rejects
         the row with that question's message.
      3. A row that differs on no question and already has verdict '+' is accepted.
      4. Everything else gets '?'.

    Args:
        row: One row of the pool, with the different_* / aggregated_* columns
            and the 'ACCEPT:verdict' / 'ACCEPT:comment' columns.

    Returns:
        A two-element list [verdict, comment].
    """
    verdict = row["ACCEPT:verdict"]
    if verdict == '-':
        return [verdict, row['ACCEPT:comment']]

    def overruled(question):
        # The row's answer differs from an aggregated answer that is not 'disagreement'.
        return row[f"different_{question}"] and row[f"aggregated_{question}"] != "disagreement"

    if overruled("who"):
        return ['-', who_comment_ukr(row['aggregated_who'])]
    if overruled("when"):
        return ['-', when_comment_ukr(row['aggregated_when'])]
    if overruled("what"):
        return ['-', what_comment_ukr(row['aggregated_what'])]
    if overruled("where"):
        return ['-', where_comment_ukr(row['aggregated_where'])]

    differs_somewhere = (row.different_who or row.different_when
                         or row.different_where or row.different_what)
    if not differs_somewhere and verdict == "+":
        return ['+', '']
    return ['?', '']

# One [verdict, comment] pair per row, split into the two output columns.
decisions = pool.apply(reject_accept, axis=1)
pool['ACCEPT:verdict'] = [verdict for verdict, _ in decisions]
pool['ACCEPT:comment'] = [comment for _, comment in decisions]

# Final reject/accept file

In [639]:
# Renumber rows 0..n-1, discarding the old index.
pool = pool.reset_index(drop=True)

In [641]:
verdict_columns = ['assignment_id', 'ACCEPT:verdict', 'ACCEPT:comment']
result = pool[verdict_columns]

# Rows with verdict '?' are only counted here; they are not written to the file.
needs_manual_review = result['ACCEPT:verdict'] == '?'
print('For manual acceptance goes:', int(needs_manual_review.sum()))

result = (
    result[~needs_manual_review]
    .rename(columns={"assignment_id": "ASSIGNMENT:assignment_id"})
)
result.to_csv('reject_accept_batch_3_ukr_ru.tsv', sep='\t', header=True, index=False)

For manual acceptance goes: 1
