In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd


model_checkpoint = 'cointegrated/rubert-base-cased-nli-twoway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

def predict_pair(text1, text2):
    with torch.inference_mode():
        out = model(**tokenizer(text1, text2, return_tensors='pt').to(model.device))
        proba = torch.softmax(out.logits, -1).cpu().numpy()[0]
        return {v: proba[k] for k, v in model.config.id2label.items()}['entailment']

In [None]:
df_dev = pd.read_csv('opusparcus_v1/ru/dev/ru-dev.txt', sep='\t', names=['id', 'text1', 'text2', 'score'])

In [None]:
df_test = pd.read_csv('opusparcus_v1/ru/test/ru-test.txt', sep='\t', names=['id', 'text1', 'text2', 'score'])

In [None]:
df = pd.concat([df_dev, df_test])

In [None]:
import tqdm

scores = []

for _, row in tqdm.tqdm(df.iterrows()):
    scores.append((predict_pair(row['text1'], row['text2']),
                   predict_pair(row['text2'], row['text1'])))
    

In [None]:
df['ent1'] = [s[0] for s in scores]
df['ent2'] = [s[1] for s in scores]

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(20,10))
plt.scatter(df['score'], df['ent1'] * df['ent2'], alpha=0.1)

In [None]:
df[(df['ent1']*df['ent2']) > 0.6].score.hist()

In [None]:
# df.to_csv('opusparcus_entail_test.csv', index=False)

In [None]:
import tqdm


reader = pd.read_csv('opusparcus_v1/ru/train/ru-train.txt', sep='\t',
                 names=['id', 'text1', 'text2', 's1', 's2', 's3', 's4'],
                 chunksize=512)

with open('opusparcus_entail_train.csv', 'w', encoding='utf8') as f:
    for df in tqdm.tqdm(reader, total=22118837//512):
        for _, row in df.iterrows():
            if row['s1'] < 7:
                raise
            ent1 = predict_pair(row['text1'], row['text2'])
            ent2 = predict_pair(row['text2'], row['text1'])
            f.write(row['id']+'\t'+row['text1'] +'\t'+row['text2']+'\t'+str(float(ent1))+'\t'+str(float(ent2))+'\n')
        

In [1]:
import pandas as pd

df = pd.read_csv('data/opusparcus_entailment.tsv', sep='\t')

In [2]:
df.columns = ['id', 'text1', 'text2', 'ent1', 'ent2']

In [3]:
df = df[(df.ent1 > 0.7) & (df.ent2 > 0.7)]

In [4]:
df = df[~(df.text1.str.lower().str.contains('чёрт|черт|бля|хрен|целк') | df.text2.str.lower().str.contains('чёрт|черт|бля|хрен|целк'))]

In [5]:
df = df[(df.text1.str.len() + df.text2.str.len()) > 45]

In [6]:
df = df[(df.text1.str.split().apply(len) > 3) & (df.text2.str.split().apply(len) > 3)]

In [7]:
df = df[~(df.text1.apply(lambda x: '...' in x)) | (df.text2.apply(lambda x: '...' in x))]

In [8]:
def vity(row):
    if 'вы' in row['text1'].lower() and 'ты' in row['text2'].lower():
        return True
    elif 'вы' in row['text2'].lower() and 'ты' in row['text1'].lower():
        return True
    return False

In [9]:
df = df[~df.apply(vity, axis=1)]

In [10]:
from alphabet_detector import AlphabetDetector
ad = AlphabetDetector()


df = df[~(df.text1.apply(lambda x: ad.is_latin(x)))]
df = df[~(df.text2.apply(lambda x: ad.is_latin(x)))]

In [11]:
len(df)

687017

In [12]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [13]:
df['leven'] = df.apply(lambda x: similar(x['text1'], x['text2']), axis=1)

In [14]:
df = df[df['leven'] < 0.9]

In [15]:
len(df)

652591

In [16]:
from razdel import tokenize

sw = ('ну', 'хуй', 'хуя', 'ээ', 'эээ', 'мэм',
      'ох', 'ах', 'вау', 'бог', 'боже', 'эй', '...', '—')
def stopwords(s):
    for tok in tokenize(s):
        if tok.text.lower() in sw:
            return True
    return False

In [17]:
def repeat(s):
    toks = [_.text for _ in tokenize(s)]
    if len(toks) / len(set(toks)) > 2:
        return True
    return False

In [18]:
df = df[~df.text1.apply(stopwords)]
df = df[~df.text2.apply(stopwords)]

In [19]:
df = df[~df.text1.apply(repeat)]
df = df[~df.text2.apply(repeat)]

In [20]:
df = df[~df.text1.apply(lambda x: 'хер' in x)]
df = df[~df.text2.apply(lambda x: 'хер' in x)]

In [21]:
# df[df.id.isin(['ru-N2362921', 'ru-N7967249'])]

In [22]:
df = df[~df.text1.str.contains('l')]
df = df[~df.text2.str.contains('l')]

In [23]:
len(df)

599379

In [24]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
from collections import Counter
from razdel import tokenize

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

def match_ents(s1, s2):
    doc1, doc2 = Doc(s1), Doc(s2)
    
    doc1.segment(segmenter)
    doc2.segment(segmenter)
    
    doc1.tag_ner(ner_tagger)
    doc2.tag_ner(ner_tagger)
    
    if Counter([s.type for s in doc1.spans]) != Counter([s.type for s in doc2.spans]):
        return False
    
    numbers1 = set()
    numbers2 = set()
    
    for tok in doc1.tokens:
        if has_numbers(tok.text):
            numbers1.add(tok.text)
    
    for tok in doc2.tokens:
        if has_numbers(tok.text):
            numbers2.add(tok.text)
    
    if numbers1 != numbers2:
        return False
    
    return True
    

def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

def match_abbr(s1, s2):
    abbrs1 = set()
    abbrs2 = set()
    
    for tok in tokenize(s1):
        if tok.text.isupper() and len(tok.text) > 1:
            abbrs1.add(tok.text)
    for tok in tokenize(s2):
        if tok.text.isupper() and len(tok.text) > 1:
            abbrs2.add(tok.text)
    if abbrs1 == abbrs2:
        return True
    return False

def match_latin(s1, s2):
    lat1 = set()
    lat2 = set()
    
    for tok in tokenize(s1):
        if ad.islatin(tok.text):
            lat1.add(tok.text)
    for tok in tokenize(s2):
        if ad.islatin(tok.text):
            lat2.add(tok.text)

    if lat1 == lat2:
        return True
    return False

In [25]:
df = df[df.apply(lambda x: match_abbr(x['text1'], x['text2']), axis=1)]

In [26]:
import tqdm

ner_data = []
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if match_ents(row['text1'], row['text2']):
        ner_data.append(row['id'])

100%|██████████| 598240/598240 [20:20<00:00, 490.19it/s]


In [27]:
df = df[df.id.isin(ner_data)]

In [28]:
len(df)

572276

In [29]:
def same(s1, s2):
    toks1 = set([_.text for _ in tokenize(s1) if _.text.isalpha()])
    toks2 = set([_.text for _ in tokenize(s2) if _.text.isalpha()])
    
    if toks1 == toks2:
        return True
    return False

In [30]:
df = df[~df.apply(lambda x: same(x['text1'], x['text2']), axis=1)]

In [31]:
df.to_csv('data/clean/opusparcus_clean.tsv', sep='\t', index=False)

In [32]:
len(df)

563332

In [33]:
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

In [34]:
def root_gender(s1, s2):
    try:
        g = set()
        d1 = Doc(s1)
        d1.segment(segmenter)
        d1.parse_syntax(syntax_parser)
        d1.tag_morph(morph_tagger)

        d2 = Doc(s2)
        d2.segment(segmenter)
        d2.parse_syntax(syntax_parser)
        d2.tag_morph(morph_tagger)

        root_g1 = g.add([_.feats['Gender'] for _ in d1.tokens if _.rel =='root'][0])
        root_g2 = g.add([_.feats['Gender'] for _ in d2.tokens if _.rel =='root'][0])
    #     print(g)
        if g == {'Fem', 'Masc'}:
            return False
    except:
        pass
    return True

In [35]:
root_gender('Уверена , что с ним всё в порядке .', 'Я уверен , он в порядке .')

False

In [39]:
gen_data = []

for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if root_gender(row['text1'], row['text2']):
        gen_data.append(row['id'])

100%|██████████| 563332/563332 [37:21<00:00, 251.36it/s]


In [50]:
df = df[df.id.isin(gen_data)]

In [51]:
len(df)

536985

In [56]:
from collections import Counter

c = Counter()

for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    c.update(Counter(row['text1']))
    c.update(Counter(row['text2']))

100%|██████████| 536985/536985 [00:35<00:00, 15084.35it/s]


In [59]:
df.to_csv('data/clean/opusparcus_clean.tsv', sep='\t', index=False)

In [None]:
# мужской vs женский род (по глаголу центральному root) --> можно попробовать повосстанавливать
спец символы
# 1 слово vs дофига слов
буква ё
# начало с ---
# многоточие?
# Маты убрать и междометия
# Фразы на иностранном языке 
# NER filtering
# числа? 
# повторы (нет нет нет)
думаешь vs думаете (вы ты в root)
# ru-N2362921, ru-N7967249 wtf
# мб удалить то что в скобках