In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd


model_checkpoint = 'cointegrated/rubert-base-cased-nli-twoway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

def predict_pair(text1, text2):
    with torch.inference_mode():
        out = model(**tokenizer(text1, text2, return_tensors='pt').to(model.device))
        proba = torch.softmax(out.logits, -1).cpu().numpy()[0]
        return {v: proba[k] for k, v in model.config.id2label.items()}['entailment']

In [3]:
import os

In [24]:
news = [l.strip().split('\t') for l in open(f"data/raw/zipped/news.tsv") if len(l.split('\t')) == 2]
backed = [l.strip().split('\t') for l in open(f"data/raw/zipped/pairs450.tsv") if len(l.split('\t')) == 2]
subs = [l.strip().split('\t') for l in open(f"data/raw/zipped/subtitles.tsv") if len(l.split('\t')) == 2]

In [29]:
from collections import Counter
c = Counter([len(l) for l in subs])

In [42]:
data = {"news": news,
        "backed": backed,
        "subs": subs}

In [89]:
import tqdm

with open('data/rysshe_entailment.tsv', 'w', encoding='utf8') as f:
    for k in data:
        for text1, text2 in tqdm.tqdm(data[k]):
            ent1 = str(float(predict_pair(text1, text2)))
            ent2 = str(float(predict_pair(text2, text1)))
            f.write('\t'.join([k, text1, text2, ent1, ent2])+'\n')

  4%|▍         | 88350/2337604 [14:23<6:06:03, 102.41it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 12%|█▏        | 288521/2337604 [47:08<5:33:32, 102.39it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 21%|██        | 493230/2337604 [1:20:54<5:01:08, 102.08it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config vari

In [103]:
df = [l.split('\t') for l in open('data/rysshe_entailment.tsv').readlines()]

In [104]:
from collections import Counter

df = pd.DataFrame(df, columns=['kind', 'text1', 'text2', 'ent1', 'ent2'])

In [108]:
df['ent1'] = df['ent1'].astype(float)
df['ent2'] = df['ent2'].astype(float)

In [111]:
# df.to_csv('data/rysshe_entailment.tsv', sep='\t', index=False)

In [4]:
import pandas as pd
df = pd.read_csv('data/rysshe_entailment.tsv', sep='\t')

In [123]:
'Ğ�Ğ¿Ñ€ĞµĞ´ĞµĞ»Ğ¸'

'Ğ�Ğ¿Ñ€ĞµĞ´ĞµĞ»Ğ¸'

In [116]:
'самолеты работники'

'самолеты работники'

In [7]:
df = df[(df.ent1 > 0.7) & (df.ent2 > 0.7)]

In [10]:
len(df)

313861

In [11]:
df = df[~(df.text1.str.lower().str.contains('чёрт|черт|бля|хрен|целк|хер|чё |че') | df.text2.str.lower().str.contains('чёрт|черт|бля|хрен|целк|хер|чё |че'))]

In [20]:
df = df[(df.text1.str.len() + df.text2.str.len()) > 35]

In [22]:
df = df[(df.text1.str.split().apply(len) > 3) & (df.text2.str.split().apply(len) > 3)]

In [24]:
df = df[~(df.text1.apply(lambda x: '...' in x)) | (df.text2.apply(lambda x: '...' in x))]

In [25]:
len(df)

300817

In [26]:
def vity(row):
    if 'вы' in row['text1'].lower() and 'ты' in row['text2'].lower():
        return True
    elif 'вы' in row['text2'].lower() and 'ты' in row['text1'].lower():
        return True
    return False

In [27]:
df = df[~df.apply(vity, axis=1)]

In [29]:
from alphabet_detector import AlphabetDetector
ad = AlphabetDetector()


df = df[~(df.text1.apply(lambda x: ad.is_latin(x)))]
df = df[~(df.text2.apply(lambda x: ad.is_latin(x)))]

In [31]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [32]:
df['leven'] = df.apply(lambda x: similar(x['text1'], x['text2']), axis=1)

In [35]:
df = df[df['leven'] < 0.85]

In [56]:
from razdel import tokenize

sw = ('ну', 'хуй', 'хуя', 'ээ', 'эээ', 'мэм',
      'ох', 'ах', 'вау', 'бог', 'боже', 'эй', '...', '—', 'дерьмо')
def stopwords(s):
    for tok in tokenize(s):
        if tok.text.lower() in sw:
            return True
    return False

def repeat(s):
    toks = [_.text for _ in tokenize(s)]
    if len(toks) / len(set(toks)) > 2:
        return True
    return False

In [53]:
df = df[~df.text1.apply(stopwords)]
df = df[~df.text2.apply(stopwords)]

In [57]:
df = df[~df.text1.apply(repeat)]
df = df[~df.text2.apply(repeat)]

In [68]:
df = df[~df.text1.apply(lambda x: 'хер' in x)]
df = df[~df.text2.apply(lambda x: 'хер' in x)]

In [307]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)
from collections import Counter
from razdel import tokenize

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

def match_ents(s1, s2):
    doc1, doc2 = Doc(s1), Doc(s2)
    
    doc1.segment(segmenter)
    doc2.segment(segmenter)
    
    doc1.tag_ner(ner_tagger)
    doc2.tag_ner(ner_tagger)
    print(doc1.spans, doc2.spans)
    if Counter([s.type for s in doc1.spans]) != Counter([s.type for s in doc2.spans]):
        return False
    
    numbers1 = set()
    numbers2 = set()
    
    for tok in doc1.tokens:
        if has_numbers(tok.text):
            numbers1.add(tok.text)
    
    for tok in doc2.tokens:
        if has_numbers(tok.text):
            numbers2.add(tok.text)
    
    if numbers1 != numbers2:
        return False
    
    return True
    

def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

def match_abbr(s1, s2):
    abbrs1 = set()
    abbrs2 = set()
    
    for tok in tokenize(s1):
        if tok.text.isupper() and len(tok.text) > 1:
            abbrs1.add(tok.text)
    for tok in tokenize(s2):
        if tok.text.isupper() and len(tok.text) > 1:
            abbrs2.add(tok.text)
    if abbrs1 == abbrs2:
        return True
    return False

def match_latin(s1, s2):
    lat1 = set()
    lat2 = set()
    
    for tok in tokenize(s1):
        if ad.is_latin(tok.text) and tok.text.isalpha():
            lat1.add(tok.text)
    for tok in tokenize(s2):
        if ad.is_latin(tok.text) and tok.text.isalpha():
            lat2.add(tok.text)

    if lat1 == lat2:
        return True
    return False

In [71]:
df = df[df.apply(lambda x: match_abbr(x['text1'], x['text2']), axis=1)]

In [72]:
len(df)

221801

In [74]:
import tqdm

ner_data = []
for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if match_ents(row['text1'], row['text2']):
        ner_data.append(i)

100%|██████████| 221801/221801 [10:46<00:00, 343.12it/s]


In [240]:
df = df[df.index.isin(ner_data)]

In [244]:
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

In [245]:
def root_gender(s1, s2):
    try:
        g = set()
        d1 = Doc(s1)
        d1.segment(segmenter)
        d1.parse_syntax(syntax_parser)
        d1.tag_morph(morph_tagger)

        d2 = Doc(s2)
        d2.segment(segmenter)
        d2.parse_syntax(syntax_parser)
        d2.tag_morph(morph_tagger)

        root_g1 = g.add([_.feats['Gender'] for _ in d1.tokens if _.rel =='root'][0])
        root_g2 = g.add([_.feats['Gender'] for _ in d2.tokens if _.rel =='root'][0])
    #     print(g)
        if g == {'Fem', 'Masc'}:
            return False
    except:
        pass
    return True

In [246]:
gen_data = []

for i, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    if root_gender(row['text1'], row['text2']):
        gen_data.append(i)

100%|██████████| 189680/189680 [15:21<00:00, 205.82it/s]


In [258]:
df = df[(df.index.isin(gen_data) & (df.kind=='subs')) | (df.kind!='subs')]

In [308]:
df = df[df.apply(lambda x: match_latin(x['text1'], x['text2']), axis=1)]

In [374]:
df = df[~df.text1.apply(lambda x: 'блин' in x)]
df = df[~df.text2.apply(lambda x: 'блин' in x)]

In [383]:
df = df[~df.text1.apply(lambda x: ('че ' in x.lower()) or ('чё ' in x.lower()))]
df = df[~df.text2.apply(lambda x: ('че ' in x.lower()) or ('чё 'in x.lower()))]

In [407]:
q

kind                                                  news
text1    Дочь Стивена Спилберга решила сделать карьеру ...
text2           Дочь Стивена Спилберга стала порноактрисой
ent1                                              0.982754
ent2                                              0.959038
leven                                             0.612613
Name: 143101, dtype: object

In [422]:
q = df.sample(3).iloc[0]

print(q['text1'])
print(q['text2'])

А как я могу доверять тебе
Откуда мне знать что я могу верить вам


In [424]:
from collections import Counter

c = Counter()

for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    c.update(Counter(row['text1']))
    c.update(Counter(row['text2']))

100%|██████████| 176747/176747 [00:12<00:00, 13685.46it/s]


In [446]:
garbage = set(['ї',
 '„',
 'Ђ',
 'Ќ',
 '\\',
 '”',
 '#',
 'j',
 '•',
 'ќ',
 'Є',
 'q',
 'ћ',
 '°',
 '…',
 'Ь',
 '§',
 '·',
 '@',
 'ƒ',
 '®',
 '—',
 '}',
 '’',
 'Ѕ',
 'Ћ',
 '~',
 '►',
 '\xad',
 '^',
 'Η',
 '™',
 '●',
 'ä',
 '`',
 'І',
 '×',
 'ѣ',
 'ο',
 'є',
 '‘',
 '±',
 '{',
 '・',
 'ń',
 '◈',
 '▶',
 'á',
 'ü',
 '\x07',
 'ā',
 '½',
 'ѐ',
 '◊',
 '−',
 '\x1f',
 'ö',
 'Π',
 '£',
 'º',
 'Џ',
 'ό',
 '═',
 '¬',
 '\u200e',
 '╣',
 '╚',
 '╩',
 '└',
 '\ufeff',
 '‐',
 '─',
 'β',
 '→',
 'Τ'])

In [449]:
'|'.join(garbage)

'§|◊|}|\\|◈|~|™|á|ü|\u200e|・|ń|‘|ο|ā|\x1f|{|Η|└|▶|°|®|^|\ufeff|ї|…|І|Π|£|\x07|‐|„|j|”|►|╩|Τ|`|Ѕ|Ь|q|Є|ѣ|є|●|½|→|×|Ќ|·|\xad|ό|╚|ћ|╣|ƒ|Ћ|═|ќ|−|ö|β|º|±|’|—|Џ|Ђ|#|ä|ѐ|─|@|¬|•'

In [453]:
def has_garbage(s):
    s = set(s)
    for el in garbage:
        if el in s:
            return True
    return False

In [456]:
df = df[~df.text1.apply(has_garbage)]
df = df[~df.text2.apply(has_garbage)]

In [458]:
df.to_csv('data/clean/rysshe_clean.tsv', sep='\t', index=False)

In [459]:
df

Unnamed: 0,kind,text1,text2,ent1,ent2,leven
230,news,Трамп рассказал о ситуации с коронавирусом в США,Трамп оценил ситуацию с коронавирусом в США,0.842401,0.859896,0.813187
248,news,В Иране коронавирусом заразился высокопоставле...,В Иране чиновник заболел коронавирусом,0.969853,0.771793,0.432990
255,news,Иранский министр заразился коронавирусом,Высокопоставленный иранский чиновник заразился...,0.893407,0.867574,0.700000
279,news,У замминистра здравоохранения Ирана выявили ко...,Коронавирус поразил замминистра здравоохранени...,0.854719,0.842087,0.629630
297,news,Замминистра здравоохранения Ирана заразился ко...,Коронавирус поразил замминистра здравоохранени...,0.989599,0.901100,0.581818
...,...,...,...,...,...,...
3098363,subs,С ней всё будет в порядке,С тобой все будет хорошо,0.949547,0.971785,0.612245
3098364,subs,С ней всё в порядке,С тобой все в порядке,0.943584,0.966565,0.800000
3098366,subs,Нет дело не в этом,Я не это имел в виду,0.825783,0.871824,0.421053
3098384,subs,Что ты об этом думаешь,Что ты хочешь этим сказать,0.791357,0.848458,0.625000
