In [1]:
import json
import os
from collections import defaultdict

import nltk
import pandas as pd
from tqdm import tqdm

bigram_df = pd.read_csv('../data/processed/word_bigrams/oshhamaho.csv')

with open('../data/processed/oshhamaho.txt', 'r') as f:
    text = f.read()

output_path = '../data/processed/bigrams_sent/'
os.makedirs(output_path, exist_ok=True)

for i in range(1, 10):
    len_min = 10 * i
    len_max = 10 * i + 10

    short_sents = set([sent.replace('\n', ' ') for sent in nltk.sent_tokenize(text) if len_min < len(sent) < len_max])

    bigrams_sent = defaultdict(list)
    for bigram in tqdm(bigram_df['w_bigram']):
        for sent in short_sents:
            if bigram in sent:
                bigrams_sent[bigram].append(sent)
            if len(bigrams_sent[bigram]) > 10:
                break

    with open(f'../data/processed/bigrams_sent/{len_min}_{len_max}.json', 'w') as f:
        json.dump(bigrams_sent, f, ensure_ascii=False, indent=4)

100%|██████████| 2945/2945 [00:09<00:00, 320.60it/s]
100%|██████████| 2945/2945 [00:15<00:00, 186.34it/s]
100%|██████████| 2945/2945 [00:18<00:00, 161.71it/s]
100%|██████████| 2945/2945 [00:18<00:00, 156.02it/s]
100%|██████████| 2945/2945 [00:20<00:00, 145.74it/s]
100%|██████████| 2945/2945 [00:20<00:00, 145.35it/s]
100%|██████████| 2945/2945 [00:18<00:00, 160.54it/s]
100%|██████████| 2945/2945 [00:15<00:00, 196.09it/s]
100%|██████████| 2945/2945 [00:12<00:00, 232.16it/s]


In [2]:
import random

sent_pairs = []
for bg_sent_file in os.listdir('../data/processed/bigrams_sent/'):
    with open(f'../data/processed/bigrams_sent/{bg_sent_file}', 'r') as f:
        bigrams_sent = json.load(f)
    
    anti_pairs = []
    for bigram, sents in bigrams_sent.items():
        if len(sents) < 2:
            continue

        sent_product = [(bigram, s1, s2) for s1 in sents for s2 in sents if s1 != s2]
        sent_pairs.extend(sent_product)
        
        if len(sent_pairs) < 1000:
            continue

        for sent in sents:
            for _ in range(5):
                anti_pairs.append(('random', sent, random.choice(sent_pairs)[2]))

    sent_pairs.extend(anti_pairs)

In [3]:
sent_pairs_df = pd.DataFrame(sent_pairs, columns=['bigram', 'sent1', 'sent2'])
sent_pairs_df.sort_values('bigram', inplace=True)
sent_pairs_df['sent1'] = sent_pairs_df['sent1'].str.replace('\n', ' ')
sent_pairs_df['sent2'] = sent_pairs_df['sent2'].str.replace('\n', ' ')

In [4]:
import fasttext
from scipy import spatial

kbd_model = fasttext.load_model('../data/processed/embeddings/fasttext_skipgram_kbd_100.bin')


def get_sent_distance(sent_1, sent_2):
    vec_1 = kbd_model.get_sentence_vector(sent_1)
    vec_2 = kbd_model.get_sentence_vector(sent_2)

    # cosine similarity by sklearn
    return 1 - spatial.distance.cosine(vec_1, vec_2)



In [5]:
sent_pairs_df['vec_similarity'] = sent_pairs_df.apply(lambda row: get_sent_distance(row['sent1'], row['sent2']), axis=1)

In [6]:
mean_bigram_len = sent_pairs_df[sent_pairs_df['bigram'] != 'random']['bigram'].apply(lambda x: len(x)).mean()
sent_pairs_df['bigram_len'] = sent_pairs_df['bigram'].apply(lambda x: float(mean_bigram_len) * 0.8 if x == 'random' else len(x))

sent_pairs_df['sent1_len'] = sent_pairs_df['sent1'].apply(lambda x: len(x))
sent_pairs_df['sent2_len'] = sent_pairs_df['sent2'].apply(lambda x: len(x))

# calculate the score for each pair based on the length of the bigram and sentence length (from 0 to 1)
sent_pairs_df['correction_score'] = sent_pairs_df.apply(
    lambda row: 1 - (abs(row['bigram_len'] - mean_bigram_len) / mean_bigram_len + abs(row['sent1_len'] - row['sent2_len']) / (row['sent1_len'] + row['sent2_len'])), axis=1)

sent_pairs_df['final_score'] = sent_pairs_df['vec_similarity'] * sent_pairs_df['correction_score']

In [7]:
# drop random bigram pairs if vec_similarity is too high
sent_pairs_df = sent_pairs_df[~((sent_pairs_df['bigram'] == 'random') & (sent_pairs_df['vec_similarity'] > 0.8))]
sent_pairs_df.drop_duplicates(subset=['bigram', 'sent1', 'sent2'], inplace=True)

In [8]:
sent_pairs_df_anti = sent_pairs_df[sent_pairs_df['bigram'] == 'random']
sent_pairs_df_anti

Unnamed: 0,bigram,sent1,sent2,vec_similarity,bigram_len,sent1_len,sent2_len,correction_score,final_score
316882,random,НэгъуэщI къэралхэм щыпсэу адыгэхэм папщIэ респ...,ЩоджэнцIыкIу Алий и тхыгъэхэм кIуэцIрыкIыу къы...,0.700045,9.056288,99,96,0.784615,0.549266
316881,random,НэгъуэщI къэралхэм щыпсэу адыгэхэм папщIэ респ...,"ИужькIэ десэжащ: къуажэм дэс псори зэроцIыху, ...",0.665268,9.056288,99,89,0.746809,0.496828
316880,random,Я адэжь лъахэм щыпсэу адыгэхэм ятеухуа пэжри д...,1946 гъэм алыдж-урым бэнэкIэмкIэ Европэм и чем...,0.563241,9.056288,98,88,0.746237,0.420311
316879,random,Я адэжь лъахэм щыпсэу адыгэхэм ятеухуа пэжри д...,"Бгырыс хабзэм ипкъ иткIэ, псалъэмакъыр къыхэзы...",0.634261,9.056288,98,62,0.575000,0.364700
316878,random,Я адэжь лъахэм щыпсэу адыгэхэм ятеухуа пэжри д...,Нэхъ IэнатIэ лъагэ ягъэкIуэнуми блэкIа щыIэтэк...,0.630854,9.056288,98,84,0.723077,0.456156
...,...,...,...,...,...,...,...,...,...
167488,random,"КIэщIу жыпIэмэ, абы теухуауэ мифологием зэхидж...",1965-1970 гъэхэм ЩоджэнцIыкIу Iэдэм «Iуащхьэма...,0.515844,9.056288,69,88,0.678981,0.350248
167489,random,КъБАССР-м и къэрал архивым абы теухуауэ щIэлъы...,Къызэрысхуэмейри къызэсэн мурад зэримыIэри япэ...,0.646413,9.056288,62,81,0.667133,0.431243
71531,random,– Иджыри къэс бэлыхьу стелъар пщIэншэ щыхъуакI...,Къэбэрдей литературэм нобэ бжьыпэр щызыIыгъыр ...,0.597313,9.056288,86,89,0.782857,0.467610
62457,random,"Зы сыхьэт дэкIатэкъым, Джабий-ефэнды Хьэрамэ к...","Рамзес шордакъым къытехьэрти, еш имыщIэу, маху...",0.738766,9.056288,89,88,0.794350,0.586839


In [9]:
sent_pairs_df.to_csv('../data/processed/sent_pairs.csv', index=False)