In [None]:
import torch
import glob
from tqdm.notebook import tqdm
from tqdm import tqdm
tqdm.pandas()
import numpy as np

import wikipediaapi
import json
import jsonlines
from pathlib import Path

from nltk.tokenize import sent_tokenize

import matplotlib

import pandas as pd

import pathlib

# Language code of the non-German side; it is interpolated into the input
# file name, the DataFrame column names and the output file names below.
lang = 'bar'

# Directory that receives the mined sentence pairs. It is created up front
# (including parents) so later cells can write into it unconditionally.
output_path = 'bitext/bitext'
p = pathlib.Path(output_path)
p.mkdir(parents=True, exist_ok=True)

In [None]:
# Pick the GPU when torch reports one as available, otherwise fall back to CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
# Bare last expression so the notebook displays the chosen device.
device

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel

# Label for the pooling strategy; it is only used to tag output file names.
pool = 'CLS'

# Hugging Face identifier shared by the tokenizer and the sentence encoder.
model_id = "sentence-transformers/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = SentenceTransformer(model_id)

# Switch on CLS-token pooling on the sub-module registered under key "1"
# (presumably the Pooling layer of the LaBSE pipeline -- TODO confirm).
model._modules["1"].pooling_mode_cls_token = True

# Move the encoder to the selected device; as the last expression of the cell
# this also displays the model summary.
model.to(device)

In [None]:
def get_average_cosine_sim(sentences1, sentences2, tokenizer=tokenizer, model=model):
    """Average best-match cosine similarity from ``sentences1`` to ``sentences2``.

    Both sentence collections are encoded with ``model``; for every sentence in
    ``sentences1`` the highest cosine similarity against any sentence in
    ``sentences2`` is taken, and the mean of those maxima is returned.

    Args:
        sentences1: Sentences whose best matches are scored.
        sentences2: Candidate sentences to match against.
        tokenizer: Unused; kept so existing callers keep working.
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.

    Returns:
        The mean best-match similarity as a Python float, or ``0`` when
        encoding or scoring fails (for example on empty input).
    """
    try:
        embeddings1 = model.encode(sentences1, convert_to_tensor=True)
        embeddings2 = model.encode(sentences2, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embeddings1, embeddings2).to('cpu')
        avg_sim = torch.topk(cosine_scores, k=1).values.mean().item()
    except Exception:
        # Best-effort scoring: any failure counts as "no similarity".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a kernel interrupt still stops the run.
        avg_sim = 0
    return avg_sim

def align_sentences(sentences1, sentences2, model=model, is_sentence_model=True):
    """Pair every sentence in ``sentences1`` with its nearest neighbour in ``sentences2``.

    Both collections are encoded with ``model`` and compared by cosine
    similarity; each source sentence is matched to the single highest-scoring
    target sentence. Several source sentences may map to the same target.

    Args:
        sentences1: Source sentences; one output record is produced per entry.
        sentences2: Candidate target sentences.
        model: Encoder exposing ``encode(..., convert_to_tensor=True)``.
        is_sentence_model: Unused; kept for backward compatibility.

    Returns:
        A list of ``[source_sentence, best_target_sentence, cosine_similarity]``
        records in the order of ``sentences1``. An empty list is returned when
        either side has no sentences.
    """
    # Nothing to align: top-k over an empty candidate axis would raise, and an
    # empty source side yields no records anyway.
    if len(sentences1) == 0 or len(sentences2) == 0:
        return []

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)

    cosine_scores = util.cos_sim(embeddings1, embeddings2).to('cpu')

    # One top-1 search yields both the best score and its column index.
    best = torch.topk(cosine_scores, k=1)
    best_idx = best.indices.flatten().tolist()
    best_scores = best.values.flatten().tolist()

    return [
        [sent, sentences2[target_idx], score]
        for sent, target_idx, score in zip(sentences1, best_idx, best_scores)
    ]

In [None]:
# Load the page-level alignment. The 'de' column is renamed to 'de_title', and
# pages missing the sentence list on either side are dropped.
df = (
    pd.read_feather(f'wiki/{lang}_aligned_pages')
    .rename(columns={'de': 'de_title'})
    .dropna(subset=[f'{lang}_sentences', 'de_sentences'])
)

# For every aligned page pair, match each {lang} sentence to its most similar
# German sentence and keep the page ids and titles alongside the pair.
records = []
for _page_idx, row in tqdm(df.iterrows(), total=len(df)):
    page_info = [row[f'{lang}_id'], row['de_id'], row[f'{lang}_title'], row['de_title']]
    # The sentence cells are converted with .tolist() before encoding.
    aligned_sents = align_sentences(row[f'{lang}_sentences'].tolist(), row['de_sentences'].tolist())
    for sentence_pair in aligned_sents:
        records.append(page_info + sentence_pair)

# Highest-similarity pairs first, with a fresh 0..n-1 index.
parallel_sents_df = (
    pd.DataFrame.from_records(
        records,
        columns=[f'{lang}_id', 'de_id', f'{lang}_title', 'de_title', f'{lang}_sent', 'de_sent', 'cos_sim'],
    )
    .sort_values(by='cos_sim', ascending=False)
    .reset_index(drop=True)
)

# Keep only the part after the organisation prefix for file names. Taking the
# last component (instead of index 1) makes this safe to re-run once model_id
# has already been shortened.
model_id = model_id.split('/')[-1]
parallel_sents_df.to_csv(f'{output_path}/{model_id}-{pool}_{lang}.csv')

In [None]:

def num_tokens(text):
    """Return the number of word tokens in ``text``.

    A token is a maximal run of word characters (regex ``\\w+``), so
    punctuation and whitespace only act as separators.

    Args:
        text: The string to count tokens in.

    Returns:
        The number of ``\\w+`` matches found in ``text``.
    """
    import re
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    tokens = re.findall(r'\w+', text)
    return len(tokens)

def replace_period(text):
    """Collapse '. .' into '.' for sentences that end with '. .'.

    When ``text`` ends with '. .', every occurrence of '. .' in it is
    replaced by a single '.'; any other text is returned unchanged.
    """
    has_doubled_ending = text.endswith('. .')
    return text.replace('. .', '.') if has_doubled_ending else text

# Read the aligned sentence pairs back from the CSV, using its first column
# as the index.
parallel_sents_df = pd.read_csv(
    f'{output_path}/{model_id}-{pool}_{lang}.csv',
    index_col=0,
)

In [None]:
# Start from the unfiltered alignment output so this cell gives the same
# result every time it is re-run.
parallel_sents_df = pd.read_csv(f'{output_path}/{model_id}-{pool}_{lang}.csv', index_col = 0)

# Inclusive bounds on the number of word tokens a sentence may have.
MIN_TOKENS, MAX_TOKENS = 5, 20

# Collapse doubled sentence-final periods ('. .') on both sides.
for col in (f'{lang}_sent', 'de_sent'):
    parallel_sents_df[col] = parallel_sents_df[col].apply(replace_period)
print('replace_period', len(parallel_sents_df))

# Token counts per side; the new columns are also written to the output CSV.
for side in (lang, 'de'):
    parallel_sents_df[f'{side}_num_tokens'] = parallel_sents_df[f'{side}_sent'].apply(num_tokens)
print('num tokens',len(parallel_sents_df))

# Keep only pairs whose German sentence is within the token bounds ...
de_tokens = parallel_sents_df['de_num_tokens']
parallel_sents_df = parallel_sents_df[(de_tokens >= MIN_TOKENS) & (de_tokens <= MAX_TOKENS)]
print('filter de num tokens', len(parallel_sents_df))

# ... and whose {lang} sentence is within the same bounds.
lang_tokens = parallel_sents_df[f'{lang}_num_tokens']
parallel_sents_df = parallel_sents_df[(lang_tokens >= MIN_TOKENS) & (lang_tokens <= MAX_TOKENS)]
print(f'filter {lang} num tokens', len(parallel_sents_df))

# Drop sentences containing an empty bracket pair '[]'. The pattern is matched
# literally (regex=False), which avoids invalid escapes such as '\[' in a
# non-raw string.
for col in ('de_sent', f'{lang}_sent'):
    parallel_sents_df = parallel_sents_df[~parallel_sents_df[col].str.contains('[]', regex=False)]
print(f'filter braces', len(parallel_sents_df))

# Same for an empty parenthesis pair '()'.
for col in ('de_sent', f'{lang}_sent'):
    parallel_sents_df = parallel_sents_df[~parallel_sents_df[col].str.contains('()', regex=False)]
print(f'filter braces', len(parallel_sents_df))

# Drop sentences that end in ':.'.
for col in ('de_sent', f'{lang}_sent'):
    parallel_sents_df = parallel_sents_df[~parallel_sents_df[col].str.endswith(':.')]
print(f'filter incomplete sents', len(parallel_sents_df))

# Persist the cleaned pairs next to the unfiltered file.
parallel_sents_df.to_csv(f'{output_path}/{model_id}-{pool}_{lang}_clean.csv')