# Requirements and Initialization

In [2]:
%%capture
!pip install datasets nltk transformers[sentencepiece] torch tqdm

In [None]:
import torch
import nltk

from tqdm import tqdm
from typing import List
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from datasets import load_dataset

# Sentence-tokenizer data needed by sent_tokenize. Recent NLTK releases load
# the 'punkt_tab' resource instead of the older pickled 'punkt' models, and the
# pip install above does not pin a version, so fetch both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Checkpoint identifier handed to the tokenizer/model from_pretrained() calls.
model_ckp = 'hyunussarioglu/tr-paraphrase-mt5-base-ost'

# Keyword arguments unpacked into model.generate() for every batch.
generate_args = dict(
    max_new_tokens=60,
    do_sample=True,
    top_p=0.95,
    top_k=100,
    num_return_sequences=1,
)

# Utils

In [4]:
def convert_to_sublists(lst: List, k: int) -> List[List]:
    """Split ``lst`` into consecutive chunks of at most ``k`` items.

    Args:
        lst: The list to split.
        k: Maximum chunk length; must be a positive integer.

    Returns:
        The chunks in their original order. Only the last chunk may be
        shorter than ``k``. An empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    # A negative step makes range() empty, which would silently drop every
    # element, so reject non-positive sizes explicitly.
    if k <= 0:
        raise ValueError(f'k must be a positive integer, got {k}')
    return [lst[i:i + k] for i in range(0, len(lst), k)]

In [5]:
def flatten_list(l: List[List]) -> List:
    """Concatenate the items of every sublist of ``l`` into one flat list.

    Only one level of nesting is removed, and item order is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Read Data

In [6]:
# Which dataset to read and the on-disk format of its split files.
dataset_file_type, dataset_type, dataset_name = 'csv', 'sentence-similarity', 'stsb-tr'
splits = ['train', 'test']

# Layout: <datasets_path>/<dataset_type>/<dataset_name>
datasets_path = Path('../datasets/augmentation')
dataset_path = datasets_path.joinpath(dataset_type, dataset_name)

In [None]:
# File suffix used by the split files of each supported loader type.
split_file_suffixes = {'csv': 'csv', 'json': 'jsonlines'}

# Fail early on an unknown file type before touching the filesystem.
if dataset_file_type not in split_file_suffixes:
    raise Exception(f'File type [{dataset_file_type}] not supported!')

# One file per split: <dataset_path>/<split>.<suffix>
split_files = {
    split: str(dataset_path / f'{split}.{split_file_suffixes[dataset_file_type]}')
    for split in splits
}
dataset = load_dataset(dataset_file_type, data_files=split_files).with_format('torch')

In [15]:
# Split every training text into sentences (Turkish sentence tokenizer).
sentences_per_text = [
    sent_tokenize(text, language='turkish')
    for text in dataset['train']['text']
]

# Remember how many sentences each text produced so the flat list can be
# regrouped per text later on.
n_sentences = list(map(len, sentences_per_text))

# Flat list of sentences, in text order.
train_texts = flatten_list(sentences_per_text)

In [16]:
# Batch the sentences without shuffling, so the output order matches the
# order of train_texts.
batch_size = 256
train_text_dataloader = DataLoader(
    train_texts,
    batch_size=batch_size,
    shuffle=False,
)

# Paraphrase

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint, then
# move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_ckp)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckp).to(device)

In [None]:
# Paraphrase the sentences batch by batch. Results are appended in dataloader
# order, so outputs lines up with the flattened sentence list.
outputs = []

for batch in tqdm(train_text_dataloader):
    # Pad only up to the longest sentence in the batch rather than always to
    # max_length: padded positions are masked out anyway, so padding every
    # batch to 128 tokens only adds wasted computation for short sentences.
    # Longer inputs are still truncated to 128 tokens.
    tokenized = tokenizer(batch,
                          padding=True,
                          truncation=True,
                          max_length=128,
                          return_tensors='pt')
    tokenized = tokenized.to(device)
    output = model.generate(**tokenized, **generate_args)
    output = tokenizer.batch_decode(output, skip_special_tokens=True)
    outputs.extend(output)

In [None]:
# Regroup the flat list of paraphrased sentences into one paraphrase per
# original text, using the per-text sentence counts in n_sentences.
#
# The sentences are joined with a single space. A word-level detokenizer is
# not appropriate here: the items are whole sentences, not word tokens, and
# its rewrite rules could alter quotes and punctuation spacing in the output.

# Slicing by n_sentences is only valid when there is exactly one output per
# input sentence; otherwise texts would silently receive the wrong sentences.
if sum(n_sentences) != len(outputs):
    raise ValueError(
        f'Expected {sum(n_sentences)} paraphrased sentences, got {len(outputs)}'
    )

paraphrases = []
i = 0
for n in n_sentences:
    paraphrases.append(' '.join(sentence.strip() for sentence in outputs[i:i + n]))
    i += n

In [None]:
# Convert both splits to DataFrames before writing anything.
train_df, test_df = (dataset[split].to_pandas() for split in ('train', 'test'))

# Only the training split was paraphrased; add its new column.
train_df = train_df.assign(paraphrase=paraphrases)

# Write both splits next to the source files, prefixed with the dataset name.
train_df.to_csv(dataset_path / f'{dataset_name}_train.csv', index=False)
test_df.to_csv(dataset_path / f'{dataset_name}_test.csv', index=False)