## Style Transfer

* Install Styleformer from GitHub
* Load the Styleformer models
* Filter the test set for fake news articles
* Transform the style of the filtered test set from casual to formal

In [44]:
from shared import create_tensor_dataset, \
    test_dataset_path
!pip install git+https://github.com/PrithivirajDamodaran/Styleformer.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/PrithivirajDamodaran/Styleformer.git
  Cloning https://github.com/PrithivirajDamodaran/Styleformer.git to /private/var/folders/6w/6hnvrrg501z8ycmdvfyk59n00000gn/T/pip-req-build-a5ufmd36
  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Styleformer.git /private/var/folders/6w/6hnvrrg501z8ycmdvfyk59n00000gn/T/pip-req-build-a5ufmd36
  Resolved https://github.com/PrithivirajDamodaran/Styleformer.git to commit 02c9a7fd6798bf5bbbb04456e5068566d6caef55
  Preparing metadata (setup.py) ... [?25ldone


In [None]:
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
import nltk

from styleformer import Adequacy

In [None]:
# Fetch the NLTK resources used by this notebook. nltk.sent_tokenize needs the Punkt
# sentence models: recent NLTK releases load them from "punkt_tab", older releases
# from "punkt", so both are requested to work across versions.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS, then CPU.
device = torch.device(
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

In [None]:
# Import the test set from the path exported by `shared`.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load dataset files from a trusted source.
# Presumably `test` is a pandas DataFrame (it is filtered by a "label" column below) — confirm.
test = torch.load(test_dataset_path)
test.shape

In [None]:
# Select the fake news articles (label == 0) at positions 1000-1999 of the filtered
# test set, i.e. up to 1000 rows.
# .copy() makes this an independent frame, so adding columns to it later does not
# write into a slice of `test` (avoids pandas' SettingWithCopyWarning).
test_fake_news = test[test["label"] == 0].iloc[1000:2000].copy()
test_fake_news.shape

In [None]:
# Model identifier of the casual-to-formal seq2seq model (passed to from_pretrained).
CTF_MODEL_TAG = "prithivida/informal_to_formal_styletransfer"
# Model identifier handed to Styleformer's Adequacy scorer.
ADEQUACY_MODEL_TAG = "prithivida/parrot_adequacy_model"

# Number of candidate rewrites sampled per sentence (used as num_return_sequences).
MAX_CANDIDATES = 5

In [None]:
# Load the tokenizer and seq2seq model for casual-to-formal transfer, plus the
# adequacy scorer used to rank the generated candidates.
# use_auth_token=False: no authentication token is sent when fetching the models.
# NOTE(review): ctf_model is not moved to `device` here, so generation runs wherever
# from_pretrained places it (presumably CPU) — confirm this is intended.
ctf_tokenizer = AutoTokenizer.from_pretrained(CTF_MODEL_TAG, use_auth_token=False)
ctf_model = AutoModelForSeq2SeqLM.from_pretrained(CTF_MODEL_TAG, use_auth_token=False)
adequacy = Adequacy(model_tag=ADEQUACY_MODEL_TAG)

In [None]:
def apply_st_to_sentence(src_sentence, max_length=32):
    """Rewrite a single sentence from casual to formal style.

    Samples ``MAX_CANDIDATES`` rewrites from the casual-to-formal model, scores every
    distinct candidate against the source sentence with the adequacy model, and
    returns the highest-scoring one.

    Args:
        src_sentence: The sentence to transform.
        max_length: Maximum number of tokens generated per candidate; longer
            rewrites are cut off at this length. Defaults to 32.

    Returns:
        The best-scoring candidate. A blank input yields an empty string, and the
        unchanged source sentence is returned if the scorer keeps no candidate.
    """
    if not src_sentence.strip():
        # Nothing to rewrite; the model would only generate text from the bare prefix.
        return src_sentence.strip()

    ctf_prefix = "transfer Casual to Formal: "
    input_ids = ctf_tokenizer.encode(ctf_prefix + src_sentence, return_tensors="pt")
    # Keep the inputs on whatever device the model lives on.
    input_ids = input_ids.to(ctf_model.device)

    # Inference only: no autograd graph is needed for generation or scoring.
    with torch.no_grad():
        preds = ctf_model.generate(
            input_ids,
            do_sample=True,
            max_length=max_length,
            top_k=10,
            top_p=0.95,
            early_stopping=False,
            num_return_sequences=MAX_CANDIDATES,
        )

        # De-duplicate while preserving generation order, so tie-breaking between
        # equally scored candidates does not depend on string hashing.
        gen_sentences = list(
            dict.fromkeys(
                ctf_tokenizer.decode(pred, skip_special_tokens=True).strip()
                for pred in preds
            )
        )

        # Threshold 0 keeps every candidate; the result maps candidate -> score.
        adequacy_scored_phrases = adequacy.score(src_sentence, gen_sentences, 0, device)

    if not adequacy_scored_phrases:
        # No candidate was scored; fall back to the untransformed sentence.
        return src_sentence
    return max(adequacy_scored_phrases, key=adequacy_scored_phrases.get)


def apply_st_to_paragraph(input_sentence):
    """Style-transfer a multi-sentence text sentence by sentence.

    The text is split with ``nltk.sent_tokenize``, each sentence is passed through
    ``apply_st_to_sentence``, and the results are joined with single spaces.

    Args:
        input_sentence: The text to transform. Non-string values (e.g. None or NaN
            from a missing DataFrame cell) are treated as empty text.

    Returns:
        The transformed text, or an empty string when there is nothing to transform.
    """
    # Missing cells are not strings and would make sent_tokenize raise.
    text = input_sentence if isinstance(input_sentence, str) else ""

    sentences_transformed = [
        apply_st_to_sentence(sentence) for sentence in nltk.sent_tokenize(text)
    ]

    concatenated_sentences = " ".join(sentences_transformed).strip()
    if not concatenated_sentences:
        # Report which input produced an empty result (the result itself is always "").
        print("empty:")
        print(repr(input_sentence))

    return concatenated_sentences


# Style-transfer titles and article bodies, then join them into one text field.
# Building a new frame with .assign() (instead of setting columns on the filtered
# slice) avoids pandas' SettingWithCopyWarning and keeps this cell re-runnable.
test_fake_news = test_fake_news.assign(
    st_title=test_fake_news["title"].apply(apply_st_to_paragraph),
    st_text=test_fake_news["text"].apply(apply_st_to_paragraph),
)
test_fake_news = test_fake_news.assign(
    st_all_text=test_fake_news["st_title"] + "\n\n\n\n" + test_fake_news["st_text"]
)

In [None]:
# Preview the first 10 rows, including the style-transferred columns.
test_fake_news.head(10)

In [None]:
# Inspect the combined style-transferred title + text column.
test_fake_news["st_all_text"]

In [None]:
# Persist the frame (original and style-transferred columns) as CSV, without the index.
# NOTE(review): the path is hard-coded here although `transferred_fake_csv_path`,
# defined in a later cell, points to the same file — consider defining paths first.
test_fake_news.to_csv("data/transferred_fake_testset.csv", index=False)

In [None]:
# Build datasets from the same rows via the `shared` helper: once with its default
# arguments and once with "st_all_text" passed as the second argument.
# Presumably the second argument names the text column to use — verify against
# create_tensor_dataset in `shared`.
untransferred_dataset = create_tensor_dataset(test_fake_news)
transferred_dataset = create_tensor_dataset(test_fake_news, "st_all_text")

In [None]:
from pathlib import Path

# Directory layout, relative to the notebook's working directory.
WORKDIR = Path(".")
DATADIR = WORKDIR / "data"
MODELDIR = WORKDIR / "model"

# Location of the dataset CSV.
dataset_path = WORKDIR / 'dataset.csv'

# Train/test dataset files.
# NOTE(review): `test_dataset_path` is also imported from `shared` in the first cell;
# this assignment rebinds (shadows) that name for all cells run afterwards.
train_dataset_path = DATADIR / 'train_dataset.pt'
test_dataset_path = DATADIR / 'test_dataset.pt'

# Tokenized variants of the train/test datasets.
tokenized_train_dataset_path = DATADIR / 'train_dataset_tokenized.pt'
tokenized_test_dataset_path = DATADIR / 'test_dataset_tokenized.pt'

# Output files of this notebook: datasets before/after style transfer, plus the CSV.
untransferred_fake_testset_path = DATADIR / 'untransferred_fake_testset.pt'
transferred_fake_testset_path = DATADIR / 'transferred_fake_testset.pt'
transferred_fake_csv_path = DATADIR / 'transferred_fake_testset.csv'


In [None]:
# Export
torch.save(untransferred_dataset, untransferred_fake_testset_path)
print(f"Untransferred Testset saved to {untransferred_fake_testset_path}")
torch.save(transferred_dataset, transferred_fake_testset_path)
print(f"Transferred Testset saved to {transferred_fake_testset_path}")