In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TopKLogitsWarper,
    TemperatureLogitsWarper,
    StoppingCriteriaList,
    MaxLengthCriteria,
)
import torch

# Tokenizer and causal LM are loaded from one shared pretrained checkpoint id.
CHECKPOINT = "facebook/xglm-564M"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT)

In [2]:
# Optional: uncomment the next line to move the model to the GPU before generating.
# model.cuda()

In [2]:
def get_response(prompt_text, use_gpu=False, min_new_tokens=16):
    """Sample a continuation of the Thai politeness prompt built around ``prompt_text``.

    The text is wrapped in a fixed Thai template (roughly: 'the polite word
    for "<text>" is') and the module-level ``model`` samples a continuation
    with top-k (50) and temperature (0.7) warping.

    Args:
        prompt_text: Source sentence to be rewritten.
        use_gpu: If True, the input ids are moved to CUDA. If False, they are
            moved to whatever device ``model`` currently lives on, so the
            call works whether or not the model was moved to the GPU.
        min_new_tokens: Lower bound on the room left for generated tokens.
            The length cap is ``len(prompt_text) * 2`` as before, but it is
            never allowed to fall below ``prompt length + min_new_tokens``.

    Returns:
        The first decoded sequence with special tokens skipped. The sequence
        returned by ``model.sample`` is decoded as-is, so callers that only
        want the continuation have to strip the prompt themselves.
    """
    input_prompt = f'คำสุภาพของ \"{prompt_text}\" คือ'
    input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
    # Inputs must sit on the same device as the model's weights.
    input_ids = input_ids.cuda() if use_gpu else input_ids.to(model.device)
    prompt_token_count = input_ids.shape[-1]

    # Logits processor: suppress EOS until the sequence is at least 15 tokens long.
    logits_processor = LogitsProcessorList(
        [
            MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ]
    )
    # Logits warpers: restrict sampling to the top 50 tokens, then sharpen
    # the distribution with temperature 0.7.
    logits_warper = LogitsProcessorList(
        [
            TopKLogitsWarper(50),
            TemperatureLogitsWarper(0.7),
        ]
    )

    # MaxLengthCriteria is given a cap on the TOTAL sequence length (prompt
    # tokens included), while len(prompt_text) counts characters of the raw
    # text only. For short inputs the old cap was below the prompt length,
    # which ends generation almost immediately. Keep the old heuristic but
    # guarantee a minimum budget of new tokens.
    max_length = max(len(prompt_text) * 2, prompt_token_count + min_new_tokens)
    stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])

    # Re-seed on every call so the sampled output for a given prompt is reproducible.
    torch.manual_seed(0)
    # NOTE(review): `sample` appears to be deprecated in newer transformers
    # releases in favour of `generate(do_sample=True, ...)` — confirm against
    # the installed version before upgrading.
    with torch.no_grad():  # inference only; avoid tracking autograd state
        outputs = model.sample(
            input_ids,
            logits_processor=logits_processor,
            logits_warper=logits_warper,
            stopping_criteria=stopping_criteria,
        )

    return (tokenizer.batch_decode(outputs, skip_special_tokens=True))[0]

In [1]:
import pandas as pd

# Source sentences to rewrite, read from a CSV file into a DataFrame.
impolite_csv_path = "data/non_sample_impolite.csv"
test_df = pd.read_csv(impolite_csv_path)

In [5]:
from tqdm import tqdm
# Generate one response per source sentence, in row order.
# Iterating over the column (which has a length) instead of the iterrows()
# generator lets tqdm display a total, a percentage and an ETA for this
# long-running loop, and avoids building a Series per row just to read one field.
polite = [get_response(text) for text in tqdm(test_df["text"], desc="generating")]

396it [23:39,  1.17it/s] 

In [None]:
def _strip_prompt(decoded, source_text, marker="คือ"):
    """Return only the generated continuation of ``decoded``.

    The exact prompt built from ``source_text`` is removed when ``decoded``
    starts with it; this stays correct even if the source sentence itself
    contains ``marker``. If the prompt did not survive decoding verbatim, the
    text after the first ``marker`` is returned. If ``marker`` is missing
    altogether, the whole string is kept, because ``str.find`` returns -1 in
    that case and slicing from ``-1 + len(marker)`` would silently drop the
    leading characters.
    """
    prompt = f'คำสุภาพของ "{source_text}" {marker}'
    if decoded.startswith(prompt):
        return decoded[len(prompt):].strip()
    cut = decoded.find(marker)
    if cut == -1:
        return decoded.strip()
    return decoded[cut + len(marker):].strip()


# `polite` was produced by iterating over test_df row by row, so zipping it
# with the "text" column pairs each output with the sentence that produced it.
polite_filtered = [
    _strip_prompt(decoded, source_text)
    for decoded, source_text in zip(polite, test_df["text"])
]

In [None]:
# Two-column frame: the source sentence next to its cleaned model output.
sentence_pair = pd.DataFrame({"impolite": test_df["text"], "polite": polite_filtered})

In [None]:
# Write the result without the index. A "labels" column is dropped if one is
# present; errors="ignore" keeps the export from raising KeyError when the
# frame has no such column.
sentence_pair.drop(columns=["labels"], errors="ignore").to_csv("result.csv", index=False)