In [1]:
! pip install -q accelerate datasets evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/270.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.7/270.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
import evaluate
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, EvalPrediction

from time import time
from transformers import set_seed
# Seed value for the run; pass the constant to set_seed instead of repeating
# the literal so the two can never drift apart.
SEED = 2023
set_seed(SEED)

In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint name shared by the tokenizer and the seq2seq model so both are
# always loaded from the same place.
_PARAPHRASER_CHECKPOINT = "humarin/chatgpt_paraphraser_on_T5_base"

tokenizer = AutoTokenizer.from_pretrained(_PARAPHRASER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_PARAPHRASER_CHECKPOINT)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level seq2seq model.

    The text is prefixed with ``'paraphrase: '``, tokenized (truncated to
    ``max_length`` tokens) and passed to ``model.generate`` using grouped beam
    search.

    Args:
        question: Text to paraphrase; it is interpolated into the prompt string.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Number of sequences requested from ``generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Accepted for interface compatibility only. It is NOT
            forwarded to ``model.generate`` (the original call never passed
            it), so changing it has no effect on the output.
        max_length: Used both as the tokenizer truncation length and as the
            generation ``max_length``.

    Returns:
        list[str]: The decoded generated sequences with special tokens removed.
    """
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model so the call also works
    # when the model has been moved to an accelerator.
    input_ids = encoded.input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [25]:
# Label vocabulary: a tag's position in this list is its class index.
tags = [
    'Appeal to authority',
    'Appeal to fear/prejudice',
    'Bandwagon',
    'Black-and-white Fallacy/Dictatorship',
    'Causal Oversimplification',
    'Doubt',
    'Exaggeration/Minimisation',
    'Flag-waving',
    'Glittering generalities (Virtue)',
    'Loaded Language',
    "Misrepresentation of Someone's Position (Straw Man)",
    'Name calling/Labeling',
    'Obfuscation, Intentional vagueness, Confusion',
    'Presenting Irrelevant Data (Red Herring)',
    'Reductio ad hitlerum',
    'Repetition',
    'Slogans',
    'Smears',
    'Thought-terminating cliché',
    'Whataboutism',
]

# Lookup tables in both directions: index -> tag and tag -> index.
ix2tag = dict(enumerate(tags))
tag2ix = {tag: ix for ix, tag in ix2tag.items()}


def replace_none_with_str(dataset):
  """Replace ``None`` in the ``'text'`` field of each row with ``''``, in place.

  Rows are accessed as ``dataset[i]`` for ``i`` in ``range(len(dataset))`` and
  the assignment is made on the object that indexing returns; nothing is
  returned.

  NOTE(review): the change only persists if ``dataset[i]`` hands back the
  stored row object itself (e.g. a list of dicts). This name is also defined
  again later in the notebook with a per-example signature, and that later
  definition is the one in effect when the ``ds.map`` cell runs.
  """
  for i in range(len(dataset)):
    # Identity check: `is None` is the idiomatic test and cannot be fooled by
    # values that overload `==`.
    if dataset[i]['text'] is None:
      dataset[i]['text'] = ''

def sigmoid(X):
    """Apply the logistic function ``1 / (1 + exp(-X))`` element-wise."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator

def heaviside(X):
    """Threshold ``X`` at 0.5: values above 0.5 map to 1, all others (including 0.5) to 0."""
    shifted = X - 0.5
    # Second argument is the value returned where `shifted` is exactly zero.
    return np.heaviside(shifted, 0)

def compute_metrics(eval_preds: EvalPrediction):
    """Compute accuracy plus macro and per-class precision/recall/F1.

    Logits are passed through ``sigmoid`` and thresholded with ``heaviside``
    to obtain 0/1 predictions, which are then scored against the labels.
    NOTE(review): assumes labels are multi-label indicator arrays with the
    same shape as the logits; confirm against the dataset.

    Returns:
        dict: ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro``
        followed by ``f1_C{i}``, ``recall_C{i}`` and ``precision_C{i}`` entries,
        one per class index ``i``.
    """
    labels = eval_preds.label_ids
    predictions = heaviside(sigmoid(eval_preds.predictions))

    scorers = (
        ('f1', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )

    macro = {}
    per_class = {}
    for name, scorer in scorers:
        class_scores = scorer(labels, predictions, average=None, zero_division=0.0)
        per_class[name] = {f'{name}_C{i}': score for i, score in enumerate(class_scores)}
        macro[name] = scorer(labels, predictions, average='macro', zero_division=0.0)

    # Summary numbers first, then the per-class breakdowns (f1, recall, precision).
    results = {
        'accuracy': accuracy_score(labels, predictions),
        'precision_macro': macro['precision'],
        'recall_macro': macro['recall'],
        'f1_macro': macro['f1'],
    }
    for name, _ in scorers:
        results.update(per_class[name])
    return results

In [5]:
# Directory that holds the TSV splits.
ds_url = '/content/'
ds_files = dict(
    train=ds_url + 'train.tsv',
    validation=ds_url + 'validation.tsv',
)

# Read both tab-separated splits, then rename the columns to lowercase names.
ds = load_dataset('csv', data_files=ds_files, delimiter='\t')
ds = ds.rename_columns({'ID':'id', 'Text':'text', 'Label': 'label'})
ds

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})

In [6]:
def convert_labels(example):
  """Parse the string in ``example['label']`` into a list of floats.

  The first and last characters (the surrounding brackets) are dropped and the
  remainder is split on any run of whitespace, so inputs such as
  ``'[0. 1. 0.]'``, ``'[0.  1.\\n 0.]'`` and ``'[]'`` are all handled.

  Args:
      example: Mapping with a string ``'label'`` entry; it is updated in place.

  Returns:
      The same ``example`` with ``'label'`` replaced by a ``list[float]``.
  """
  # split() without an argument collapses repeated spaces/newlines and never
  # yields empty tokens, which float() would reject.
  tokens = example['label'][1:-1].split()
  example["label"] = [float(token) for token in tokens]
  return example

def replace_none_with_str(example):
  """Return ``example`` with a ``None`` ``'text'`` value replaced by ``''``.

  Args:
      example: Mapping with a ``'text'`` entry; it is updated in place.

  Returns:
      The same ``example`` object.
  """
  # Identity check: `is None` is the idiomatic test and cannot be fooled by
  # values that overload `==`.
  if example['text'] is None:
    example['text'] = ''
  return example

# Parse the label strings first, then blank out missing texts, on every split.
ds = ds.map(convert_labels).map(replace_none_with_str)

# Show the resulting schema of each split.
for split in ('train', 'validation'):
  print(ds[split].features)

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}
{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}


In [29]:
# Look up the tag name stored at index 18 of the label vocabulary.
ix2tag[18]

'Thought-terminating cliché'

In [34]:
# Paraphrase a single training example as a spot check.
i = 7
# Fetch just this row instead of materialising the whole 'text' and 'id' columns.
example = ds['train'][i]
text = example['text'].lower()
# Named example_id so the built-in `id` is not shadowed for the rest of the session.
example_id = example['id']
print(example_id)
print(text)

paraphrase(text)

79372
term limits are everywhere & politicians can't ignore it\n\nterm limits ahead


["The introduction of term limits is a common occurrence in politics, and it's crucial for politicians to acknowledge this.",
 'Politicians are unable to ignore the impact of term limits, which are prevalent in discussions.',
 'There are no limits to the number of words spoken in a given speech, and politicians must take into account these limitations when discussing future regulations.',
 'Term limits are prevalent in politics, and they must be observed before the implementation of term limits.',
 'No one can escape the fact that there are term limits in place.']

In [38]:
# Run the paraphraser over the first 10 training texts.
# NOTE(review): the list returned by paraphrase() is neither stored nor
# displayed in the lines visible here, and the loop rebinds the notebook-level `text`.
for text in ds['train']['text'][:10]:
  paraphrase(text)