# Create Pseudo-Parallel Dataset with Style Scores

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
    
import sys, os
sys.path.append('../paraphrase/')
sys.path.append('../jointclassifier/')
from paraphraser_args import ModelArguments as pma, DataTrainingArguments as pda, TrainingArguments as pta
from paraphraser_dataloader import load_dataset as pld, load_dataset_style as lds
from paraphraser_trainer import ParaphraserTrainer
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelWithLMHead, HfArgumentParser
from joint_args import ModelArguments as jma, DataTrainingArguments as jda, TrainingArguments as jta
from joint_dataloader import load_dataset as jld
from joint_trainer import JointTrainer
from joint_model_v1 import JointSeqClassifier

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm, trange
from torch import cuda, no_grad

## Load in desired dataset and paraphraser model
In the cells below, define the dataset you want to work with and the paraphraser model (here based on `"t5-small"` [from Hugging Face](https://huggingface.co/t5-small)).

In [2]:
# Root folder of the processed data; later cells join it with a task name and a
# CSV filename to locate individual splits.
data_dir = "../data/processed_filtered/"

In [3]:
# Paraphraser identity: checkpoint folder name, nickname, base architecture, task.
paraphrase_model_name = "t5_paraphrase"
paraphrase_model_nick = "t5_paraphrase"
paraphrase_model_type = "t5-small"
paraphrase_task = "wiki"

# Output location and trainer settings. The values stay strings because they are
# handed to HfArgumentParser as if they were command-line tokens.
output_dir = "../models/"
epochs = "3"
train_batch_size = "16"
eval_batch_size = "16"
save_log_steps = "400"

# Parse the flag/value pairs into the paraphraser's model, data and training
# argument dataclasses (one pair per line; "--overwrite_cache" is a bare switch).
parser = HfArgumentParser((pma, pda, pta))
model_args_para, data_args_para, training_args_para = parser.parse_args_into_dataclasses([
    "--model_name_or_path", paraphrase_model_name,
    "--model_nick", paraphrase_model_nick,
    "--data_dir", data_dir,
    "--output_dir", os.path.join(output_dir, paraphrase_model_nick),
    "--cache_dir", os.path.join(output_dir, "cache"),
    "--overwrite_cache",
    "--per_device_train_batch_size", train_batch_size,
    "--per_device_eval_batch_size", eval_batch_size,
    "--max_seq_len", "64",
    "--gradient_accumulation_steps", "1",
    "--num_train_epochs", epochs,
    "--logging_steps", save_log_steps,
    "--save_steps", save_log_steps,
    "--data_parallel", "True",
])


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [4]:
# Joint classifier identity and data location.
joint_task = "wiki"
joint_model_name = "distilbert-base-uncased"
joint_model_nick = "distilbert_uncased_2"
data_dir = "../data/processed_filtered/"
output_dir = "../models/"

# Model switches and trainer settings, kept as strings because they are passed
# to HfArgumentParser as command-line style tokens.
freeze_encoder = "False"
skip_preclassifier = "False"
train_jointly = "True"
epochs = "5"
train_batch_size = "256"
eval_batch_size = "512"
log_save_steps = "200"

# Parse the flag/value pairs into the classifier's model, data and training
# argument dataclasses (one pair per line; "--overwrite_cache" is a bare switch).
parser = HfArgumentParser((jma, jda, jta))
model_args_joint, data_args_joint, training_args_joint = parser.parse_args_into_dataclasses([
    "--model_name_or_path", joint_model_name,
    "--model_nick", joint_model_nick,
    "--task", joint_task,
    "--data_dir", data_dir,
    "--output_dir", os.path.join(output_dir, joint_model_nick, joint_task, "joint"),
    "--cache_dir", os.path.join(output_dir, "cache"),
    "--freeze_encoder", freeze_encoder,
    "--skip_preclassifier", skip_preclassifier,
    "--train_jointly", train_jointly,
    "--overwrite_cache",
    "--per_device_train_batch_size", train_batch_size,
    "--per_device_eval_batch_size", eval_batch_size,
    "--max_seq_len", "64",
    "--gradient_accumulation_steps", "1",
    "--num_train_epochs", epochs,
    "--logging_steps", log_save_steps,
    "--save_steps", log_save_steps,
])


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [8]:
# Tokenizer of the base paraphraser architecture, capped at the configured
# maximum sequence length and cached in the shared cache directory.
para_tokenizer = AutoTokenizer.from_pretrained(
    paraphrase_model_type,
    cache_dir=model_args_para.cache_dir,
    model_max_length=data_args_para.max_seq_len,
)

# Dataset for the "dev" split of the paraphrase task, built with that tokenizer.
dataset = lds(
    data_dir,
    para_tokenizer,
    task=paraphrase_task,
    mode="dev",
    n_proc=6000,
)

loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at ../models/cache/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
  

In [9]:
# Load the fine-tuned paraphraser from <output_dir>/<paraphrase_model_name>.
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is its
# replacement for encoder-decoder checkpoints such as this T5 model and resolves
# to the same T5ForConditionalGeneration class named in the saved config.
model = AutoModelForSeq2SeqLM.from_pretrained(os.path.join(output_dir, paraphrase_model_name))

loading configuration file ../models/t5_paraphrase/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate En

## Use the Paraphraser to Generate Predictions

In [10]:
# Walk the dataset in its stored order so the generated paraphrases line up
# row-for-row with the source file when they are joined further down.
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=32)

# Number of candidates requested per input sentence. It is also the stride used
# below to split the flat list of decoded outputs into per-candidate lists.
num_return_sequences = 3

device = ("cuda" if cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# predictedK collects the K-th candidate of every input sentence.
predicted1 = []
predicted2 = []
predicted3 = []

epoch_iterator = tqdm(dataloader, desc="Iteration")
with no_grad():
    for batch in epoch_iterator:
        batch = tuple(t.to(device) for t in batch)  # GPU or CPU
        # batch[0] is passed as the input ids and batch[1] as the attention mask.
        generated_outputs = model.generate(input_ids=batch[0],
                                           attention_mask=batch[1],
                                           max_length=70,
                                           num_beams=9,
                                           early_stopping=True,
                                           encoder_no_repeat_ngram_size=5,
                                           num_beam_groups=3,
                                           diversity_penalty=0.5,
                                           num_return_sequences=num_return_sequences)
        paras = para_tokenizer.batch_decode(generated_outputs.detach().cpu().numpy(),
                                            skip_special_tokens=True)
        # The slicing assumes the candidates of one input sit next to each other
        # in the output, so every num_return_sequences-th entry belongs to the
        # same candidate rank. Using the variable (not a literal 3) keeps the
        # split aligned if the number of returned sequences is changed above.
        predicted1 += paras[0::num_return_sequences]
        predicted2 += paras[1::num_return_sequences]
        predicted3 += paras[2::num_return_sequences]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=326.0, style=ProgressStyle(description_wi…




## Save results to a csv file

In [12]:
import pandas as pd

In [13]:
# Join the generated paraphrases onto the original texts and write them to disk.
# in_filename holds the original texts; out_filename receives the joined result.
#
# To paraphrase another split (train, test, ...), set `mode` where `dataset` is
# created, rerun the generation cell, and point both filenames at that split,
# e.g. in_filename = 'train.csv' and out_filename = 'train_paraphrased.csv'.

in_filename = 'dev.csv'
out_filename = 'dev_paraphrased.csv'

df_para = pd.DataFrame(data={'paraphrased1' : predicted1,
                             'paraphrased2' : predicted2,
                             'paraphrased3' : predicted3})
# The source file has no header row; name its two columns explicitly.
df = pd.read_csv(os.path.join(data_dir, paraphrase_task, in_filename), names =['text', 'label'])

# The paraphrase columns are attached by row index, so a length mismatch would
# silently leave NaNs or drop paraphrases. Fail loudly instead of writing a
# misaligned file (typical cause: `mode` above and in_filename name different splits).
assert len(df_para) == len(df), (
    f"{len(df_para)} paraphrased rows but {len(df)} rows in {in_filename}"
)

df['paraphrased1'] = df_para['paraphrased1']
df['paraphrased2'] = df_para['paraphrased2']
df['paraphrased3'] = df_para['paraphrased3']
# Written without header or index; the file is read back later with header=None.
df.to_csv(os.path.join(data_dir, paraphrase_task, out_filename),
               header=False, index=False)

In [14]:
# Inspect some results: the first rows of the joined frame (original text,
# label, and the three paraphrase columns)
df.head()

Unnamed: 0,text,label,paraphrased1,paraphrased2,paraphrased3
0,The main goal of this inductive transfer mecha...,1,in order to improve the generalization of the ...,in order to improve the generalization of the ...,in order to improve the generalization of the ...
1,In this paper we propose an energy-efficient l...,1,"in this paper, we propose an energy efficient ...",we propose a system of energy-efficient learni...,we propose a system of energy-efficient learni...
2,"In this work, we propose a method of deep arti...",1,we're proposing a method for deep artificial n...,we're proposing a method for deep artificial n...,we propose a technique for deep artificial neu...
3,"To achieve this goal, we treat images as bags ...",1,the integration of weakly monitored multiple l...,the integration of weakly monitored multiple l...,the integration of weakly monitored multiple l...
4,The study provides some guidelines: (1) alteri...,1,(1) it is not always better to change the geom...,(1) it is not always better to change the geom...,(1) it is not always better to change the geom...


In [15]:
# Check the last rows of the joined frame as well
df.tail()

Unnamed: 0,text,label,paraphrased1,paraphrased2,paraphrased3
10407,This is analogous to the F-test used in linear...,1,this is similar to the F test used in linear r...,this is an analogy to the F test used for line...,the significance of prediction is similar to t...
10408,Nonlinear models for binary dependent variable...,1,the probit model and logit model are nonlinear...,the probit model and logit model are nonlinear...,the probit model and logit model are non-linea...
10409,"In particular, Advanced Driver Assistance Syst...",1,"in particular, ML is a significant role in adv...","in particular, ML is a significant role in adv...",ML is particularly important in the two areas ...
10410,We prove that the proposed algorithm converges...,1,"if a minimum reward machine is inferred, the m...","if a minimum reward machine is inferred, the m...","if a minimum reward machine is inferred, the m..."
10411,"Toward the end of the 1990s, a significant cha...",1,the increased interaction between computer gra...,the increased interaction between computer gra...,the increase in the interaction between comput...


## Now use classifier for Scoring
This may cause GPU memory issues, so you may have to shut down the kernel and restart it, skipping the paraphraser cells, before running this next portion. If you do so, reload the DataFrame that was written to disk several cells above.

## Load in desired dataset and classifier model
In the cell below, define the dataset you want to work with and the classifier model.

In [5]:
# Configuration and tokenizer of the classifier's base checkpoint; both use the
# shared cache directory, and the tokenizer is capped at the configured length.
model_config = AutoConfig.from_pretrained(
    model_args_joint.model_name_or_path,
    cache_dir=model_args_joint.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args_joint.model_name_or_path,
    cache_dir=model_args_joint.cache_dir,
    model_max_length=data_args_joint.max_seq_len,
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at ../models/cache/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.4.0.dev0",
  "vocab_size": 30522
}

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at ../models/cache/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a7

In [6]:
# Build the train and dev datasets in the form the joint classifier expects.
# The `--task` argument may name several tasks joined by '+'.
tasks = data_args_joint.task.split('+')

# The train call also returns the index-to-class mapping used later on.
train_dataset, idx_to_classes = jld(
    data_args_joint.data_dir,
    tokenizer,
    model_name=model_args_joint.model_name_or_path,
    tasks=tasks,
    mode="train",
    n_proc=6000,
)
# The second return value of the dev call is discarded.
dev_dataset, _ = jld(
    data_args_joint.data_dir,
    tokenizer,
    model_name=model_args_joint.model_name_or_path,
    tasks=tasks,
    mode="dev",
    n_proc=6000,
)

100%|██████████| 6/6 [00:00<00:00,  7.15it/s]
100%|██████████| 2/2 [00:00<00:00, 15.66it/s]torch.Size([34302, 64]) torch.Size([34302, 64]) torch.Size([34302, 1]) torch.Size([34302])
torch.Size([7759, 64]) torch.Size([7759, 64]) torch.Size([7759, 1]) torch.Size([7759])



In [7]:
# Output width per task: a two-class task gets a single output unit, any other
# task gets one unit per class.
label_dims = {
    task: (1 if len(classes.keys()) == 2 else len(classes.keys()))
    for task, classes in idx_to_classes.items()
}
label_dims

{'shakespeare': 1}

In [8]:
# Restore the trained joint classifier from
# <output_dir>/<model_nick>/<joint_task>/joint.
joint_model = JointSeqClassifier.from_pretrained(
    os.path.join(output_dir, model_args_joint.model_nick, joint_task, 'joint'),
    tasks=tasks,
    model_args=model_args_joint,
    task_if_single=None,
    joint=training_args_joint.train_jointly,
    label_dims=label_dims,
)

# Wrap the model in a trainer; the scoring helper further down calls its
# predict_for_sentence method.
trainer = JointTrainer(
    [training_args_joint, model_args_joint, data_args_joint],
    joint_model,
    train_dataset,
    dev_dataset,
    idx_to_classes,
)

loading configuration file ../models/distilbert_uncased_2/shakespeare/joint/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "JointSeqClassifier"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.4.0.dev0",
  "vocab_size": 30522
}

loading weights file ../models/distilbert_uncased_2/shakespeare/joint/pytorch_model.bin
All model checkpoint weights were used when initializing JointSeqClassifier.

All the weights of JointSeqClassifier were initialized from the model checkpoint at ../models/distilbert_uncased_2/shakespeare/joint.
If your task is similar to the task the model of the che

## Run classifier on paraphrased and original text

This is currently done row by row on a pandas DataFrame, but could probably be made faster by using a batched data loader.

In [9]:
import scipy.stats as ss
# NOTE: this rebinds `tqdm` to the plain tqdm class, replacing the
# tqdm.notebook version imported in the first cell.
from tqdm import tqdm
import pandas as pd
# Registers the progress_apply method on pandas objects; it is used below to
# score every row with a progress bar.
tqdm.pandas()

In [10]:
# Show the task list parsed from the classifier's `--task` argument
tasks

['shakespeare']

In [11]:
def pred_paraphrases(row, tasks, cols):
    '''
    Score the text columns of one DataFrame row with the style classifier.

    Each column in `cols` is sent once through `trainer.predict_for_sentence`
    (using the module-level `trainer` and `tokenizer`), and the 'prob' entry of
    every task in `tasks` is kept as a float.

    Returns a dict keyed '<task>_<col>', filled column by column and, within a
    column, in the order of `tasks`.
    '''
    scores = {}
    for column in cols:
        prediction = trainer.predict_for_sentence(row[column], tokenizer)
        scores.update({
            task + '_' + column: float(prediction[task]['prob'])
            for task in tasks
        })
    return scores

def get_best_pred(row, cols, target_val=0.5):
    '''
    Helper function for determining which paraphrase is 'best'
    for a given set of paraphrase column style scores and a target value
    that you want the scores to be close to. Currently just outputs the best score
    but could be modified to get the best sentence as well.

    Args:
        row: mapping (e.g. a DataFrame row) from column name to style score.
        cols: names of the score columns to compare.
        target_val: value the chosen score should be closest to.

    Returns:
        The score in `cols` with the smallest absolute distance to `target_val`
        (the first one wins on ties), or None if `cols` is empty or no score is
        comparable (e.g. all NaN).
    '''
    # Start from infinity rather than 1: with a starting distance of 1 any row
    # whose scores are all at least 1 away from the target returned None instead
    # of the closest score.
    best_diff = float('inf')
    best_val = None
    for col in cols:
        diff = abs(row[col] - target_val)
        # Strict comparison keeps the earliest column on ties and skips NaN.
        if diff < best_diff:
            best_val = row[col]
            best_diff = diff
    return best_val

In [24]:
# Read a paraphrase file back from disk.
# Task name used for the score column names below. For a two-task classifier a
# first task would be named here as well, e.g. joint_task1 = 'abstract'.
joint_task2 = 'shakespeare'

# Task folder and file that hold the paraphrases to score.
paraphrase_task = 'shakespeare'
filename = 'dev_paraphrased.csv'

# The file is read without a header row, so the columns are named explicitly.
df = pd.read_csv(os.path.join(data_dir, paraphrase_task, filename), header=None)
df.columns = ['text', 'label', 'paraphrased1', 'paraphrased2', 'paraphrased3']

# Keep only the rows whose label is 1.
df = df.loc[df['label'] == 1]

In [25]:
# Preview the rows kept after filtering on label == 1
df.head()

Unnamed: 0,text,label,paraphrased1,paraphrased2,paraphrased3
1,"But thus, I trust, you will not marry her.",1,I'm sure you won't marry her.,but I'm sure you won't marry her.,so I'm sure you won't marry her.
5,Stand from the hearse.,1,stand in front of the hearse!,stand out of the hearse!,stand by the hearse!
6,"I have no will to wander forth of doors, Yet s...",1,"I'm not going to walk out of the door, but som...","I don't want to go out of the door, but someth...","I'm not going to go out of the door, but somet..."
9,"How do you mean, removing of him?",1,how do you mean removing him?,"what do you mean, remove him?",how do you mean he's removed?
11,"O Thou, whose captain I account myself, Look o...",1,"O Thou, I'm a captain, and I'm a gracious eye ...","O Thou, I'm a captain, and I'm a gracious eye ...","O Thou, I'm a captain, and I'm a gracious man."


In [26]:
# Text columns to score: the original sentence followed by its three paraphrases.
cols_to_use = ['text'] + [f'paraphrased{i}' for i in (1, 2, 3)]

# Names of the score columns, in the same order as cols_to_use. The scores come
# back column by column (and, within a column, task by task), so a classifier
# with a second task (joint_task1) would need one name per task for every text
# column, e.g. f'pred_{joint_task1}_paraphrased1' before f'pred_{joint_task2}_paraphrased1'.
cols_preds = [f'pred_{joint_task2}_orig'] + [
    f'pred_{joint_task2}_paraphrased{i}' for i in (1, 2, 3)
]

# Score every row (with a progress bar) and store the expanded results in df.
df[cols_preds] = df.progress_apply(
    lambda row: pred_paraphrases(row, tasks, cols_to_use),
    axis=1,
    result_type="expand",
)

100%|██████████| 3151/3151 [01:12<00:00, 43.58it/s]


In [27]:
# Write the scored frame into the task's data folder, this time with a header
# row and without the index.
out_filename = f'{paraphrase_task}_dev_cross_predict_paraphrases.csv'

df.to_csv(
    os.path.join(data_dir, paraphrase_task, out_filename),
    header=True,
    index=False,
)

In [28]:
# Transposed preview so every column of the first rows, including the new
# score columns, is visible
df.head().T

Unnamed: 0,1,5,6,9,11
text,"But thus, I trust, you will not marry her.",Stand from the hearse.,"I have no will to wander forth of doors, Yet s...","How do you mean, removing of him?","O Thou, whose captain I account myself, Look o..."
label,1,1,1,1,1
paraphrased1,I'm sure you won't marry her.,stand in front of the hearse!,"I'm not going to walk out of the door, but som...",how do you mean removing him?,"O Thou, I'm a captain, and I'm a gracious eye ..."
paraphrased2,but I'm sure you won't marry her.,stand out of the hearse!,"I don't want to go out of the door, but someth...","what do you mean, remove him?","O Thou, I'm a captain, and I'm a gracious eye ..."
paraphrased3,so I'm sure you won't marry her.,stand by the hearse!,"I'm not going to go out of the door, but somet...",how do you mean he's removed?,"O Thou, I'm a captain, and I'm a gracious man."
pred_shakespeare_orig,0.821566,0.631334,0.98707,0.960118,0.993091
pred_shakespeare_paraphrased1,0.004811,0.019826,0.005536,0.185102,0.990232
pred_shakespeare_paraphrased2,0.00474,0.209268,0.005621,0.267502,0.990249
pred_shakespeare_paraphrased3,0.004765,0.15968,0.005888,0.115327,0.9904
