#  Generate Some Transfered Sentences and get Style Scores

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
    
import sys, os
import numpy as np
import torch
sys.path.append('../paraphrase/')
sys.path.append('../jointclassifier/')
from paraphraser_args import ModelArguments as pma, DataTrainingArguments as pda, TrainingArguments as pta
from paraphraser_dataloader import load_dataset as pld, load_dataset_style as lds
from paraphraser_dataloader import load_dataset_pseudo as ldp, load_dataset_pseudo_binary_single as ldpb
from paraphraser_dataloader import load_dataset_pseudo_joint as ldpj
from paraphraser_trainer import ParaphraserTrainer
from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead, HfArgumentParser
from joint_args import ModelArguments as jma, DataTrainingArguments as jda, TrainingArguments as jta
from joint_dataloader import load_dataset as jld
from joint_trainer import JointTrainer
from joint_model_v1 import JointSeqClassifier

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm.notebook import tqdm, trange
from torch import cuda, no_grad

## Load in desired dataset and paraphraser model
In the cell below, define the dataset you want to work with and the paraphraser model (here a `"t5-small"` [from Hugging Face](https://huggingface.co/t5-small))

In [9]:
pseudo_data_dir = '../data/pseudo/'
#"../data/processed_filtered/"
binary = True
joint = False
#joint_transfer_tasks = ['formality', 'emo']

In [10]:
def get_model_tokenizer_binary(task, model_nick):
    data_dir = "../data/pseudo"
    model_name = "t5-small"
    meta_task_type = "binary_single"
    meta_task = 'transfer'


    output_dir = "../models/"
    epochs = "5"
    train_batch_size = "16"
    eval_batch_size = "16"
    save_log_steps = "800"

    parser = HfArgumentParser((pma, pda, pta))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses([
        "--model_name_or_path",
        model_name,
        "--model_nick",
        model_nick,
        "--data_dir",
        data_dir,
        "--output_dir",
        os.path.join(output_dir, model_nick),
        "--cache_dir",
        os.path.join(output_dir,"cache"),
        "--overwrite_cache",
        "--per_device_train_batch_size",
        train_batch_size,
        "--per_device_eval_batch_size",
        eval_batch_size,
        "--max_seq_len",
        "64",
        "--gradient_accumulation_steps",
        "1",
        "--num_train_epochs",
        epochs,
        "--logging_steps",
        save_log_steps,
        "--save_steps",
        save_log_steps,
        "--data_parallel",
        "True",
        "--meta_task",
        meta_task,
        "--meta_task_type",
        meta_task_type
    ])

    # Eval
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelWithLMHead.from_pretrained(os.path.join(output_dir, model_nick))   
    return tokenizer, model 

In [11]:
mode = 'dev'
paraphrase_model_name = "t5_transfer_wiki_binary"
paraphrase_task = 'wiki'
prompt_task = paraphrase_task #+ "_prompt"


paraphrase_model_nick = "t5_transfer_formality_joint"
paraphrase_model_type = 't5-small'
output_dir = "../models/"
epochs = "3"
train_batch_size = "16"
eval_batch_size = "16"
save_log_steps = "400"

parser = HfArgumentParser((pma, pda, pta))
model_args_para, data_args_para, training_args_para = parser.parse_args_into_dataclasses([
    "--model_name_or_path",
    paraphrase_model_name,
    "--model_nick",
    paraphrase_model_nick,
    "--data_dir",
    pseudo_data_dir,
    "--output_dir",
    os.path.join(output_dir, paraphrase_model_nick),
    "--cache_dir",
    os.path.join(output_dir,"cache"),
    "--overwrite_cache",
    "--per_device_train_batch_size",
    train_batch_size,
    "--per_device_eval_batch_size",
    eval_batch_size,
    "--max_seq_len",
    "64",
    "--gradient_accumulation_steps",
    "1",
    "--num_train_epochs",
    epochs,
    "--logging_steps",
    save_log_steps,
    "--save_steps",
    save_log_steps,
    "--data_parallel",
    "True"
])


PyTorch: setting up devices


In [29]:
joint_task = "abstract+shakespeare"

class_data_dir = "../data/processed_filtered/"
joint_model_name = "distilbert-base-uncased"
joint_model_nick = "distilbert_uncased_2"
output_dir = "../models/"
freeze_encoder = "False"
skip_preclassifier = "False"
train_jointly = "True"
epochs = "5"
train_batch_size = "256"
eval_batch_size = "512"
log_save_steps = "200"

parser = HfArgumentParser((jma, jda, jta))
model_args_joint, data_args_joint, training_args_joint = parser.parse_args_into_dataclasses([
    "--model_name_or_path",
    joint_model_name,
    "--model_nick",
    joint_model_nick,
    "--task",
    joint_task,
    "--data_dir",
    class_data_dir,
    "--output_dir",
    os.path.join(output_dir, joint_model_nick, joint_task, 'joint'),
    "--cache_dir",
    os.path.join(output_dir,"cache"),
    "--freeze_encoder",
    freeze_encoder,
    "--skip_preclassifier",
    skip_preclassifier,
    "--train_jointly",
    train_jointly,
    "--overwrite_cache",
    "--per_device_train_batch_size",
    train_batch_size,
    "--per_device_eval_batch_size",
    eval_batch_size,
    "--max_seq_len",
    "64",
    "--gradient_accumulation_steps",
    "1",
    "--num_train_epochs",
    epochs,
    "--logging_steps",
    log_save_steps,
    "--save_steps",
    log_save_steps
])


PyTorch: setting up devices


In [30]:
#If using a binary model, run this:
if binary:
    para_tokenizer, model = get_model_tokenizer_binary(paraphrase_task,
                                                       paraphrase_model_name)
    dataset = ldpb(pseudo_data_dir, para_tokenizer, mode=mode, tasks=[prompt_task], n_proc=6000)
elif not joint:
    # Create the paraphraser tokenizer and dataset objects
    para_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model_type, cache_dir=model_args_para.cache_dir,
                                             model_max_length = data_args_para.max_seq_len)
    dataset = ldp(pseudo_data_dir, para_tokenizer, mode=mode, tasks=[prompt_task], n_proc=6000)
    # Use the paraphrase configuration defined above to create the model
    model = AutoModelWithLMHead.from_pretrained(os.path.join(output_dir, paraphrase_model_name))
    
# Handle joint case:
else:
    para_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model_type, cache_dir=model_args_para.cache_dir,
                                             model_max_length = data_args_para.max_seq_len)
    dataset = ldpj(pseudo_data_dir, para_tokenizer, mode=mode, tasks=[prompt_task], n_proc=6000) 
    model = AutoModelWithLMHead.from_pretrained(os.path.join(output_dir, paraphrase_model_name))

PyTorch: setting up devices
loading configuration file https://huggingface.co/t5-small/resolve/main/config.json from cache at /home/dmac/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,

wiki dev


100%|██████████| 1/1 [00:00<00:00,  1.16it/s]


## Use the Paraphraser to Generate Predictions

In [14]:
device = ("cuda" if cuda.is_available() else "cpu")

In [15]:
torch.cuda.empty_cache()

In [16]:
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=8)

num_return_sequences = 3

model = model.to(device)
model.eval()
predicted1 = []
predicted2 = []
predicted3 = []



epoch_iterator = tqdm(dataloader, desc="Iteration")
with no_grad():
    for step, batch in enumerate(epoch_iterator):
        batch = tuple(t.to(device) for t in batch)  # GPU or CPU
        generated_outputs = model.generate(input_ids= batch[0], 
                                           attention_mask = batch[1], 
                                           max_length=50, 
                                           num_beams=9,
                                           early_stopping=True,
                                           encoder_no_repeat_ngram_size=5,
                                           no_repeat_ngram_size=4,
                                           num_beam_groups=3,
                                           diversity_penalty=0.5,
                                           num_return_sequences=num_return_sequences)
        paras = para_tokenizer.batch_decode(generated_outputs.detach().cpu().numpy(), 
                                                 skip_special_tokens=True)
        predicted1 += paras[0::3]
        predicted2 += paras[1::3]
        predicted3 += paras[2::3]

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=625.0, style=ProgressStyle(description_wi…




## Save results to a csv file

In [17]:
import pandas as pd

In [18]:
# Store outputs to disk using in_filename as the original texts 
# and writing outputs to out_filename

# If you want to do other parts of the dataset other than train, 
# set the mode in 'dataset' above to the desired mode and then rerun the paraphrase
# and change these filenames to point to the slice of the data you want to use (dev, test, etc.)




df_para = pd.DataFrame(data={'transfered1' : predicted1, 
                             'transfered2' : predicted2, 
                             'transfered3' : predicted3}) 

if not binary and not joint:
    in_filename = f'{mode}.csv'
    out_filename = f'{mode}_transfered.csv'
    df = pd.read_csv(os.path.join(pseudo_data_dir, paraphrase_task, in_filename), names =['paraphrase', 
                                                                               'para_bucket',
                                                                               'orig_text', 
                                                                               'oring_bucket'])
elif not joint:
    in_filename = f'{mode}_binary.csv'
    out_filename = f'{mode}_binary_transfered.csv'
    df = pd.read_csv(os.path.join(pseudo_data_dir, paraphrase_task, in_filename), names =['paraphrase',
                                                                                           'orig_text'])   
else:
    in_filename = f'{mode}.csv'
    out_filename = f'{mode}_transfered.csv'
    df = pd.read_csv(os.path.join(pseudo_data_dir, paraphrase_task, in_filename), 
                     names =['paraphrase',
                             f"{joint_transfer_tasks[0]}_para_bucket",
                             f"{joint_transfer_tasks[1]}_para_bucket",
                             'orig_text', 
                             f"{joint_transfer_tasks[0]}_orig_bucket",
                             f"{joint_transfer_tasks[1]}_orig_bucket"])     

    
df['transfered1'] = df_para['transfered1']
df['transfered2'] = df_para['transfered2']
df['transfered3'] = df_para['transfered3']
df.to_csv(os.path.join(pseudo_data_dir, prompt_task, out_filename), 
               header=False, index=False)

In [19]:
# Inspect some results
df.head()

Unnamed: 0,paraphrase,orig_text,transfered1,transfered2,transfered3
0,there's more than two possible outcomes in K.,there are K possible outcomes rather than just...,"In K, there are more than 2 possible outcomes.","In K, there are more than two possible results.","In K, there are more than 2 possible outcomes"
1,the small number of multiplications in matrix-...,The nonlinear map from the parameter to this c...,The nonlinear parameter map to vector collecti...,The nonlinear parameter map to the vector coll...,The nonlinear parameter map to the vector coll...
2,"the template must not be linked to the image, ...",The template need not to be anyhow related to ...,"The template cannot be linked to an image, and...","The template cannot be linked to an image, and...",It is not necessary to link the template to an...
3,the initialization of the deep learning classi...,Latent representations from original SNP seque...,Latent representations from original SNP seque...,Latent representations from original SNP seque...,Latent representations from original SNP seque...
4,a visual representation of the indoor environm...,The aim of this work is to use Variational Aut...,"For this work, visual representations of indoo...","For this work, visual representations of indoo...","For this work, visual representations of indoo..."


## Now use classifier for Scoring
This may cause GPU memory issues, so it's possible you may have to shutdown the kernel and restart without running the paraphraser first to run this next portion. If doing so, reload the df that was written to disk in several cells above.  

## Load in desired dataset and classifier model
In the cell below, define the dataset you want to work with and the classifier model.

In [33]:
model_config = AutoConfig.from_pretrained(model_args_joint.model_name_or_path, 
                                          cache_dir=model_args_joint.cache_dir)
tokenizer = AutoTokenizer.from_pretrained(model_args_joint.model_name_or_path, 
                                          cache_dir=model_args_joint.cache_dir,
                                          model_max_length = data_args_joint.max_seq_len)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at ../models/cache/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd8f468e81fe979ada957a8c361
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 30522
}

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at ../models/cache/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.d423bdf2f58dc8b77d5f5d18028d7ae4a72dcfd

In [34]:
# Load data as expected by joint classifier
tasks = data_args_joint.task.split('+')
train_dataset, idx_to_classes = jld(data_args_joint.data_dir, 
                                             tokenizer, 
                                             model_name=model_args_joint.model_name_or_path, 
                           tasks=tasks, mode="train", n_proc=6000)
dev_dataset, _ = jld(data_args_joint.data_dir, 
                              tokenizer, 
                              model_name=model_args_joint.model_name_or_path, 
                              tasks=tasks, mode="dev", n_proc=6000)

100%|██████████| 14/14 [00:02<00:00,  4.92it/s]


torch.Size([81523, 64]) torch.Size([81523, 64]) torch.Size([81523, 2]) torch.Size([81523])


100%|██████████| 6/6 [00:01<00:00,  5.70it/s]
 25%|██▌       | 1/4 [00:00<00:00,  8.07it/s]

torch.Size([113262, 64]) torch.Size([113262, 64]) torch.Size([113262, 2]) torch.Size([113262])


100%|██████████| 4/4 [00:00<00:00,  6.75it/s]
 50%|█████     | 1/2 [00:00<00:00,  9.91it/s]

torch.Size([20306, 64]) torch.Size([20306, 64]) torch.Size([20306, 2]) torch.Size([20306])


100%|██████████| 2/2 [00:00<00:00,  7.66it/s]

torch.Size([28322, 64]) torch.Size([28322, 64]) torch.Size([28322, 2]) torch.Size([28322])





In [35]:
label_dims = {task : 1 if len(list(idx_to_classes[task].keys())) == 2 else len(list(idx_to_classes[task].keys())) for task in idx_to_classes}

In [36]:
joint_model = JointSeqClassifier.from_pretrained(os.path.join(output_dir,
                                                              model_args_joint.model_nick, joint_task),
                                           tasks=tasks,
                                           model_args=model_args_joint,
                                           task_if_single=None, 
                                           joint = training_args_joint.train_jointly,
                                           label_dims=label_dims)

trainer = JointTrainer([training_args_joint,model_args_joint, data_args_joint], 
                       joint_model, train_dataset, dev_dataset, idx_to_classes)

loading configuration file ../models/distilbert_uncased_2/abstract+shakespeare/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "JointSeqClassifier"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 30522
}

loading weights file ../models/distilbert_uncased_2/abstract+shakespeare/pytorch_model.bin
All model checkpoint weights were used when initializing JointSeqClassifier.

All the weights of JointSeqClassifier were initialized from the model checkpoint at ../models/distilbert_uncased_2/abstract+shakespeare.
If your task is similar to the task the model of the

## Run classifier on paraphrased and original text

This is currently done with pd DataFrames but could probably be made better by using a batch data loader. 

In [37]:
import scipy.stats as ss
from tqdm import tqdm
tqdm.pandas()

In [38]:
tasks

['abstract', 'shakespeare']

In [39]:
def pred_paraphrases(row, tasks, cols):
    '''
    Make style predictions on a given df row for a given set of text columns
    and classification tasks. 
    '''
    preds = {}
    for col in cols:
        sentence = row[col]
        out = trainer.predict_for_sentence(sentence, tokenizer)
        for task in tasks:
            pred = float(out[task]['prob'])
            preds[task + '_' + col] = pred
    return preds

def get_best_pred(row, cols, target_val=0.5):
    '''
    Helper funtion for determiningg which paraphrase is 'best' 
    for a given set of paraphrase column style scores and a target value
    that you want the scores to be close to. Currently just outputs the best score
    but could be modified to get best sentence as well.
    '''
    best_diff = 1
    best_val = None
    for col in cols:
        diff = abs(row[col] - target_val)
        if diff < best_diff:
            best_val = row[col]
            best_diff = diff
    return best_val

In [40]:
# Define columns on which to run the classification
cols_to_use = ['orig_text', 'paraphrase','transfered1', 'transfered2', 'transfered3']
# Define the names of the columns where the output scores will be stored
cols_preds = [f'pred_{tasks[0]}_orig', f'pred_{tasks[1]}_orig',
              f'pred_{tasks[0]}_para', f'pred_{tasks[1]}_para',
              f'pred_{tasks[0]}_transfered1', f'pred_{tasks[1]}_transfered1',
              f'pred_{tasks[0]}_transfered2', f'pred_{tasks[1]}_transfered2',
              f'pred_{tasks[0]}_transfered3', f'pred_{tasks[1]}_transfered3']
# Store results into df
df[cols_preds] = df.progress_apply(lambda x : pred_paraphrases(x, tasks, cols_to_use), 
                                   axis=1, result_type="expand")

100%|██████████| 5000/5000 [04:58<00:00, 16.72it/s]


## Analysing the results of the transfer for style changes

In [41]:
df.head(100)

Unnamed: 0,paraphrase,orig_text,transfered1,transfered2,transfered3,pred_formality_orig,pred_emo_orig,pred_formality_para,pred_emo_para,pred_formality_transfered1,...,pred_abstract_orig,pred_shakespeare_orig,pred_abstract_para,pred_shakespeare_para,pred_abstract_transfered1,pred_shakespeare_transfered1,pred_abstract_transfered2,pred_shakespeare_transfered2,pred_abstract_transfered3,pred_shakespeare_transfered3
0,there's more than two possible outcomes in K.,there are K possible outcomes rather than just...,"In K, there are more than 2 possible outcomes.","In K, there are more than two possible results.","In K, there are more than 2 possible outcomes",0.873103,0.824869,0.880773,0.885823,0.823150,...,0.000399,0.276801,0.000686,0.113090,0.000349,0.304722,0.000363,0.310722,0.999428,0.351737
1,the small number of multiplications in matrix-...,The nonlinear map from the parameter to this c...,The nonlinear parameter map to vector collecti...,The nonlinear parameter map to the vector coll...,The nonlinear parameter map to the vector coll...,0.969080,0.882344,0.956581,0.693250,0.962489,...,0.999576,0.378452,0.999519,0.408578,0.999549,0.411191,0.999552,0.387537,0.999526,0.382447
2,"the template must not be linked to the image, ...",The template need not to be anyhow related to ...,"The template cannot be linked to an image, and...","The template cannot be linked to an image, and...",It is not necessary to link the template to an...,0.916163,0.418875,0.995045,0.535523,0.994128,...,0.000582,0.231495,0.000421,0.143112,0.000348,0.237502,0.000342,0.254806,0.000362,0.209619
3,the initialization of the deep learning classi...,Latent representations from original SNP seque...,Latent representations from original SNP seque...,Latent representations from original SNP seque...,Latent representations from original SNP seque...,0.990662,0.947853,0.993578,0.900966,0.991874,...,0.999584,0.345480,0.000349,0.284943,0.000341,0.298768,0.000343,0.296286,0.999579,0.334723
4,a visual representation of the indoor environm...,The aim of this work is to use Variational Aut...,"For this work, visual representations of indoo...","For this work, visual representations of indoo...","For this work, visual representations of indoo...",0.969464,0.986648,0.995278,0.947575,0.994145,...,0.999578,0.326097,0.000370,0.203785,0.000356,0.225494,0.000350,0.234881,0.999552,0.334261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"in order to provide more information, the resp...",The lower layer responses are transferred to t...,The responses of the lower layers are transfer...,The responses of the lower layers are transfer...,The responses of the lower layers are transfer...,0.981573,0.892616,0.994789,0.830346,0.994796,...,0.999584,0.350174,0.000355,0.230139,0.000347,0.256072,0.000348,0.248210,0.999576,0.316912
96,"however, both departments have failed to excel...","However, existing frameworks fail to excel in ...","However, existing frameworks have failed in bo...","However, existing frameworks have failed in bo...","However, existing frameworks have failed in bo...",0.986414,0.486884,0.994676,0.624792,0.979495,...,0.999564,0.330449,0.000383,0.193641,0.999538,0.302278,0.999572,0.325910,0.999566,0.319097
97,structural changes that can encourage physical...,Understanding the association between specific...,The understanding of association between speci...,The understanding of association between speci...,The understanding of association between speci...,0.990401,0.755807,0.993889,0.806404,0.994012,...,0.999566,0.346098,0.000416,0.201334,0.000363,0.230093,0.999572,0.337382,0.000362,0.237188
98,the results demonstrate that such a declarativ...,Experiments performed on a large computing clu...,The results demonstrate that such declarative ...,The results show that such a afirmative approa...,The results demonstrate that such declarative ...,0.991427,0.985020,0.992018,0.983310,0.991762,...,0.999569,0.338455,0.999520,0.256230,0.999579,0.319539,0.999584,0.325544,0.000354,0.226534


In [42]:
#for prompt_task in ['formality', 'emo']:
df[f'{prompt_task}_diff1'] =  abs(df[f'pred_{prompt_task}_orig'] - df[f'pred_{prompt_task}_transfered1'])
df[f'{prompt_task}_diff2'] =  abs(df[f'pred_{prompt_task}_orig'] - df[f'pred_{prompt_task}_transfered2'])
df[f'{prompt_task}_diff3'] =  abs(df[f'pred_{prompt_task}_orig'] - df[f'pred_{prompt_task}_transfered3'])

KeyError: 'pred_wiki_orig'

In [None]:
# for prompt_task in ['formality', 'emo']:
df[f'{prompt_task}_diff_max'] = df.apply(lambda x : np.max([x[f'{prompt_task}_diff1'], 
                                                       x[f'{prompt_task}_diff2'], 
                                                       x[f'{prompt_task}_diff3']]), axis=1)

In [None]:

# Store results of style classification:
if binary:
    out_filename = paraphrase_task + f'_{mode}_binary_cross_predict_transfers.csv'
else:
    out_filename = paraphrase_task + f'_{mode}_cross_predict_transfers.csv'

df.to_csv(os.path.join(pseudo_data_dir, prompt_task, out_filename), header=True, index=False)

Let's look at best sytle difference summary stats

In [None]:
df[f'{prompt_task}_diff_max'].describe()

Let's disaggregate by class label

In [None]:
df.columns

In [None]:
# df[df['para_bucket']=='low'][f'{prompt_task}_diff_max'].describe()

In [None]:
# df[df['para_bucket']=='mid'][f'{prompt_task}_diff_max'].describe()


# Temp for running analysis

In [None]:
import os 
import pandas as pd
import numpy as np

data_dir = '../data/pseudo/'
model_name = 'abstract'
dataset = 'abstract'
mode = 'dev'
binary = True

prompt_task = dataset 

in_filename = f'{model_name}_{mode}_cross_predict_transfers.csv'
full_path = os.path.join(data_dir, model_name, in_filename)
parallel_df = pd.read_csv(full_path)

In [None]:
parallel_df[f'{dataset}_diff1'] =  abs(parallel_df[f'pred_{dataset}_orig'] - parallel_df[f'pred_{dataset}_transfered1'])
parallel_df[f'{dataset}_diff2'] =  abs(parallel_df[f'pred_{dataset}_orig'] - parallel_df[f'pred_{dataset}_transfered2'])
parallel_df[f'{dataset}_diff3'] =  abs(parallel_df[f'pred_{dataset}_orig'] - parallel_df[f'pred_{dataset}_transfered3'])

parallel_df[f'{dataset}_para_diff1'] =  abs(parallel_df[f'pred_{dataset}_para'] - parallel_df[f'pred_{dataset}_transfered1'])
parallel_df[f'{dataset}_para_diff2'] =  abs(parallel_df[f'pred_{dataset}_para'] - parallel_df[f'pred_{dataset}_transfered2'])
parallel_df[f'{dataset}_para_diff3'] =  abs(parallel_df[f'pred_{dataset}_para'] - parallel_df[f'pred_{dataset}_transfered3'])

parallel_df[f'{dataset}_para_orig_diff'] = abs(parallel_df[f'pred_{dataset}_orig'] - parallel_df[f'pred_{dataset}_para'])

In [None]:
parallel_df[f'{dataset}_orig_diff_max'] = parallel_df.apply(lambda x : np.max([x[f'{dataset}_diff1'], 
                                                       x[f'{dataset}_diff2'], 
                                                       x[f'{dataset}_diff3']]), axis=1)

parallel_df[f'{dataset}_para_diff_max'] = parallel_df.apply(lambda x : np.max([x[f'{dataset}_para_diff1'], 
                                                       x[f'{dataset}_para_diff2'], 
                                                       x[f'{dataset}_para_diff3']]), axis=1)

In [None]:
orig_diff_mean = parallel_df[f'{dataset}_orig_diff_max'].mean()
orig_diff_std = parallel_df[f'{dataset}_orig_diff_max'].std()
print(f'orig_diff {orig_diff_mean :.4f} ({orig_diff_std :.4f}) ')
para_diff_mean = parallel_df[f'{dataset}_para_diff_max'].mean()
para_diff_std = parallel_df[f'{dataset}_para_diff_max'].std()
print(f'para_diff {para_diff_mean :.4f} ({para_diff_std :.4f}) ')

In [None]:
parallel_df.head()

In [None]:
parallel_df['transfer_best_style'] = parallel_df.apply(lambda x : \
                                                       np.min([x[f'pred_{dataset}_transfered1'], 
                                                       x[f'pred_{dataset}_transfered2'], 
                                                       x[f'pred_{dataset}_transfered3']]), axis=1)

In [None]:
parallel_df[f'pred_{dataset}_transfered3'].describe()

In [None]:
parallel_df[f'pred_{dataset}_orig'].describe()