## Imports

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import torch

from transformers import AutoTokenizer, AutoModel
# Sentence-BERT checkpoint used to embed sentences for the similarity score.
# NOTE: `model_name` is reassigned in the configuration cell further down, so
# it only identifies this checkpoint until that cell runs.
model_name = "sentence-transformers/stsb-distilbert-base"
# Load the matching tokenizer and encoder from the Hugging Face model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [2]:
# Attention-mask-aware mean pooling, so masked-out positions do not dilute the average
def mean_pooling(model_output, attention_mask):
    '''
    Collapse per-token embeddings into one vector per sentence.

    model_output: indexable whose first element holds the token embeddings
        (assumed to be shaped (batch, tokens, hidden) -- confirm against the model).
    attention_mask: mask with one entry per token; it is broadcast over the
        last embedding dimension and used as the averaging weight.

    Returns the mask-weighted sum over dimension 1 divided by the mask total.
    '''
    hidden_states = model_output[0]
    # Stretch the mask to the embedding shape so it can weight every component.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # The clamp avoids a division by zero for a fully masked row.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts


def get_sent_embedding(sent, model, tokenizer):
    '''
    Embed `sent` with the given transformer model and tokenizer.

    The input is tokenized (padded, truncated to 128 tokens, returned as
    PyTorch tensors), run through the model without gradient tracking, and
    the token embeddings are mean pooled using the attention mask.
    '''
    inputs = tokenizer(sent, return_tensors='pt', max_length=128,
                       truncation=True, padding=True)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    return mean_pooling(outputs, inputs['attention_mask'])
    
def len_pen(sent_orig, sent_generated, tokenizer=None):
    '''
    Length penalty for the similarity score.

    Each sentence is tokenized (truncated to 128 tokens) and its length is
    taken as the width of the returned attention mask. The penalty is
    exp(1 - longer / shorter): exactly 1 for equal lengths and smaller the
    more the lengths differ, in either direction.

    tokenizer: tokenizer used to measure the lengths. Optional for backward
        compatibility; when omitted, the notebook-level `tokenizer` is used,
        which is what this function always did before.
    '''
    if tokenizer is None:
        # The parameter shadows the notebook-level name, so look it up explicitly.
        tokenizer = globals()['tokenizer']

    def token_count(sent):
        encoded = tokenizer(sent, padding=True, truncation=True,
                            max_length=128, return_tensors='pt')
        return encoded['attention_mask'].shape[1]

    len_orig = token_count(sent_orig)
    len_generated = token_count(sent_generated)
    return np.exp(1 - (np.max([len_orig, len_generated]) /
                       np.min([len_orig, len_generated])))


def get_similarity(sent_orig, sent_generated, model,
                   tokenizer, length_penalty=True, alpha=0.4):
    '''
    Get the sentence similarity score between two sentences.

    The score is the cosine similarity of the two sentence embeddings. When
    `length_penalty` is true it is multiplied by len_pen(...) ** alpha, with
    the lengths measured by the same `tokenizer` that produced the embeddings.

    Returns the similarity as a tensor (one value per input pair).
    '''
    embedding_orig = get_sent_embedding(sent_orig, model, tokenizer)
    embedding_generated = get_sent_embedding(sent_generated, model, tokenizer)
    sim = torch.nn.functional.cosine_similarity(embedding_orig,
                                                embedding_generated, dim=1)
    if length_penalty:
        # Pass the caller's tokenizer through instead of relying on the global one.
        penalty = len_pen(sent_orig, sent_generated, tokenizer)
        sim *= penalty**alpha
    return sim
    

In [3]:
# Example

# get_similarity('the world is a vampire', 'hello world', model, tokenizer)

# get_similarity('me and my friends really really like u', 
#                'My friends and I quite like you.', model, tokenizer)

## Define model and dataset we're working with

In [4]:
# Location of the data and identity of the run being evaluated.
data_dir = '../data/pseudo/'
model_name = 'shakespeare_binary'
dataset = 'shakespeare'
mode = 'dev'

# Run flags: binary vs. bucketed targets, single-task vs. joint transfer.
binary = True
joint = False
joint_transfer_tasks = ['formality', 'emo']
num_return_sequences = 3

# Binary runs carry an extra "binary" marker in the input file name.
in_filename = (f'{dataset}_{mode}_binary_cross_predict_transfers.csv'
               if binary
               else f'{dataset}_{mode}_cross_predict_transfers.csv')
full_path = os.path.join(data_dir, dataset, in_filename)

## Load Data:
Be careful not to overwrite the dataframe before results have been saved.

In [5]:
# Read the CSV selected by `full_path` in the configuration cell and preview its first rows.
parallel_df = pd.read_csv(full_path)
parallel_df.head()

Unnamed: 0,paraphrase,orig_text,transfered1,transfered2,transfered3,pred_abstract_orig,pred_shakespeare_orig,pred_abstract_para,pred_shakespeare_para,pred_abstract_transfered1,pred_shakespeare_transfered1,pred_abstract_transfered2,pred_shakespeare_transfered2,pred_abstract_transfered3,pred_shakespeare_transfered3,shakespeare_diff1,shakespeare_diff2,shakespeare_diff3,shakespeare_diff_max
0,I'm sure you won't marry her.,"But thus, I trust, you will not marry her.",I am sure you will not marry her.,I know you will not marry her.,I am sure thou willst not marry her.,0.040968,0.997252,0.000403,0.000557,0.000232,0.00253,0.000307,0.050229,0.167474,0.999019,0.994722,0.947023,0.001767,0.994722
1,stand in front of the hearse!,Stand from the hearse.,Stand in front of hearse.,Stand in the hearse.,Stand in the hearse’s front!,0.115325,0.998852,0.00032,0.012139,0.00127,0.590538,0.056994,0.998052,0.196209,0.999079,0.408314,0.0008,0.000227,0.408314
2,"I'm not going to walk out of the door, but som...","I have no will to wander forth of doors, Yet s...","I’ll not walk out of my door, But something le...","I’ll not walk out of my door, But something le...","I’ll not walk out of my door, But something le...",0.138364,0.998991,0.000409,0.000621,0.223029,0.999028,0.156101,0.998955,0.183395,0.998951,3.7e-05,3.6e-05,4e-05,4e-05
3,how do you mean removing him?,"How do you mean, removing of him?",How mean you to remove him?,How dost thou mean to remove him?,How dost thou mean removing thee?,0.002919,0.918759,0.000276,0.00098,0.006401,0.981065,0.263301,0.999117,0.238432,0.99912,0.062306,0.080357,0.080361,0.080361
4,"O Thou, I'm a captain, and I'm a gracious eye ...","O Thou, whose captain I account myself, Look o...","O Thou, i' th' captain, and my gracious eye up...","O Thou, i' th' captain, and gracious eye upon ...","O Thou, captain, and gracious eye upon my forces.",0.337572,0.999085,0.110731,0.998729,0.352569,0.999073,0.353515,0.999066,0.33141,0.999099,1.1e-05,1.8e-05,1.4e-05,1.8e-05


In [6]:
def get_sim_scores(row, orig_col, generated_cols, model, tokenizer):
    '''
    Score one DataFrame row: similarity of a reference text to each generated text.

    Intended for `DataFrame.apply(..., axis=1)`.

    row: the DataFrame row.
    orig_col: name of the column holding the reference text.
    generated_cols: names of the columns holding the generated texts.
    model, tokenizer: forwarded to `get_similarity`.

    Returns a dict mapping "sim_score_<column>" to the similarity as a
    Python float, one entry per column in `generated_cols`.
    '''
    reference = row[orig_col]
    return {
        f"sim_score_{col}": get_similarity(reference, row[col], model, tokenizer).item()
        for col in generated_cols
    }

## Run the similarity model comparing transfers, originals, and paraphrases

In [7]:
orig_col, para_col = 'orig_text', 'paraphrase'
generated_cols = ['transfered1', 'transfered2', 'transfered3']
orig_score_cols = ['sim_score_orig_' + col for col in generated_cols]
para_score_cols = ['sim_score_para_' + col for col in generated_cols]

# Score the transfers first against the original text, then against the paraphrase.
for source_col, score_cols in ((orig_col, orig_score_cols),
                               (para_col, para_score_cols)):
    temp = parallel_df.progress_apply(
        lambda x, source_col=source_col: get_sim_scores(x, source_col,
                                                        generated_cols,
                                                        model, tokenizer),
        axis=1, result_type="expand")
    parallel_df[score_cols] = temp

100%|██████████| 9453/9453 [23:38<00:00,  6.66it/s]
100%|██████████| 9453/9453 [31:33<00:00,  4.99it/s]


In [8]:
# Similarity between each original text and its paraphrase.
orig_para_scores = parallel_df.progress_apply(
    lambda x: get_sim_scores(x, orig_col, [para_col], model, tokenizer),
    axis=1, result_type="expand")
parallel_df['sim_score_orig_para'] = orig_para_scores


100%|██████████| 9453/9453 [10:46<00:00, 14.63it/s]


## Get best similarities for each of the transfers
Also save results.

In [9]:
# Per row, keep the highest similarity among the three transfers,
# once relative to the original text and once relative to the paraphrase.
for source in ('orig', 'para'):
    score_cols = [f'sim_score_{source}_transfered{k}' for k in (1, 2, 3)]
    parallel_df[f'best_sim_{source}'] = parallel_df.apply(
        lambda x, score_cols=score_cols: np.max([x[col] for col in score_cols]),
        axis=1)

# Save next to the input; binary runs keep the "binary" marker in the name.
out_filename = (f'{dataset}_{mode}_binary_cross_predict_transfers_sim_scores.csv'
                if binary
                else f'{dataset}_{mode}_cross_predict_transfers_sim_scores.csv')
full_path = os.path.join(data_dir, dataset, out_filename)
parallel_df.to_csv(full_path, index=False)

In [10]:
# Reload the scored table from disk so the cells below can run without re-scoring.
in_filename = (f'{dataset}_{mode}_binary_cross_predict_transfers_sim_scores.csv'
               if binary
               else f'{dataset}_{mode}_cross_predict_transfers_sim_scores.csv')
full_path = os.path.join(data_dir, dataset, in_filename)
parallel_df = pd.read_csv(full_path)

## Compute style differences

In [11]:
# List the columns of the reloaded, scored table.
parallel_df.columns

Index(['paraphrase', 'orig_text', 'transfered1', 'transfered2', 'transfered3',
       'pred_abstract_orig', 'pred_shakespeare_orig', 'pred_abstract_para',
       'pred_shakespeare_para', 'pred_abstract_transfered1',
       'pred_shakespeare_transfered1', 'pred_abstract_transfered2',
       'pred_shakespeare_transfered2', 'pred_abstract_transfered3',
       'pred_shakespeare_transfered3', 'shakespeare_diff1',
       'shakespeare_diff2', 'shakespeare_diff3', 'shakespeare_diff_max',
       'sim_score_orig_transfered1', 'sim_score_orig_transfered2',
       'sim_score_orig_transfered3', 'sim_score_para_transfered1',
       'sim_score_para_transfered2', 'sim_score_para_transfered3',
       'sim_score_orig_para', 'best_sim_orig', 'best_sim_para'],
      dtype='object')

In [None]:
# Joint runs have one classifier score per task; single-task runs only have `dataset`.
style_tasks = joint_transfer_tasks if joint else [dataset]

for task in style_tasks:
    # Absolute classifier-score gap between the original text and each transfer.
    for k in (1, 2, 3):
        parallel_df[f'{task}_diff{k}'] = \
            (parallel_df[f'pred_{task}_orig'] - parallel_df[f'pred_{task}_transfered{k}']).abs()

    # Same gap, measured from the paraphrase instead of the original.
    for k in (1, 2, 3):
        parallel_df[f'{task}_para_diff{k}'] = \
            (parallel_df[f'pred_{task}_para'] - parallel_df[f'pred_{task}_transfered{k}']).abs()

    # Gap between the original text and its paraphrase.
    parallel_df[f'{task}_para_orig_diff'] = \
        (parallel_df[f'pred_{task}_orig'] - parallel_df[f'pred_{task}_para']).abs()

    # Largest gap over the three transfers, per row.
    for max_col, diff_prefix in ((f'{task}_orig_diff_max', f'{task}_diff'),
                                 (f'{task}_para_diff_max', f'{task}_para_diff')):
        diff_cols = [f'{diff_prefix}{k}' for k in (1, 2, 3)]
        parallel_df[max_col] = parallel_df.apply(
            lambda x, diff_cols=diff_cols: np.max([x[col] for col in diff_cols]),
            axis=1)

Get the summary stats for style transfer eval metrics and for semantic similarity.

In [None]:
# Summary statistics of the best original-vs-transfer similarity per row.
parallel_df['best_sim_orig'].describe()

In [None]:
# Mean (std) of the best similarity per row: paraphrase first, then original.
best_para = parallel_df['best_sim_para']
best_orig = parallel_df['best_sim_orig']
para_mean, para_std = round(best_para.mean(), 4), round(best_para.std(), 4)
orig_mean, orig_std = round(best_orig.mean(), 4), round(best_orig.std(), 4)
print(f'{para_mean} ({para_std})')
print(f'{orig_mean} ({orig_std})')

In [None]:
# Mean of the largest paraphrase-vs-transfer style difference, rounded to 4 decimals.
round(parallel_df[f'{dataset}_para_diff_max'].mean(), 4)

In [None]:
# Summary statistics of the largest paraphrase-vs-transfer style difference per row.
parallel_df[f'{dataset}_para_diff_max'].describe()

Get the summary stats for style transfer eval metrics and for semantic similarity disaggregated by original class label

In [None]:
# best_sim_orig summary for rows whose 'para_bucket' is 'low'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='low']['best_sim_orig'].describe()

In [None]:
# best_sim_orig summary for rows whose 'para_bucket' is 'mid'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='mid']['best_sim_orig'].describe()

In [None]:
# best_sim_para summary for rows whose 'para_bucket' is 'low'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='low']['best_sim_para'].describe()

In [None]:
# best_sim_para summary for rows whose 'para_bucket' is 'mid'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='mid']['best_sim_para'].describe()

In [None]:
# Largest original-vs-transfer style difference, for rows whose 'para_bucket' is 'low'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='low'][f'{dataset}_orig_diff_max'].describe()

In [None]:
# Largest original-vs-transfer style difference, for rows whose 'para_bucket' is 'mid'.
# NOTE(review): 'para_bucket' is not created in the visible cells and is not among the
# columns listed after the reload -- confirm the input provides it, else this is a KeyError.
parallel_df[parallel_df['para_bucket']=='mid'][f'{dataset}_orig_diff_max'].describe()

In [None]:
# Preview the last rows, including the columns added by the cells above.
parallel_df.tail()

In [None]:
# Save the table again, now including the style-difference columns.
# The file name must carry the "binary" marker for binary runs, exactly as in
# the earlier save and reload cells; without it a binary run would write to
# the non-binary results file.
if not binary:
    out_filename = f'{dataset}_{mode}_cross_predict_transfers_sim_scores.csv'
else:
    out_filename = f'{dataset}_{mode}_binary_cross_predict_transfers_sim_scores.csv'
full_path = os.path.join(data_dir, dataset, out_filename)
parallel_df.to_csv(full_path, index=False)

In [None]:
# List the columns of the table that was just written to disk.
parallel_df.columns

In [None]:
if not joint:
    # (row label, column) pairs, in the order the markdown table lists them.
    summary_rows = (
        ("Style difference between original and transfers", f'{dataset}_orig_diff_max'),
        ("Style difference between paraphrase and transfers", f'{dataset}_para_diff_max'),
        ("Semantic similarity between original and transfers", 'best_sim_orig'),
        ("Semantic similarity between paraphrase and transfers", 'best_sim_para'),
    )
    print(f"## Summary Stats for {dataset} {mode}")
    print('| Metric     | Mean | Std Dev|')
    print('| ----------- | ----------- |--------|')
    for label, column in summary_rows:
        print(f"| {label} | {parallel_df[column].mean(): .4f} | {parallel_df[column].std(): .4f} |")

In [None]:
if joint:
    # For every joint task: mean (std) of the largest style difference,
    # first relative to the original, then relative to the paraphrase.
    for joint_transfer_task in joint_transfer_tasks:
        for reference in ('orig', 'para'):
            max_diff = parallel_df[f'{joint_transfer_task}_{reference}_diff_max']
            print(f'{reference}_diff {joint_transfer_task} {max_diff.mean():.4f} ({max_diff.std():.4f}) ')

In [None]:
# round(parallel_df.groupby(['formality_para_bucket', 'emo_para_bucket']).size() / len(parallel_df), 4)

In [None]:
# sns.scatterplot(data=parallel_df, x="pred_formality_orig", y='pred_emo_orig')
# plt.savefig('example_jointdist.png')

In [None]:
def _three_buckets(low_upper, mid_upper):
    # Split [0, 1] into low / mid / high at the two given cut points.
    return {'low': (0., low_upper),
            'mid': (low_upper, mid_upper),
            'high': (mid_upper, 1.)}


# Per-task (lower, upper) classifier-score bounds for the low / mid / high buckets.
bucket_bounds = {
    'shakespeare': _three_buckets(0.1, 0.9),
    'formality': _three_buckets(0.2, 0.7),
    'emo': _three_buckets(0.25, 0.7),
    'abstract': _three_buckets(0.1, 0.9),
}

def get_bucket(bucket_bounds, pred, task):
    '''
    Map a prediction score to 'low', 'mid' or 'high' for the given task.

    Only the upper bounds of the 'low' and 'mid' buckets are consulted:
    a score below the 'low' upper bound is 'low', otherwise a score below
    the 'mid' upper bound is 'mid', and everything else is 'high'.
    '''
    task_bounds = bucket_bounds[task]
    for label in ('low', 'mid'):
        if pred < task_bounds[label][1]:
            return label
    return 'high'
    
    
def _add_transfer_buckets(task, column_prefix):
    # Bucket the classifier score of every returned transfer for `task` and
    # store it as `<column_prefix>transfered<k>_bucket`.
    for i in range(num_return_sequences):
        parallel_df[f'{column_prefix}transfered{i+1}_bucket'] = \
            parallel_df.apply(lambda x: get_bucket(bucket_bounds,
                                                   x[f'pred_{task}_transfered{i+1}'],
                                                   task),
                              axis=1)


if joint:
    for joint_transfer_task in joint_transfer_tasks:
        _add_transfer_buckets(joint_transfer_task, f'{joint_transfer_task}_')

    # Share of rows where at least one transfer lands in the original's bucket.
    for joint_transfer_task in joint_transfer_tasks:
        print(f"{joint_transfer_task}")
        hits = [parallel_df[f'{joint_transfer_task}_transfered{k}_bucket'] ==
                parallel_df[f'{joint_transfer_task}_orig_bucket']
                for k in (1, 2, 3)]
        print(round((hits[0] | hits[1] | hits[2]).mean(), 4))

else:
    _add_transfer_buckets(dataset, '')
    print(f"{dataset}")

    if not binary:
        # Bucketed runs: a transfer is correct when it matches the row's 'oring_bucket'.
        hits = [parallel_df[f'transfered{k}_bucket'] == parallel_df[f'oring_bucket']
                for k in (1, 2, 3)]
    else:
        # Binary runs: a transfer is correct when it lands in the 'high' bucket.
        hits = [parallel_df[f'transfered{k}_bucket'] == 'high'
                for k in (1, 2, 3)]

    print(f'accuracy of at least one {round((hits[0] | hits[1] | hits[2]).mean(), 4)}')
    print(f'accuracy of at all {round((np.mean([hit.mean() for hit in hits])), 4)}')

In [None]:
if binary:
    # Share of all transfers (three per row) that fall into each bucket.
    # The per-column counts are combined with fill_value=0: plain `+` aligns
    # on the bucket label, so a bucket missing from any one transfer column
    # would otherwise turn into NaN instead of keeping the other columns' counts.
    bucket_counts = [parallel_df[f'transfered{k}_bucket'].value_counts()
                     for k in (1, 2, 3)]
    total_counts = bucket_counts[0].add(bucket_counts[1], fill_value=0) \
                                   .add(bucket_counts[2], fill_value=0)
    binary_trans_buckets = total_counts / (len(parallel_df) * 3)
    print(f"Transfer buckets rates \n{binary_trans_buckets}")

In [None]:
# Print one randomly drawn row: the full record first, then its texts.
random_sample = parallel_df.sample().iloc[0]
print(random_sample)
print('\n')

# (heading, column) pairs; the second and third transfers are printed without a heading.
for heading, column in (("orig text:\n", 'orig_text'),
                        ("paraphrase:\n", 'paraphrase'),
                        ("transfers:\n", 'transfered1'),
                        ("", 'transfered2'),
                        ("", 'transfered3')):
    print(f"{heading}{random_sample[column]}")

### Plotting and creating summary markdown tables

In [None]:
results_dir = f'../results/{dataset}/{mode}'
# exist_ok avoids the separate existence check (and the race between check and create).
os.makedirs(results_dir, exist_ok=True)


def _save_orig_hist(values, title, xlabel, filename):
    # Draw one histogram of `values` and save it as `filename` inside results_dir.
    fig, axs = plt.subplots(1, 1, figsize=(15, 10))
    axs.set_title(title)
    axs.set_xlabel(xlabel)
    axs.set_ylabel("Counts")
    axs.hist(values, density=False)
    fig.savefig(os.path.join(results_dir, filename))


def _write_orig_summary(frame, heading, filename):
    # Write the mean / std markdown table for `frame` as `filename` inside results_dir.
    rows = (
        ("Style difference between original and transfers", f'{dataset}_orig_diff_max'),
        ("Style difference between paraphrase and transfers", f'{dataset}_para_diff_max'),
        ("Semantic similarity between original and transfers", 'best_sim_orig'),
        ("Semantic similarity between paraphrase and transfers", 'best_sim_para'),
    )
    with open(os.path.join(results_dir, filename), 'w') as summaryfile:
        summaryfile.write(f"{heading}\n")
        summaryfile.write('| Metric     | Mean | Std Dev|\n')
        summaryfile.write('| ----------- | ----------- |--------|\n')
        for label, column in rows:
            summaryfile.write(f"| {label} | {frame[column].mean(): .4f} | {frame[column].std(): .4f} |\n")


# Whole data set: style-difference and similarity histograms plus the summary table.
_save_orig_hist(parallel_df[f'{dataset}_orig_diff_max'].values,
                f"Distribution of Style Classification Differences Between Original Text and Transfer {dataset}",
                "Style Difference",
                f'{dataset}_orig_diffs.png')
_save_orig_hist(parallel_df['best_sim_orig'].values,
                f"Distribution of Semantic Similarity Between Original Text and Transfer {dataset}",
                "Similarity",
                f'{dataset}_orig_sims.png')
_write_orig_summary(parallel_df,
                    f"## Summary Stats for {dataset} {mode}",
                    f'{dataset}_{mode}_orig_diffs.md')

# Same outputs per target-style bucket. The bucket column is not among the
# columns of the binary input shown earlier, so skip this part with a message
# instead of failing with a KeyError when it is absent.
if 'oring_bucket' in parallel_df.columns:
    for target_style in parallel_df['oring_bucket'].unique().tolist():
        filtered = parallel_df[parallel_df['oring_bucket']==target_style]
        _save_orig_hist(filtered[f'{dataset}_orig_diff_max'].values,
                        f"Distribution of Style Classification Differences Between Original Text and Transfer (Target Style {dataset} {target_style})",
                        "Style Difference",
                        f'{dataset}_{target_style}_orig_diffs.png')
        _save_orig_hist(filtered['best_sim_orig'].values,
                        f"Distribution of Semantic Similarity Between Original Text and Transfer (Target Style {dataset} {target_style})",
                        "Similarity",
                        f'{dataset}_{target_style}_orig_sims.png')
        _write_orig_summary(filtered,
                            f"## Summary Stats for {dataset} {mode} (Target Style {dataset} {target_style})",
                            f'{dataset}_{mode}_{target_style}_orig_diffs.md')
else:
    print("No 'oring_bucket' column in parallel_df; skipping per-bucket plots and summaries.")

In [None]:
def _save_para_hist(values, title, xlabel, filename):
    # Draw one histogram of `values` and save it as `filename` inside results_dir.
    fig, axs = plt.subplots(1, 1, figsize=(15, 10))
    axs.set_title(title)
    axs.set_xlabel(xlabel)
    axs.set_ylabel("Counts")
    axs.hist(values, density=False)
    fig.savefig(os.path.join(results_dir, filename))


def _write_para_summary(frame, heading, filename):
    # Write the paraphrase-vs-transfer mean / std table for `frame` as `filename`.
    # Each metric is written once: the earlier version wrote every row twice
    # under near-identical labels ("paraphrased" / "paraphrase").
    rows = (
        ("Style difference between paraphrase and transfers", f'{dataset}_para_diff_max'),
        ("Semantic similarity between paraphrase and transfers", 'best_sim_para'),
    )
    with open(os.path.join(results_dir, filename), 'w') as summaryfile:
        summaryfile.write(f"{heading}\n")
        summaryfile.write('| Metric     | Mean | Std Dev|\n')
        summaryfile.write('| ----------- | ----------- |--------|\n')
        for label, column in rows:
            summaryfile.write(f"| {label} | {frame[column].mean(): .4f} | {frame[column].std(): .4f} |\n")


# Whole data set: style-difference and similarity histograms plus the summary table.
# `results_dir` comes from the previous plotting cell.
_save_para_hist(parallel_df[f'{dataset}_para_diff_max'].values,
                f"Distribution of Style Classification Differences Between Paraphrased Text and Transfer {dataset}",
                "Style Difference",
                f'{dataset}_para_diffs.png')
_save_para_hist(parallel_df['best_sim_para'].values,
                f"Distribution of Semantic Similarity Between Paraphrased Text and Transfer {dataset}",
                "Similarity",
                f'{dataset}_para_sims.png')
_write_para_summary(parallel_df,
                    f"## Summary Stats for {dataset} {mode}",
                    f'{dataset}_{mode}_para_diffs.md')

# Same outputs per target-style bucket. The bucket column is not among the
# columns of the binary input shown earlier, so skip this part with a message
# instead of failing with a KeyError when it is absent.
if 'oring_bucket' in parallel_df.columns:
    for target_style in parallel_df['oring_bucket'].unique().tolist():
        filtered = parallel_df[parallel_df['oring_bucket']==target_style]
        _save_para_hist(filtered[f'{dataset}_para_diff_max'].values,
                        f"Distribution of Style Classification Differences Between Paraphrased Text and Transfer (Target Style {dataset} {target_style})",
                        "Style Difference",
                        f'{dataset}_{target_style}_para_diffs.png')
        _save_para_hist(filtered['best_sim_para'].values,
                        f"Distribution of Semantic Similarity Between Paraphrased Text and Transfer (Target Style {dataset} {target_style})",
                        "Similarity",
                        f'{dataset}_{target_style}_para_sims.png')
        _write_para_summary(filtered,
                            f"## Summary Stats for {dataset} {mode} (Target Style {dataset} {target_style})",
                            f'{dataset}_{mode}_{target_style}_para_diffs.md')
else:
    print("No 'oring_bucket' column in parallel_df; skipping per-bucket plots and summaries.")