In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import torch

from transformers import AutoTokenizer, AutoModel

# Hugging Face model id of the sentence encoder used for the similarity scores below.
model_name = "sentence-transformers/stsb-distilbert-base"

# Load the tokenizer and the encoder for `model_name` from the Hugging Face
# model repository; both are used by the similarity helpers defined below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [2]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, weighting each position by the mask.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings.
        attention_mask: Mask tensor; it is unsqueezed on the last axis and
            expanded to the size of the embeddings, so positions with a 0
            mask contribute nothing to the average.

    Returns:
        The masked sum over dim 1 divided by the (clamped) mask sum.
    """
    embeddings = model_output[0]
    # Broadcast the mask across the embedding dimension as floats.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = (embeddings * mask).sum(dim=1)
    # Clamp so that an all-zero mask cannot cause a division by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total


def get_sent_embedding(sent, model, tokenizer):
    """Encode `sent` with `model` and mean-pool the resulting token embeddings.

    Args:
        sent: Text handed to `tokenizer` (padded, truncated to at most 128 tokens).
        model: Encoder, called with the tokenizer output as keyword arguments.
        tokenizer: Callable returning PyTorch tensors, including 'attention_mask'.

    Returns:
        The result of `mean_pooling` applied to the model output and the
        attention mask.
    """
    tokenizer_kwargs = dict(padding=True, truncation=True,
                            max_length=128, return_tensors='pt')
    batch = tokenizer(sent, **tokenizer_kwargs)

    # Inference only: no gradient tracking is needed for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])
    
def len_pen(sent_orig, sent_generated, tokenizer=None):
    """Length penalty exp(1 - longer/shorter) computed from tokenized lengths.

    The penalty is 1.0 when both inputs tokenize to the same length and
    decreases towards 0 as the length ratio grows.

    Args:
        sent_orig: Original text.
        sent_generated: Generated text to compare against the original.
        tokenizer: Tokenizer used to measure lengths. When omitted, the
            notebook-level `tokenizer` is used (the previous behaviour).

    Returns:
        The penalty as a NumPy float.

    Raises:
        NameError: if no tokenizer is passed and no global `tokenizer` exists.
    """
    if tokenizer is None:
        # Backward compatibility: earlier callers relied on the global tokenizer.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "len_pen needs a tokenizer: pass one explicitly or define a "
                "global `tokenizer`") from None

    def _n_tokens(sent):
        # Length is the second dimension of the attention mask, i.e. the padded
        # sequence length (capped at 128 by truncation).
        # NOTE(review): for a list of sentences this would be the padded batch
        # length rather than a per-sentence length; confirm only single strings
        # are passed.
        encoded = tokenizer(sent, padding=True, truncation=True,
                            max_length=128, return_tensors='pt')
        return encoded['attention_mask'].shape[1]

    len_orig = _n_tokens(sent_orig)
    len_generated = _n_tokens(sent_generated)
    longer = max(len_orig, len_generated)
    shorter = min(len_orig, len_generated)
    return np.exp(1 - longer / shorter)


def get_similarity(sent_orig, sent_generated, model,
                   tokenizer, length_penalty=True, alpha=0.4):
    """Cosine similarity of two sentence embeddings, optionally length-penalised.

    Args:
        sent_orig: Original text.
        sent_generated: Generated text.
        model: Encoder forwarded to `get_sent_embedding`.
        tokenizer: Tokenizer used for the embeddings and for the length penalty.
        length_penalty: When True, multiply the similarity by `len_pen(...) ** alpha`.
        alpha: Exponent applied to the length penalty.

    Returns:
        Tensor of cosine similarities computed along dim 1 of the embeddings.
    """
    embedding_orig = get_sent_embedding(sent_orig, model, tokenizer)
    embedding_generated = get_sent_embedding(sent_generated, model, tokenizer)
    sim = torch.nn.functional.cosine_similarity(embedding_orig,
                                                embedding_generated, dim=1)
    if length_penalty:
        # Use the caller's tokenizer so embeddings and lengths stay consistent.
        penalty = len_pen(sent_orig, sent_generated, tokenizer)
        sim *= penalty**alpha
    return sim
    

In [3]:
# Smoke test on two unrelated sentences (length penalty is enabled by default).
get_similarity('the world is a vampire', 'hello world', model, tokenizer)

tensor([-0.1039])

In [4]:
# Smoke test on a sentence and a close paraphrase of it.
get_similarity('me and my friends really really like u', 'My friends and I quite like you.', model, tokenizer)

tensor([0.8769])

In [5]:
# Input CSV location: <data_dir>/<dataset>/<dataset>_<mode>_cross_predict_transfers.csv
data_dir = '../data/pseudo/'
dataset = 'shakespeare'
mode = 'dev'
base_filename = f'{dataset}_{mode}_cross_predict_transfers.csv'
full_path = os.path.join(data_dir, dataset, base_filename)


# Load the transfers table. `dataset` and `mode` are reused by later cells to
# build column names and the output filename.
parallel_df = pd.read_csv(full_path)

In [6]:
# Preview the first rows and the available columns of the loaded table.
parallel_df.head()

Unnamed: 0,paraphrase,para_bucket,orig_text,oring_bucket,transfered1,transfered2,transfered3,pred_abstract_orig,pred_shakespeare_orig,pred_abstract_transfered1,pred_shakespeare_transfered1,pred_abstract_transfered2,pred_shakespeare_transfered2,pred_abstract_transfered3,pred_shakespeare_transfered3
0,I'm sure you won't marry her.,low,"But thus, I trust, you will not marry her.",mid,I am sure you will not marry her.,I know you will not marry her.,I am sure you’ll not marry her.,0.000403,0.000557,0.000232,0.00253,0.000307,0.050229,0.122297,0.998928
1,stand in front of the hearse!,low,Stand from the hearse.,mid,Stand in front of the hearse!,Stand in front of hearse!,Stand before the hearse!,0.00032,0.012139,0.00032,0.012139,0.000438,0.040431,0.062622,0.99839
2,"I'm not going to walk out of the door, but som...",low,"I have no will to wander forth of doors, Yet s...",high,"I’ll not walk out of the door, But something l...","I’ll not walk out of the door, but something l...","I will not walk out of the door, But something...",0.000409,0.000621,0.164337,0.998977,0.164337,0.998977,0.000302,0.026706
3,how do you mean removing him?,mid,"How do you mean, removing of him?",high,How mean you to remove him?,How dost thou mean removing him?,How mean you removing him?,0.000276,0.00098,0.006401,0.981065,0.204749,0.998964,0.006354,0.977889
4,I'm yours forever.,low,I am your own for ever.,high,I am yours forever.,I am thyself forever.,I am thy lord forever.,0.000332,0.000957,0.058799,0.997922,0.229949,0.999094,0.320483,0.999098


In [7]:
def get_sim_scores(row, orig_col, generated_cols, model, tokenizer):
    '''
    Score every generated text in a DataFrame row against the row's original text.

    Intended to be applied row-wise to a pd DataFrame.

    Parameters:
        row: the DataFrame row, indexed by column name.
        orig_col: name of the column that contains the original text.
        generated_cols: names of the columns that contain generated text.
        model, tokenizer: forwarded unchanged to get_similarity.

    Returns:
        dict mapping "sim_score_<col>" to the similarity (a Python scalar
        obtained with .item()) for every col in generated_cols.
    '''
    reference = row[orig_col]
    return {
        f"sim_score_{name}": get_similarity(reference, row[name],
                                            model, tokenizer).item()
        for name in generated_cols
    }

In [8]:
# Preview the table again before the similarity-score columns are added.
parallel_df.head()

Unnamed: 0,paraphrase,para_bucket,orig_text,oring_bucket,transfered1,transfered2,transfered3,pred_abstract_orig,pred_shakespeare_orig,pred_abstract_transfered1,pred_shakespeare_transfered1,pred_abstract_transfered2,pred_shakespeare_transfered2,pred_abstract_transfered3,pred_shakespeare_transfered3
0,I'm sure you won't marry her.,low,"But thus, I trust, you will not marry her.",mid,I am sure you will not marry her.,I know you will not marry her.,I am sure you’ll not marry her.,0.000403,0.000557,0.000232,0.00253,0.000307,0.050229,0.122297,0.998928
1,stand in front of the hearse!,low,Stand from the hearse.,mid,Stand in front of the hearse!,Stand in front of hearse!,Stand before the hearse!,0.00032,0.012139,0.00032,0.012139,0.000438,0.040431,0.062622,0.99839
2,"I'm not going to walk out of the door, but som...",low,"I have no will to wander forth of doors, Yet s...",high,"I’ll not walk out of the door, But something l...","I’ll not walk out of the door, but something l...","I will not walk out of the door, But something...",0.000409,0.000621,0.164337,0.998977,0.164337,0.998977,0.000302,0.026706
3,how do you mean removing him?,mid,"How do you mean, removing of him?",high,How mean you to remove him?,How dost thou mean removing him?,How mean you removing him?,0.000276,0.00098,0.006401,0.981065,0.204749,0.998964,0.006354,0.977889
4,I'm yours forever.,low,I am your own for ever.,high,I am yours forever.,I am thyself forever.,I am thy lord forever.,0.000332,0.000957,0.058799,0.997922,0.229949,0.999094,0.320483,0.999098


In [9]:
# Source-text columns that each generated text is compared against.
orig_col = 'orig_text'
para_col = 'paraphrase'
# Alternative set of generated columns:
#generated_cols = ['paraphrased1', 'paraphrased2', 'paraphrased3']
generated_cols = ['transfered1', 'transfered2', 'transfered3']
orig_score_cols = [f"sim_score_orig_{col}" for col in generated_cols]
para_score_cols = [f"sim_score_para_{col}" for col in generated_cols]

# One scoring pass per source column: first the original text, then the
# paraphrase. In the recorded run each pass took roughly 16-17 minutes for
# 7763 rows. get_sim_scores names its keys "sim_score_<col>", so the expanded
# result is stored under the target names in generated_cols order.
for source_col, target_cols in ((orig_col, orig_score_cols),
                                (para_col, para_score_cols)):
    temp = parallel_df.progress_apply(
        lambda x: get_sim_scores(x, source_col, generated_cols,
                                 model, tokenizer),
        axis=1, result_type="expand")
    parallel_df[target_cols] = temp

100%|██████████| 7763/7763 [17:05<00:00,  7.57it/s]
100%|██████████| 7763/7763 [15:54<00:00,  8.13it/s]


In [None]:
# Similarity between each original text and its paraphrase, one float per row.
# get_sim_scores returns a dict; without result_type="expand" apply would store
# that dict object in the column, so the single score is extracted explicitly.
parallel_df['sim_score_orig_para'] = parallel_df.progress_apply(
    lambda x: get_sim_scores(x, orig_col, [para_col],
                             model, tokenizer)[f"sim_score_{para_col}"],
    axis=1)

 25%|██▌       | 1960/7763 [01:29<04:26, 21.80it/s]

In [11]:
# Best (highest) similarity among the three transfers for every row, measured
# against the original text and against the paraphrase. A column-wise max
# replaces the row-by-row apply; skipna=False keeps np.max's behaviour of
# propagating NaN instead of ignoring it.
parallel_df['best_sim_orig'] = parallel_df[orig_score_cols].max(axis=1, skipna=False)
parallel_df['best_sim_para'] = parallel_df[para_score_cols].max(axis=1, skipna=False)

In [12]:
# Absolute difference between the pred_<dataset> value of the original text and
# that of each of the three transfers.
for idx in (1, 2, 3):
    parallel_df[f'{dataset}_diff{idx}'] = abs(
        parallel_df[f'pred_{dataset}_orig']
        - parallel_df[f'pred_{dataset}_transfered{idx}'])

In [14]:
# Largest of the three per-transfer differences for every row. A column-wise max
# replaces the row-by-row apply; skipna=False keeps np.max's NaN propagation.
parallel_df[f'{dataset}_diff_max'] = parallel_df[
    [f'{dataset}_diff1', f'{dataset}_diff2', f'{dataset}_diff3']
].max(axis=1, skipna=False)

Get the summary stats for style transfer eval metrics and for semantic similarity.

In [16]:
# Distribution of the best transfer-vs-original similarity per row.
parallel_df['best_sim_orig'].describe()

count    7763.000000
mean        0.729944
std         0.161465
min         0.024768
25%         0.625262
50%         0.751123
75%         0.850518
max         1.000000
Name: best_sim_orig, dtype: float64

In [17]:
# Distribution of the best transfer-vs-paraphrase similarity per row.
parallel_df['best_sim_para'].describe()

count    7763.000000
mean        0.878192
std         0.114952
min         0.193337
25%         0.825335
50%         0.907810
75%         0.959470
max         1.000000
Name: best_sim_para, dtype: float64

In [None]:
# Distribution of the largest per-row difference in pred_<dataset> values.
parallel_df[f'{dataset}_diff_max'].describe()

Get the summary stats for style transfer eval metrics and for semantic similarity disaggregated by original class label

In [10]:
# NOTE(review): this cell fails with KeyError: 'label' (see the recorded error);
# the columns previewed earlier contain no 'label'. 'best_sim' is not created in
# any visible cell either (only 'best_sim_orig' and 'best_sim_para') - confirm
# which column is intended before re-running.
parallel_df[parallel_df['label']==0]['best_sim'].describe()

KeyError: 'label'

In [None]:
# NOTE(review): depends on a 'label' column and a 'best_sim' column, neither of
# which is created in any visible cell (only 'best_sim_orig' / 'best_sim_para'
# exist) - confirm the intended columns before re-running.
parallel_df[parallel_df['label']==1]['best_sim'].describe()

In [None]:
# Summary for class label 0. This cell was a verbatim copy of the label==1 cell
# that follows it, so label 0 was never summarised for this metric.
# NOTE(review): still requires a 'label' column, which no visible cell creates.
parallel_df[parallel_df['label']==0][f'{dataset}_diff_max'].describe()

In [None]:
# Summary for class label 1.
# NOTE(review): requires a 'label' column, which no visible cell creates.
parallel_df[parallel_df['label']==1][f'{dataset}_diff_max'].describe()

In [None]:
# Preview the table including the columns added by the cells above.
parallel_df.head()

In [None]:
# orig_col = 'text'
# #generated_cols = ['paraphrased1', 'paraphrased2', 'paraphrased3']
# generated_cols = ['transfered1', 'transfered2', 'transfered3']
# score_cols = [f"sim_score_{col}" for col in generated_cols]

# thing = parallel_df.progress_apply(lambda x: get_sim_scores(x, orig_col, generated_cols,
#                                                             model, tokenizer), 
#                                                      axis=1, result_type="expand")
# parallel_df[score_cols] = thing

In [None]:
# Write the scored table next to the input file.
out_filename = f'{dataset}_{mode}_cross_predict_transfers_sim_scores.csv'
full_path = os.path.join(data_dir, dataset, out_filename)
# `parallel_df_head` is not defined in any visible cell (NameError on a fresh
# kernel); save the full scored frame instead.
parallel_df.to_csv(full_path, index=False)