In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import torch

from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "sentence-transformers/stsb-distilbert-base"

# Pull the tokenizer first, then the model, from the Hugging Face model repository.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)


In [None]:
# Mean pooling: average the token embeddings, counting only positions kept by the attention mask
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, weighted by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings (assumed shape (batch, seq_len, hidden) -- TODO confirm).
        attention_mask: Mask tensor that is expanded to the embeddings' shape
            after a trailing axis is added (assumed (batch, seq_len) of 0/1).

    Returns:
        Tensor with dim 1 reduced: the masked embedding sum divided by the mask
        total. The divisor is clamped to at least 1e-9, so an all-zero mask
        gives zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total


def get_sent_embedding(sent, model, tokenizer):
    """Embed `sent` by mean-pooling the model's token embeddings.

    Args:
        sent: Text handed to `tokenizer` (padded, truncated to at most 128
            tokens, returned as PyTorch tensors).
        model: Callable invoked with the tokenizer output as keyword arguments.
        tokenizer: Callable producing the model inputs, including
            'attention_mask'.

    Returns:
        The result of `mean_pooling` over the model output and attention mask.
    """
    batch = tokenizer(sent, padding=True, truncation=True,
                      max_length=128, return_tensors='pt')

    # Forward pass only; no gradients are tracked.
    with torch.no_grad():
        outputs = model(**batch)

    # Collapse the per-token embeddings into one vector per input.
    return mean_pooling(outputs, batch['attention_mask'])
    
def len_pen(sent_orig, sent_generated, tok=None):
    """Length penalty that shrinks as the two texts' tokenized lengths diverge.

    Computes exp(1 - longer / shorter), where each length is the width
    (dimension 1) of the tokenizer's attention mask for that text, with
    truncation at max_length=128. Equal lengths give 1.0; any mismatch gives
    a value below 1.

    Args:
        sent_orig: Original text.
        sent_generated: Generated text.
        tok: Tokenizer used to measure the lengths. Defaults to the
            notebook-level `tokenizer`, so existing two-argument calls keep
            working.

    Returns:
        The penalty as a NumPy float.
    """
    if tok is None:
        tok = tokenizer

    def _mask_width(sent):
        # Width of the attention mask; for a single string this is presumably
        # its token count (for a batch it would be the padded width).
        encoded = tok(sent, padding=True, truncation=True,
                      max_length=128, return_tensors='pt')
        return encoded['attention_mask'].shape[1]

    len_orig = _mask_width(sent_orig)
    len_generated = _mask_width(sent_generated)
    return np.exp(1 - (np.max([len_orig, len_generated]) /
                       np.min([len_orig, len_generated])))


def get_similarity(sent_orig, sent_generated, model,
                   tokenizer, length_penalty=True, alpha=0.4):
    """Cosine similarity between the pooled embeddings of two texts.

    Args:
        sent_orig: Original text.
        sent_generated: Generated text to compare against the original.
        model: Encoder passed through to `get_sent_embedding`.
        tokenizer: Tokenizer used for both the embeddings and the length
            penalty.
        length_penalty: When true, scale the similarity by
            `len_pen(...) ** alpha`.
        alpha: Exponent applied to the length penalty.

    Returns:
        Tensor of cosine similarities computed along dim 1 of the embeddings.
    """
    embedding_orig = get_sent_embedding(sent_orig, model, tokenizer)
    embedding_generated = get_sent_embedding(sent_generated, model, tokenizer)
    sim = torch.nn.functional.cosine_similarity(embedding_orig,
                                                embedding_generated, dim=1)
    if length_penalty:
        # Measure lengths with the tokenizer the caller supplied rather than
        # the notebook-level global, so embeddings and penalty agree.
        sim *= len_pen(sent_orig, sent_generated, tokenizer) ** alpha
    return sim
    

In [None]:
# Quick check on two short example sentences.
get_similarity(sent_orig='the world is a vampire', sent_generated='hello world',
               model=model, tokenizer=tokenizer)

In [None]:
# Informal sentence versus a more formal rewording of it.
get_similarity(
    sent_orig='me and my friends really really like u',
    sent_generated='My friends and I quite like you.',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Relative location of the input CSV: <data_dir>/<dataset>/<base_filename>.
data_dir, dataset = '../data/processed_filtered', 'formality'
base_filename = 'formality_train_cross_predict_transfers.csv'
full_path = os.path.join(data_dir, dataset, base_filename)

# Read the whole CSV into memory.
parallel_df = pd.read_csv(full_path)

In [None]:
def get_sim_scores(row, orig_col, generated_cols, model, tokenizer):
    '''
    Score one DataFrame row: semantic similarity between the original
    text and each generated text.

    Meant to be applied row-wise to a pd DataFrame. 'row' is the row,
    'orig_col' is the name of the column holding the original text, and
    'generated_cols' are the names of the columns holding generated text.
    'model' and 'tokenizer' are passed through to get_similarity.

    Returns a dict with one entry per generated column, keyed
    "sim_score_<column name>", whose value is the similarity as a Python
    scalar.
    '''
    reference = row[orig_col]
    return {
        f"sim_score_{col}": get_similarity(reference, row[col], model, tokenizer).item()
        for col in generated_cols
    }

In [None]:
# Take an independent copy of the first 5000 rows: later cells add columns to
# this frame, and assigning onto a bare .head() slice of `parallel_df` raises
# SettingWithCopyWarning.
parallel_df_head = parallel_df.head(5000).copy()

In [None]:
orig_col = 'text'
#generated_cols = ['paraphrased1', 'paraphrased2', 'paraphrased3']
generated_cols = ['transfered1', 'transfered2', 'transfered3']
score_cols = [f"sim_score_{col}" for col in generated_cols]


def _score_row(row):
    """Similarity scores of one row's original text against each generated column."""
    return get_sim_scores(row, orig_col, generated_cols, model, tokenizer)


# Row-wise scoring with a progress bar; "expand" spreads each row's result over columns.
thing = parallel_df_head.progress_apply(_score_row, axis=1, result_type="expand")

# Attach the scores to the frame under the sim_score_* column names.
parallel_df_head[score_cols] = thing

In [None]:
# Preview the first rows instead of rendering the whole 5000-row frame.
parallel_df_head.head()

In [None]:
# Highest similarity among the generated candidates of each row. Vectorised
# row-wise max; skipna=False keeps a row NaN if any score is NaN, as np.max
# over the row values would.
parallel_df_head['best_sim'] = parallel_df_head[score_cols].max(axis=1, skipna=False)

In [None]:
# Absolute difference between the predicted formality of the original text and
# that of each transferred candidate (one formality_diff<N> column per candidate).
for idx in (1, 2, 3):
    parallel_df_head[f'formality_diff{idx}'] = (
        parallel_df_head['pred_formality_orig']
        - parallel_df_head[f'pred_formality_transfered{idx}']
    ).abs()

In [None]:
# Largest formality difference among the three candidates of each row.
# Vectorised row-wise max; skipna=False keeps a row NaN if any diff is NaN,
# as np.max over the row values would.
parallel_df_head['formality_diff_max'] = parallel_df_head[
    ['formality_diff1', 'formality_diff2', 'formality_diff3']
].max(axis=1, skipna=False)

Get the summary stats for style transfer eval metrics and for semantic similarity.

In [None]:
# Summary statistics of the best similarity score per row.
parallel_df_head.loc[:, 'best_sim'].describe()

In [None]:
# Summary statistics of the largest formality difference per row.
parallel_df_head.loc[:, 'formality_diff_max'].describe()

Get the summary stats for style transfer eval metrics and for semantic similarity, disaggregated by original class label (`label` 0 vs. 1).

In [None]:
# Best similarity, rows with label 0 only.
parallel_df_head.loc[parallel_df_head['label'] == 0, 'best_sim'].describe()

In [None]:
# Best similarity, rows with label 1 only.
parallel_df_head.loc[parallel_df_head['label'] == 1, 'best_sim'].describe()

In [None]:
# Largest formality difference, rows with label 0 only. This cell previously
# filtered on label 1, duplicating the next cell and leaving label 0 unreported.
parallel_df_head.loc[parallel_df_head['label'] == 0, 'formality_diff_max'].describe()

In [None]:
# Largest formality difference, rows with label 1 only.
parallel_df_head.loc[parallel_df_head['label'] == 1, 'formality_diff_max'].describe()

In [None]:
# Preview the first rows with all derived columns instead of rendering the whole frame.
parallel_df_head.head()

In [None]:
# Disabled: same scoring as the 5000-row cell, but applied to the full `parallel_df`.
# orig_col = 'text'
# #generated_cols = ['paraphrased1', 'paraphrased2', 'paraphrased3']
# generated_cols = ['transfered1', 'transfered2', 'transfered3']
# score_cols = [f"sim_score_{col}" for col in generated_cols]

# thing = parallel_df.progress_apply(lambda x: get_sim_scores(x, orig_col, generated_cols,
#                                                             model, tokenizer), 
#                                                      axis=1, result_type="expand")
# parallel_df[score_cols] = thing

In [None]:
# Write the scored 5000-row sample next to the input CSV; the row index is not saved.
out_filename = 'formality_train_cross_predict_transfers_sim_scores_head_5000.csv'
full_path = os.path.join(data_dir, dataset, out_filename)
parallel_df_head.to_csv(path_or_buf=full_path, index=False)