In [1]:
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

In [2]:
# Checkpoint directory holding both the model weights and the tokenizer files.
model_name_or_path = '/home/chen/SimCSE/result/my-sup-simcse-bert-large-hard_neg1-batch-512-stsb'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Load the encoder and move it to the chosen device in one step.
model = AutoModel.from_pretrained(model_name_or_path).to(device)

In [3]:
import pandas as pd
import spacy
from tqdm import tqdm
# spaCy pipeline loaded from the 'ja_ginza' package; used below to split each
# sentence into tokens and to read each token's coarse POS tag (token.pos_).
nlp = spacy.load('ja_ginza')

In [4]:
# Tab-separated file without a header row: one sentence pair per line.
dataset_path = '/home/chen/relevant_content_words/dataset/QAbot-relevant.csv'
df = pd.read_csv(
    dataset_path,
    sep='\t',
    header=None,
    names=["sent1", "sent2"],
)

In [5]:
def cos_sim(v1, v2) -> float:
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard the 0/0 case: a NaN here would silently poison the min()/max()
    # comparisons performed on the similarity scores later on.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

In [6]:
# Mean embedding
def get_embeddings(inputs):
    """Return attention-mask-weighted mean-pooled embeddings for a tokenized batch.

    Args:
        inputs: Mapping of keyword arguments for the global ``model`` (as
            returned by the tokenizer with ``return_tensors='pt'``). It must
            contain an ``'attention_mask'`` entry.
            NOTE(review): assumed to already be on the same device as
            ``model`` -- the callers in this notebook move it there first.

    Returns:
        CPU tensor holding, for each input sequence, the sum of the last
        hidden states over the positions whose mask value is 1, divided by
        the number of such positions.
    """
    with torch.no_grad():
        outputs = model(**inputs, return_dict=True)
        # Only the last hidden states are needed; the pooler output is not
        # read, so models that do not expose one work as well.
        last_hidden = outputs.last_hidden_state
        mask = inputs['attention_mask']
        # Zero out padded positions before summing over the sequence axis.
        summed = (last_hidden * mask.unsqueeze(-1)).sum(1)
        # clamp(min=1) avoids a 0/0 NaN if a row of the mask is entirely zero.
        token_counts = mask.sum(-1).unsqueeze(-1).clamp(min=1)
        pooled = summed / token_counts

    return pooled.cpu()

In [7]:
# For every sentence pair, find the token(s) whose deletion lowers the pair's
# cosine similarity the most, and record the POS tag of each such token.
MAX_SEQ_LEN = 32  # token limit passed to every tokenizer call in this cell


def _embed_sentence(text):
    """Tokenize one sentence and return its pooled embedding with size-1 dims squeezed out."""
    encoded = tokenizer(text, return_tensors='pt', padding=True,
                        max_length=MAX_SEQ_LEN, truncation=True)
    return get_embeddings(encoded.to(device)).squeeze()


def _removal_score(token_text, sentence1, sentence2, sent1_vec, sent2_vec, base_sim):
    """Return how much deleting ``token_text`` reduces the sentence-pair similarity.

    The token text is removed from each sentence in turn; the score is the
    original similarity minus the lower of the two resulting similarities.
    """
    # NOTE(review): str.replace deletes *every* occurrence of the text, including
    # occurrences inside other tokens -- kept as in the original analysis.
    ablated1_vec = _embed_sentence(sentence1.replace(token_text, ''))
    ablated2_vec = _embed_sentence(sentence2.replace(token_text, ''))
    sim_without_in_1 = cos_sim(ablated1_vec, sent2_vec)
    sim_without_in_2 = cos_sim(sent1_vec, ablated2_vec)
    return base_sim - min(sim_without_in_1, sim_without_in_2)


pos_total_list = []
for i in tqdm(range(len(df))):
    sentence1 = df['sent1'][i]
    sentence2 = df['sent2'][i]

    sent1_embedding = _embed_sentence(sentence1)
    sent2_embedding = _embed_sentence(sentence2)
    cossim_sent1andsent2 = cos_sim(sent1_embedding, sent2_embedding)

    doc_sent1 = nlp(sentence1)
    doc_sent2 = nlp(sentence2)

    # Every token of sentence 1 is scored; tokens of sentence 2 are scored only
    # when their text did not already occur as a sentence-1 token.
    sent1_token_texts = {token.text for token in doc_sent1}
    candidate_tokens = list(doc_sent1) + [
        token for token in doc_sent2 if token.text not in sent1_token_texts
    ]

    pos_list = [token.pos_ for token in candidate_tokens]
    cos_relevant_score = [
        _removal_score(token.text, sentence1, sentence2,
                       sent1_embedding, sent2_embedding, cossim_sent1andsent2)
        for token in candidate_tokens
    ]

    # Keep the POS tag of every token tied for the highest score; the maximum
    # is computed once instead of once per element.
    if cos_relevant_score:
        best_score = max(cos_relevant_score)
        pos_total_list.extend(
            pos for pos, score in zip(pos_list, cos_relevant_score)
            if score == best_score
        )

100%|███████████████████████████████████████████| 53/53 [01:15<00:00,  1.43s/it]


In [8]:
# Number of POS tags collected. This can exceed the number of sentence pairs,
# because every token tied for the top score in a pair contributes its tag.
len(pos_total_list)

62

In [9]:
# Peek at the first 20 collected POS tags.
# NOTE(review): the saved output of this cell contains 'AUX', while the saved
# summary further down reports AUX at 0.00% -- the stored outputs appear to come
# from different runs. Restart the kernel and run all cells before relying on them.
pos_total_list[:20]

['AUX',
 'ADP',
 'ADP',
 'NOUN',
 'ADP',
 'ADP',
 'ADP',
 'AUX',
 'VERB',
 'AUX',
 'NOUN',
 'ADP',
 'NOUN',
 'VERB',
 'NOUN',
 'AUX',
 'NOUN',
 'NOUN',
 'AUX',
 'ADP']

In [9]:
def unique(lists):
    """Print each distinct element of ``lists`` on its own line.

    Elements are printed in order of first appearance, so the output is
    reproducible from run to run (iterating a ``set`` of strings is not,
    because string hashing is randomized per process).

    Args:
        lists: Iterable of hashable items.

    Returns:
        None. The values are only printed.
    """
    # dict keys are unique and keep insertion order.
    for x in dict.fromkeys(lists):
        print(x)

In [10]:
# Print the distinct POS tags that were collected.
unique(pos_total_list)

VERB
ADP
PROPN
ADJ
X
NOUN
NUM
PUNCT


In [30]:
# Look up the human-readable description of the "X" tag.
spacy.explain("X")

'other'

In [31]:
# Look up the human-readable description of the "SYM" tag.
spacy.explain("SYM")

'symbol'

In [11]:
# Tally how often each POS tag was collected. Tags that are not listed here
# are ignored, exactly as an unmatched if/elif chain would ignore them.
pos_counts = dict.fromkeys(
    ["PART", "SYM", "X", "AUX", "ADP", "ADV", "ADJ", "NUM",
     "SCONJ", "PUNCT", "PROPN", "NOUN", "VERB", "CCONJ", "PRON"],
    0,
)
for tag in pos_total_list:
    if tag in pos_counts:
        pos_counts[tag] += 1

# Expose the totals under the per-tag names used by the cells below.
PART_num = pos_counts["PART"]
SYM_num = pos_counts["SYM"]
X_num = pos_counts["X"]
AUX_num = pos_counts["AUX"]
ADP_num = pos_counts["ADP"]
ADV_num = pos_counts["ADV"]
ADJ_num = pos_counts["ADJ"]
NUM_num = pos_counts["NUM"]
SCONJ_num = pos_counts["SCONJ"]
PUNCT_num = pos_counts["PUNCT"]
PROPN_num = pos_counts["PROPN"]
NOUN_num = pos_counts["NOUN"]
VERB_num = pos_counts["VERB"]
CCONJ_num = pos_counts["CCONJ"]
PRON_num = pos_counts["PRON"]

In [12]:
# Container for the formatted per-tag percentage strings built below.
results = []

In [13]:
# Quick check: share of NOUN among all collected POS tags, in percent.
noun_share = NOUN_num / len(pos_total_list) * 100
print('NOUN_per:' + "%.2f" % noun_share)

NOUN_per:56.45


In [14]:
# Build the whole percentage report in one step. Assigning the list (instead of
# appending cell by cell) keeps the result identical however often this cell is
# re-run; appending would add duplicate entries on every re-run.
pos_tag_counts = [
    ('SYM', SYM_num),
    ('X', X_num),
    ('AUX', AUX_num),
    ('ADP', ADP_num),
    ('ADV', ADV_num),
    ('ADJ', ADJ_num),
    ('NUM', NUM_num),
    ('SCONJ', SCONJ_num),
    ('PUNCT', PUNCT_num),
    ('PROPN', PROPN_num),
    ('NOUN', NOUN_num),
    ('VERB', VERB_num),
    ('PART', PART_num),
    ('CCONJ', CCONJ_num),
    ('PRON', PRON_num),
]
total_selected = len(pos_total_list)
# Each entry looks like 'NOUN_per:56.45'; an empty tag list yields 0.00 for
# every tag instead of raising ZeroDivisionError.
results = [
    tag + '_per:' + "%.2f" % (count / total_selected * 100 if total_selected else 0.0)
    for tag, count in pos_tag_counts
]

In [29]:
# Display the formatted per-tag percentages.
results

['SYM_per:0.00',
 'X_per:3.23',
 'AUX_per:0.00',
 'ADP_per:3.23',
 'ADV_per:0.00',
 'ADJ_per:9.68',
 'NUM_per:1.61',
 'SCONJ_per:0.00',
 'PUNCT_per:6.45',
 'PROPN_per:14.52',
 'NOUN_per:56.45',
 'VERB_per:4.84',
 'PART_per:0.00',
 'CCONJ_per:0.00',
 'PRON_per:0.00']

In [30]:
# Write the report to disk, one entry per line. The with-block closes the file
# on exit, so no explicit close() call is needed afterwards.
file_out = 'SimCSE_large_sup_QAbot_relevantwords.txt'
with open(file_out, 'w', encoding = 'utf-8') as f:
    f.writelines(entry.strip() + '\n' for entry in results)