### `02b_features_dist_with_BERT_ED`
final modification: 2024/5/2, 10:27

#### 1. load model and libraries

In [1]:
import os, sys
from pathlib import Path

# Absolute path of the notebook's working directory, as a plain string.
CURRENT_DIR = os.fspath(Path().resolve())
# Make the project's src/ folder (one level up) importable from this notebook.
sys.path.append(CURRENT_DIR + "/../src/")

In [2]:
import pandas as pd
import numpy as np
import tqdm
import re
import itertools

from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

In [4]:
#load bert model
#ref1: https://huggingface.co/tohoku-nlp/bert-base-japanese-v3
#ref2: https://www.sbert.net/docs/usage/semantic_textual_similarity.html
#NOTE(review): ref1 links to the v3 model card, but the checkpoint loaded below is v2 -- confirm which version is intended
#NOTE: this checkpoint ships no sentence-transformers config, so the library builds a new model with MEAN pooling (see the message printed when this cell runs)
model = SentenceTransformer("tohoku-nlp/bert-base-japanese-v2")

No sentence-transformers model found with name tohoku-nlp/bert-base-japanese-v2. Creating a new one with MEAN pooling.


In [5]:
#sanity check: encode the same sentence twice
sentences1 = ["日本語は難しい"]
sentences2 = ["日本語は難しい"]

# Encode each one-sentence batch separately, in the same order as before.
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)

In [6]:
#confirm the type returned by encode(..., convert_to_tensor=True)
type(embeddings1)

torch.Tensor

In [7]:
# Euclidean distance between the two embeddings: first (and only) entry of the cdist matrix
torch.cdist(embeddings1, embeddings2)[0, 0].item()

0.0

In [9]:
#sanity check 2: two different sentences
sentence1, sentence2 = ["日本語は難しい"], ["日本語は難解だ"]

embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)

# Read the single pairwise distance out of the cdist result.
torch.cdist(embedding1, embedding2)[0, 0].item()

7.515835762023926

#### 2. prepare Euclidean distances for `3.3. Semantic distances of the topic words, vehicle words, and the features calculated by the language model (BERT) on the metaphor form selection`

##### 2.1 load raw data

In [10]:
#path to the raw sorting-data CSV (NOTE: despite the _DIR suffix this is a file path, not a directory)
RAW_SORTING_DIR = "../data/MetConceptAct_Exp2_Sorting.csv"

#read the CSV into a DataFrame and preview the first rows
raw_sorting_df = pd.read_csv(RAW_SORTING_DIR)
raw_sorting_df.head()

Unnamed: 0,success,timeout,failed_images,failed_audio,failed_video,trial_type,trial_index,time_elapsed,internal_node_id,cwid,...,finloc_T_y,finloc_V_x,finloc_V_y,finloc_F1_x,finloc_F1_y,finloc_F2_x,finloc_F2_y,finloc_F3_x,finloc_F3_y,stimulus
0,1.0,,Array,Array,Array,preload,0,2,0.0-0.0,4287323,...,,,,,,,,,,
1,1.0,,,,,fullscreen,1,2649,0.0-1.0,4287323,...,,,,,,,,,,
2,,,,,,survey-html-form,2,5753,0.0-2.0,4287323,...,,,,,,,,,,
3,,,,,,survey-html-form,3,12969,0.0-3.0,4287323,...,,,,,,,,,,
4,,,,,,survey-html-form,4,21255,0.0-4.0,4287323,...,,,,,,,,,,


In [11]:
#list the available columns; cwid, NID, Topic, Vehicle and F1-F3 are the ones selected below
raw_sorting_df.columns

Index(['success', 'timeout', 'failed_images', 'failed_audio', 'failed_video',
       'trial_type', 'trial_index', 'time_elapsed', 'internal_node_id', 'cwid',
       'Condition', 'StartTime', 'EndTime', 'rt', 'response', 'task',
       'init_locations', 'moves', 'final_locations', 'TrialType', 'SetType',
       'FinStimID', 'NID', 'Topic', 'Vehicle', 'NumFeatures', 'F1', 'F2', 'F3',
       'TargetDirTopic', 'TargetDirVehicle', 'TargetDirF1', 'TargetDirF2',
       'TargetDirF3', 'iniloc_T_x', 'iniloc_T_y', 'iniloc_V_x', 'iniloc_V_y',
       'iniloc_F1_x', 'iniloc_F1_y', 'iniloc_F2_x', 'iniloc_F2_y',
       'iniloc_F3_x', 'iniloc_F3_y', 'finloc_T_x', 'finloc_T_y', 'finloc_V_x',
       'finloc_V_y', 'finloc_F1_x', 'finloc_F1_y', 'finloc_F2_x',
       'finloc_F2_y', 'finloc_F3_x', 'finloc_F3_y', 'stimulus'],
      dtype='object')

##### 2.2 pick topics, vehicles, features, and metaphor IDs

In [12]:
# Filter rows by cwid / NID, keep only the identifier and word columns,
# and drop every row that still has a missing value in those columns.
topic_vehicle_features_df = (
    raw_sorting_df
    .query('cwid in [4287323, 2613017, 4765315] and NID not in ["NaN", "E06", "E13", "E48"]')
    .loc[:, ["cwid", "NID", "Topic", "Vehicle", "F1", "F2", "F3"]]
    .dropna()
)
# Display only: the re-indexed frame is shown but not assigned back.
topic_vehicle_features_df.reset_index()

Unnamed: 0,index,cwid,NID,Topic,Vehicle,F1,F2,F3
0,12,4287323,26,真珠,水滴,美しい,丸い,透き通っている
1,13,4287323,63,時間,洪水,止められない,流れる,逆らえない
2,25,4287323,86,学校,工場,人がたくさんいる,画一的だ,生産する
3,26,4287323,80,煙草,時限爆弾,寿命を縮める,危険だ,燃える
4,27,4287323,39,研究,登山,険しい道のり,達成感がある,苦しい
5,28,4287323,12,衝撃,電気,一瞬の出来事だ,突然来る,ビリビリする
6,30,4287323,88,批判,メス,人を傷つける,鋭い,痛いところをつく
7,33,4287323,14,暴動,嵐,激しい,人を巻き込む,突然起こる
8,37,4287323,38,蝶,踊り子,美しい,舞う,華やかだ
9,41,4287323,81,夕日,銅貨,丸い,輝いている,赤い


In [13]:
#strip yomigana (the reading given in full-width parentheses) from a Japanese word
#ref1: https://teratail.com/questions/210373
#ref2: https://takake-blog.com/python-regular-expression/
#ref3: https://uxmilk.jp/8662

# Non-greedy match of one full-width parenthesised span.
reg = '（.+?）'
re.sub(reg, "", "牢獄（ろうごく）")

'牢獄'

In [15]:
REG = '（.+?）'

def remove_brackets(text, reg=REG):
    """
    Remove yomigana (readings written in full-width parentheses) from text.

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` values are processed.
    reg : str, optional
        Regular expression matching the span to delete. Defaults to ``REG``.

    Returns
    -------
    object
        ``text`` with every match of ``reg`` removed when ``text`` is a
        ``str``; otherwise ``text`` itself, unchanged.

    Raises
    ------
    re.error
        If ``text`` is a ``str`` and ``reg`` is not a valid regular expression.
    """
    # Non-string values (e.g. NaN cells coming from pandas) are passed through
    # untouched. This replaces a bare ``except:`` that hid every error,
    # including KeyboardInterrupt and invalid-pattern mistakes.
    if not isinstance(text, str):
        return text

    return re.sub(pattern=reg, repl="", string=text)

#test
remove_brackets("牢獄（ろうごく）")

'牢獄'

In [16]:
# Strip yomigana from every word column in one pass, then move the old row
# labels into an "index" column via reset_index().
topic_vehicle_features_df = topic_vehicle_features_df.assign(
    **{
        column: topic_vehicle_features_df[column].map(remove_brackets)
        for column in ("Topic", "Vehicle", "F1", "F2", "F3")
    }
).reset_index()
topic_vehicle_features_df

Unnamed: 0,index,cwid,NID,Topic,Vehicle,F1,F2,F3
0,12,4287323,26,真珠,水滴,美しい,丸い,透き通っている
1,13,4287323,63,時間,洪水,止められない,流れる,逆らえない
2,25,4287323,86,学校,工場,人がたくさんいる,画一的だ,生産する
3,26,4287323,80,煙草,時限爆弾,寿命を縮める,危険だ,燃える
4,27,4287323,39,研究,登山,険しい道のり,達成感がある,苦しい
5,28,4287323,12,衝撃,電気,一瞬の出来事だ,突然来る,ビリビリする
6,30,4287323,88,批判,メス,人を傷つける,鋭い,痛いところをつく
7,33,4287323,14,暴動,嵐,激しい,人を巻き込む,突然起こる
8,37,4287323,38,蝶,踊り子,美しい,舞う,華やかだ
9,41,4287323,81,夕日,銅貨,丸い,輝いている,赤い


##### 2.3.1 calculate Euclidean distances
- ref: https://pytorch.org/docs/stable/generated/torch.cdist.html

In [17]:
def calc_euclidian_distance(sentence1, sentence2):
    """
    Return the Euclidean distance between the embeddings of two sentences.

    Each sentence is wrapped in a one-element list and encoded with the
    notebook-level ``model``; the distance is read from ``torch.cdist``.

    Parameters
    ----------
    sentence1, sentence2 : str
        Texts to compare.

    Returns
    -------
    float
        The distance as a Python float.
    """
    # Encode sentence1 first, then sentence2, each as its own batch.
    vec_a, vec_b = (
        model.encode([text], convert_to_tensor=True)
        for text in (sentence1, sentence2)
    )

    # Take the first entry of the pairwise-distance matrix and convert it
    # to a plain Python number.
    return torch.cdist(vec_a, vec_b)[0][0].item()

#test
calc_euclidian_distance("日本語は難しい", "日本語は難解だ")

7.515835762023926

In [18]:
# Short label used in the output column name -> source column holding the text.
WORD_COLUMNS = {"T": "Topic", "V": "Vehicle", "F1": "F1", "F2": "F2", "F3": "F3"}

# One distance column per unordered pair of word columns. combinations() yields
# the pairs in the order T_V, T_F1, T_F2, T_F3, V_F1, V_F2, V_F3, F1_F2, F1_F3,
# F2_F3, so the columns are created in the same order as before.
# Pairing values with zip() avoids positional integer indexing (x[0], x[1]) on
# a label-indexed row Series, which recent pandas versions deprecate.
for (label_a, column_a), (label_b, column_b) in itertools.combinations(WORD_COLUMNS.items(), 2):
    topic_vehicle_features_df[f"EucDistBert_{label_a}_{label_b}"] = [
        calc_euclidian_distance(text_a, text_b)
        for text_a, text_b in zip(
            topic_vehicle_features_df[column_a],
            topic_vehicle_features_df[column_b],
        )
    ]

topic_vehicle_features_df

Unnamed: 0,index,cwid,NID,Topic,Vehicle,F1,F2,F3,EucDistBert_T_V,EucDistBert_T_F1,EucDistBert_T_F2,EucDistBert_T_F3,EucDistBert_V_F1,EucDistBert_V_F2,EucDistBert_V_F3,EucDistBert_F1_F2,EucDistBert_F1_F3,EucDistBert_F2_F3
0,12,4287323,26,真珠,水滴,美しい,丸い,透き通っている,9.875226,12.486851,9.280494,14.90414,13.240928,11.608192,13.915242,11.098179,13.887902,15.110683
1,13,4287323,63,時間,洪水,止められない,流れる,逆らえない,10.010055,14.054848,10.034571,13.772678,12.49616,8.422661,12.470832,12.682328,11.764668,12.700509
2,25,4287323,86,学校,工場,人がたくさんいる,画一的だ,生産する,6.92751,17.43231,12.548472,11.047716,17.470016,13.04873,10.70139,14.832829,16.516108,12.112259
3,26,4287323,80,煙草,時限爆弾,寿命を縮める,危険だ,燃える,11.86595,13.1639,10.417119,8.354392,12.190483,12.9901,12.521012,13.729238,13.812178,11.070451
4,27,4287323,39,研究,登山,険しい道のり,達成感がある,苦しい,9.417645,14.179992,13.092621,14.367789,13.073853,13.416916,15.106904,12.988675,16.190887,15.521581
5,28,4287323,12,衝撃,電気,一瞬の出来事だ,突然来る,ビリビリする,8.82713,12.939189,10.81779,14.287148,13.102613,11.336025,13.758604,10.907162,13.196715,11.98499
6,30,4287323,88,批判,メス,人を傷つける,鋭い,痛いところをつく,10.392945,13.400344,13.609049,13.841883,13.61032,13.913806,14.620938,14.863021,11.655198,14.40244
7,33,4287323,14,暴動,嵐,激しい,人を巻き込む,突然起こる,8.507006,10.549999,12.308897,9.946824,9.850345,13.117043,9.561074,14.072675,10.805176,11.853616
8,37,4287323,38,蝶,踊り子,美しい,舞う,華やかだ,10.047037,10.94115,9.685325,11.07678,13.340741,10.625051,11.693384,11.451913,12.084574,10.592228
9,41,4287323,81,夕日,銅貨,丸い,輝いている,赤い,12.625702,11.92107,14.608787,11.198457,11.960226,14.937016,12.315863,13.598611,8.247584,14.013779


In [30]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a fresh checkout may not have ../result yet).
OUTPUT_CSV = Path("../result/words_to_bert_edited_with_euclid_distance.csv")
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
topic_vehicle_features_df.to_csv(OUTPUT_CSV, index=False)