# Setup

In [3]:
import pandas as pd

In [4]:
import numpy as np

In [5]:
import gensim

# Tokenization
We will focus on specific kinds of part-of-speech (POS), i.e.,
- Adjectives
- Verbs and nouns that form verbs when "suru" is added as a suffix
- nouns

## POS tagging

In [6]:
!mecab -o ./Results/output01_self.txt.mecab ./Results/output01_self.txt
!mecab -o ./Results/output01_target.txt.mecab ./Results/output01_target.txt

## Morphological Analysis

In [7]:
def my_morphol(filename):
	sentences = []
	sentence = []
	previous_line = ''
	MID = -1
	with open(filename, mode='r') as f:
		for line in f:
			# headerに相当する最初の行をスキップ
			# 文頭 or 文中
			if line != 'EOS\n':
				fields = line.split('\t')
				attr = fields[1].split(',')
				# 変数名行
				if previous_line == '':
					continue
				# MID行
				elif fields[0] == 'MID':
					continue
				# 文頭
				elif previous_line == 'EOS\n' and attr[1] == '数':
					MID = int(fields[0])
					sentence.append(MID)
				# 文中
				else:
					morph = {'surface': fields[0], 'base': attr[6], 'pos': attr[0], 'pos1': attr[1]}
					sentence.append(morph)
			# 文末（EOS行）
			else:
				if MID > 0 and len(sentence) > 0:
					sentences.append(sentence)
					sentence = []
					MID = -1
				elif MID > 0 and len(sentence) == 0:
					sentences.append([])
					sentence = []
					MID = -1

			previous_line = line

	return sentences

In [8]:
# SELF
filename = './Results/output01_self.txt.mecab'
sentences_self = my_morphol(filename)

In [9]:
sentences_self[0]

[2,
 {'surface': '.', 'base': '*\n', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surface': '0', 'base': '*\n', 'pos': '名詞', 'pos1': '数'},
 {'surface': '回答', 'base': '回答', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surface': '梨', 'base': '梨', 'pos': '名詞', 'pos1': '一般'}]

In [10]:
len(sentences_self)

479

In [11]:
# TARGET
filename = './Results/output01_target.txt.mecab'
sentences_target = my_morphol(filename)

In [12]:
sentences_target[0]

[2,
 {'surface': '.', 'base': '*\n', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surface': '0', 'base': '*\n', 'pos': '名詞', 'pos1': '数'}]

In [13]:
len(sentences_target)

479

## Tokenization

In [14]:
def my_tokenization(sentences):
	sentences_token = []
	sentence_token = []
	for sentence in sentences:
		# sentence[0] = MID
		sentence_token.append(sentence[0])

		for morph in sentence[1:]:
			# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
			# Words for Traits
			if morph['pos'] == '形容詞' and morph['pos1'] == '自立':
				sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '形容動詞語幹':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == 'ナイ形容詞語幹':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])

			# Words for Behaviors
			elif morph['pos'] == '動詞' and morph['pos1'] == '自立':
				sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == 'サ変接続':
				if morph['base'] != '*\n':	# e.g, {'surface': '､', 'base': '*\n', 'pos': '名詞', 'pos1': 'サ変接続'}
					sentence_token.append(morph['base'])

			# Words for Stereotype etc
			elif morph['pos'] == '名詞' and morph['pos1'] == '一般':
				if morph['base'] != '*\n':	# e.g., {'surface': 'キャパオーバー', 'base': '*\n', 'pos': '名詞', 'pos1': '一般'},
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '固有名詞':
				if morph['base'] != '*\n':	# {'surface': 'k', 'base': '*\n', 'pos': '名詞', 'pos1': '固有名詞'}
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '代名詞':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])

		sentences_token.append(sentence_token)
		sentence_token = []

	return sentences_token

In [None]:
# def my_tokenization(sentences):
# 	sentences_token = []
# 	sentence_token = []
# 	for sentence in sentences:
# 		# sentence[0] = MID
# 		sentence_token.append(sentence[0])
# 
# 		for morph in sentence[1:]:
# 			# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
# 			# Words for Traits
# 			if morph['pos'] == '形容詞' and morph['pos1'] == '自立': # pos = 形容詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '形容動詞語幹': # pos = 名詞, pos1 = 形容動詞語幹
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'ナイ形容詞語幹': # pos = 名詞, pos1 = ナイ形容詞語幹
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Behaviors
# 			elif morph['pos'] == '動詞' and morph['pos1'] == '自立': # pos = 動詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'サ変接続': # pos = 名詞, pos1 = サ変接続, 
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Stereotype etc
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '一般': # pos = 名詞, 一般
# 				if morph['base'] == '*\n': # e.g., {'surface': 'キャパオーバー', 'base': '*\n', 'pos': '名詞', 'pos1': '一般'},
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '固有名詞': # pos = 名詞, 固有名詞
# 				if morph['base'] == '*\n': # {'surface': 'k', 'base': '*\n', 'pos': '名詞', 'pos1': '固有名詞'}
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '代名詞': # pos = 名詞, 代名詞
# 				sentence_token.append(morph['base'])
# 
# 		sentences_token.append(sentence_token)
# 		sentence_token = []
# 
# 	return sentences_token

In [15]:
# SELF
sentences_token_self = my_tokenization(sentences_self)

In [16]:
sentences_token_self[0]

[2, '回答', '梨']

In [17]:
# TARGET
sentences_token_target = my_tokenization(sentences_target)

In [18]:
sentences_token_target[0]

[2]

# Indices

## Word Vec Similarity (self vs. target)

### Preparing a word vec model

In [19]:
from gensim.models import Word2Vec
model_path = '../../Materials/word2vec.gensim.model'
model = Word2Vec.load(model_path)

In [20]:
# 確認：類似語
model.wv.most_similar(positive=['Social'], topn=10)

[('Economic', 0.9120144248008728),
 ('Organization', 0.9098771810531616),
 ('science', 0.9062338471412659),
 ('Law', 0.9048188924789429),
 ('Studies', 0.9007666110992432),
 ('Education', 0.8942281007766724),
 ('Political', 0.8923444747924805),
 ('Society', 0.8902304172515869),
 ('Science', 0.8886737823486328),
 ('Medicine', 0.8862507939338684)]

In [21]:
# 確認：分散表現
word_vec = model.wv[u'単語']
print(word_vec)
print(np.transpose(word_vec).shape)

[-0.0544568   0.13679808 -0.35749108  0.05034312 -0.018448    0.15091987
 -0.12394528 -0.09055351 -0.20597099 -0.1876517   0.1110284   0.07684731
 -0.07806271 -0.0162644  -0.18043248  0.10543583  0.19625992  0.05441505
 -0.41463816  0.29697278  0.11950846  0.08052836 -0.09025036  0.02078868
  0.16672397 -0.19404823  0.08641643  0.09545647 -0.06334688 -0.12846425
  0.05050173 -0.10663079  0.1275091   0.09031986  0.09797987  0.05163022
  0.0304911   0.02613543  0.17335036 -0.18157065  0.0181381   0.02991033
  0.24255605  0.07176003  0.03419382  0.13056698 -0.03153648 -0.09767581
  0.05309673  0.09953102]
(50,)


### Averaged word vec for self

In [22]:
# Compute averaged vectors for SELF
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_self):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_self = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_self']).T
word_vec_df_self.set_index('MID', inplace=True)

KeyError: MID = 47, idx = 18,  "Key 'はなしかける' not present"
KeyError: MID = 85, idx = 33,  "Key '取りえ' not present"
KeyError: MID = 92, idx = 36,  "Key '弁える' not present"
KeyError: MID = 112, idx = 45,  "Key '誘える' not present"
KeyError: MID = 122, idx = 50,  "Key 'しれる' not present"
KeyError: MID = 136, idx = 59,  "Key '煮えくり返る' not present"
KeyError: MID = 169, idx = 75,  "Key 'めんどい' not present"
KeyError: MID = 196, idx = 91,  "Key 'おっくう' not present"
KeyError: MID = 205, idx = 95,  "Key '思いだす' not present"
KeyError: MID = 214, idx = 98,  "Key 'しなう' not present"
KeyError: MID = 225, idx = 103,  "Key '尽くせる' not present"
KeyError: MID = 225, idx = 103,  "Key '尽くせる' not present"
KeyError: MID = 225, idx = 103,  "Key '尽くせる' not present"
KeyError: MID = 267, idx = 127,  "Key 'なげる' not present"
KeyError: MID = 273, idx = 131,  "Key 'いらち' not present"
KeyError: MID = 309, idx = 151,  "Key 'ことわれる' not present"
KeyError: MID = 309, idx = 151,  "Key 'まっとうする' not present"
KeyError: MID = 320, idx = 

In [23]:
word_vec_df_self.head()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
2,"[-0.12554576, -0.009165149, -0.12818073, -0.08..."
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009..."
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072..."
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890..."
11,"[0.08219984, 0.03136838, 0.149636, 0.06478363,..."


In [24]:
word_vec_df_self.tail()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
920,"[0.07969623, -0.03585144, 0.0949019, -0.020791..."
921,"[0.11347691, 0.0692842, 0.0929859, -0.01711248..."
922,"[-0.039919835, 0.022903582, 0.13555361, 0.0990..."
923,"[0.086014, -0.08224651, 0.10696781, 0.04868111..."
924,"[0.074476875, 0.0024441672, 0.101361, 0.010703..."


In [25]:
len(word_vec_df_self)

479

### Averaged word vec for target

In [26]:
# Compute averaged vectors for TARGET
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_target):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_target = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_target']).T
word_vec_df_target.set_index('MID', inplace=True)

KeyError: MID = 30, idx = 9,  "Key 'でしゃばる' not present"
KeyError: MID = 40, idx = 14,  "Key 'かんじる' not present"
KeyError: MID = 64, idx = 22,  "Key '少い' not present"
KeyError: MID = 78, idx = 28,  "Key 'しれる' not present"
KeyError: MID = 169, idx = 75,  "Key 'しれる' not present"
KeyError: MID = 193, idx = 89,  "Key 'お互い様' not present"
KeyError: MID = 202, idx = 94,  "Key 'いけ好かない' not present"
KeyError: MID = 284, idx = 138,  "Key '面倒い' not present"
KeyError: MID = 312, idx = 154,  "Key 'やりこなす' not present"
KeyError: MID = 333, idx = 168,  "Key 'あたためる' not present"
KeyError: MID = 338, idx = 170,  "Key 'かんじる' not present"
KeyError: MID = 341, idx = 172,  "Key 'かんじる' not present"
KeyError: MID = 342, idx = 173,  "Key 'そこねる' not present"
KeyError: MID = 354, idx = 180,  "Key '気安い' not present"
KeyError: MID = 359, idx = 183,  "Key 'やりとげる' not present"
KeyError: MID = 432, idx = 222,  "Key 'しれる' not present"
KeyError: MID = 432, idx = 222,  "Key '思い切る' not present"
KeyError: MID = 435, idx = 

In [27]:
word_vec_df_target.head()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
2.0,
5.0,"[0.006501836, 0.06303416, 0.18272737, 0.021416..."
7.0,"[0.20633665, -0.07933207, 0.15059035, -0.01459..."
8.0,"[0.12936483, 0.014520218, -0.005053561, -0.024..."
11.0,


In [28]:
word_vec_df_target.tail()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
920,"[0.03636287, -0.050786536, 0.035727605, 0.0401..."
921,"[0.0512184, -0.032259293, 0.0640789, -0.060070..."
922,"[-0.052868187, 0.00424375, 0.15772448, 0.01693..."
923,"[0.08895031, -0.031109171, 0.096120544, 0.0478..."
924,"[0.04499862, 0.028445652, 0.08161835, 0.033476..."


In [29]:
len(word_vec_df_target)

479

### Merge the two dataframes (self and target)

In [30]:
word_vec_df_both = pd.merge(word_vec_df_self, word_vec_df_target, on='MID', how='outer', indicator=True)

In [31]:
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,"[-0.12554576, -0.009165149, -0.12818073, -0.08...",,both
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both
11,"[0.08219984, 0.03136838, 0.149636, 0.06478363,...",,both


In [32]:
word_vec_df_both.loc[word_vec_df_both['_merge'] != 'both', :]

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Word Vec Similarity between Self and Target
cf. https://wakame-msds.com/similarity/
- Euclidean Distance: the curse of dimensionality
- Manhattan Distance: better than the Euclidean
- Cosine Similarity: does not take accoung or the magnitude of each vector (only directions)

In [33]:
# Remove NaN
word_vec_df_both = word_vec_df_both.dropna()
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both
24,"[0.055738714, 0.007035163, 0.107713476, 0.1542...","[0.14732212, 0.024135005, 0.045808703, 0.07718...",both
27,"[0.097340696, -0.0072463155, 0.11122257, 0.002...","[0.07632324, 0.00717866, 0.11950432, -0.000946...",both


In [34]:
# Euclidean Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
import math

def euclidean_distance(x, y):
    if type(x) == np.ndarray and type(x == np.ndarray):
        return math.sqrt(sum(pow(xi - yi, 2) for xi, yi in zip(x, y)))

def euclidean_distance_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [euclidean_distance(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['euclidean_distance'] = euclidean_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

  if type(x) == np.ndarray and type(x == np.ndarray):


Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both,0.309452
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both,0.391612
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both,1.097579
24,"[0.055738714, 0.007035163, 0.107713476, 0.1542...","[0.14732212, 0.024135005, 0.045808703, 0.07718...",both,0.568951
27,"[0.097340696, -0.0072463155, 0.11122257, 0.002...","[0.07632324, 0.00717866, 0.11950432, -0.000946...",both,0.293323


In [35]:
# Manhattan Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def manhattan_distance(x, y):
    return sum(abs(xi - yi) for xi, yi in zip(x, y))

def manhattan_distance_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [manhattan_distance(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['manhattan_distance'] = manhattan_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both,0.309452,1.766482
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both,0.391612,2.110458
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both,1.097579,6.200497
24,"[0.055738714, 0.007035163, 0.107713476, 0.1542...","[0.14732212, 0.024135005, 0.045808703, 0.07718...",both,0.568951,3.310129
27,"[0.097340696, -0.0072463155, 0.11122257, 0.002...","[0.07632324, 0.00717866, 0.11950432, -0.000946...",both,0.293323,1.612408


In [36]:
# Cosine Similairty
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def square_rooted(x):
    return round(math.sqrt(sum([xi*xi for xi in x])), 3)

def cosine_similarity(x, y):
    numerator = sum(xi*yi for xi, yi in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator / float(denominator), 3)

def cosine_similarity_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [cosine_similarity(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['cosine_similarity'] = cosine_similarity_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both,0.309452,1.766482,0.918
7,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both,0.391612,2.110458,0.748
8,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both,1.097579,6.200497,0.398
24,"[0.055738714, 0.007035163, 0.107713476, 0.1542...","[0.14732212, 0.024135005, 0.045808703, 0.07718...",both,0.568951,3.310129,0.775
27,"[0.097340696, -0.0072463155, 0.11122257, 0.002...","[0.07632324, 0.00717866, 0.11950432, -0.000946...",both,0.293323,1.612408,0.878


# Transformation
High IOS group (5, 6, 7) and Low IOS group (1, 2, and 3)

In [38]:
df = pd.read_csv('./Results/output01_all.csv', header=0, index_col=0)
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_activeness_score,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,2024/02/16-12:21:18,2024/02/16-12:24:31,193.0,1.0,45.0,28.0,1.0,5.0,3.0,2.0,...,5.0,2.0,6.0,5.0,1.0,4.0,7.0,6.0,COMP,5.0
5.0,2024/02/16-12:23:21,2024/02/16-12:25:38,137.0,1.0,36.0,28.0,1.0,2.0,4.0,2.0,...,4.0,2.0,5.0,3.0,6.0,2.0,5.0,3.0,COMP,9.0
7.0,2024/02/16-12:20:52,2024/02/16-12:25:45,293.0,1.0,54.0,12.0,1.0,5.0,1.0,2.0,...,5.0,3.0,1.0,7.0,1.0,6.0,6.0,2.0,COMP,5.0
8.0,2024/02/16-12:24:28,2024/02/16-12:26:11,103.0,1.0,36.0,14.0,1.0,1.0,1.0,2.0,...,3.0,3.0,1.0,7.0,1.0,5.0,4.0,7.0,COMP,1.0
11.0,2024/02/16-12:23:46,2024/02/16-12:26:26,160.0,1.0,56.0,13.0,1.0,2.0,1.0,2.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,COMP,2.0


In [39]:
HighIOS_indices = df['IOS_score'] > 4
LowIOS_indices = df['IOS_score'] < 4

In [40]:
# df にIOS_group列を追加
df['IOS_group'] = ['NA'] * len(df)
df.loc[HighIOS_indices, ['IOS_group']] = 'HighIOS'
df.loc[LowIOS_indices, ['IOS_group']] = 'LowIOS'
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID,IOS_group
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,2024/02/16-12:21:18,2024/02/16-12:24:31,193.0,1.0,45.0,28.0,1.0,5.0,3.0,2.0,...,2.0,6.0,5.0,1.0,4.0,7.0,6.0,COMP,5.0,HighIOS
5.0,2024/02/16-12:23:21,2024/02/16-12:25:38,137.0,1.0,36.0,28.0,1.0,2.0,4.0,2.0,...,2.0,5.0,3.0,6.0,2.0,5.0,3.0,COMP,9.0,LowIOS
7.0,2024/02/16-12:20:52,2024/02/16-12:25:45,293.0,1.0,54.0,12.0,1.0,5.0,1.0,2.0,...,3.0,1.0,7.0,1.0,6.0,6.0,2.0,COMP,5.0,HighIOS
8.0,2024/02/16-12:24:28,2024/02/16-12:26:11,103.0,1.0,36.0,14.0,1.0,1.0,1.0,2.0,...,3.0,1.0,7.0,1.0,5.0,4.0,7.0,COMP,1.0,LowIOS
11.0,2024/02/16-12:23:46,2024/02/16-12:26:26,160.0,1.0,56.0,13.0,1.0,2.0,1.0,2.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,COMP,2.0,LowIOS


# Output

In [None]:
df.loc[:, ['IOS_score', 'IOS_group']].head()

In [41]:
# overlap_magnitude_df と df.loc[:, ['IOS_score', 'IOS_group']] をマージ（key=MID)
out_df = pd.merge(df.loc[:, ['IOS_score', 'IOS_group']], word_vec_df_both, on='MID')
# # さらにword_vec_df_bothをマージ
# out_df = pd.merge(out_df, word_vec_df_both, on='MID')

out_df.head()

Unnamed: 0_level_0,IOS_score,IOS_group,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5.0,2.0,LowIOS,"[-0.017000489, 0.021428036, 0.16119394, -0.009...","[0.006501836, 0.06303416, 0.18272737, 0.021416...",both,0.309452,1.766482,0.918
7.0,5.0,HighIOS,"[0.038041864, 0.046753965, 0.08819873, 0.00072...","[0.20633665, -0.07933207, 0.15059035, -0.01459...",both,0.391612,2.110458,0.748
8.0,1.0,LowIOS,"[-0.06192059, -0.11488108, 0.11417292, -0.0890...","[0.12936483, 0.014520218, -0.005053561, -0.024...",both,1.097579,6.200497,0.398
24.0,4.0,,"[0.055738714, 0.007035163, 0.107713476, 0.1542...","[0.14732212, 0.024135005, 0.045808703, 0.07718...",both,0.568951,3.310129,0.775
27.0,1.0,LowIOS,"[0.097340696, -0.0072463155, 0.11122257, 0.002...","[0.07632324, 0.00717866, 0.11950432, -0.000946...",both,0.293323,1.612408,0.878


In [42]:
# # overlap_magnitude_df と df.loc[:, ['MID', 'IOS_group']] をマージ（key=MID)
# out_df = pd.merge(overlap_magnitude_df, df.loc[:, ['MID', 'IOS_score', 'IOS_group']], on='MID')

filename = './Results/output02.csv'
out_df.to_csv(filename, index=True)

In [44]:
# 確認用
sentences_token_self_df = pd.DataFrame(sentences_token_self).rename(columns={0: 'MID'})
sentences_token_self_df = sentences_token_self_df.set_index('MID')
sentences_token_self_df.to_csv('./Results/sentences_token_self_df.csv', index=False)

sentences_token_target_df = pd.DataFrame(sentences_token_target).rename(columns={0: 'MID'})
sentences_token_target_df = sentences_token_target_df.set_index('MID')
sentences_token_target_df.to_csv('./Results/sentences_token_target_df.csv', index=False)