#  Setup

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import gensim
# https://appdividend.com/2023/03/22/valueerror-numpy-ndarray-size-changed-may-indicate-binary-incompatibility/
# If you got the "ValueError: numpy.ndarray size changed ...," try one of those:
# [1] pip install --user (if needed) --upgrade numpy==1.22.4 (A NumPy version >=1.16.5 and <1.23.0 is required.)
# [2] conda install numpy=1.22.3 (numpy=1.22.4 was not found in anaconda)

# Tokenization
We will focus on specific kinds of part-of-speech (POS), i.e.,
- Adjectives
- Verbs and nouns that form verbs when "suru" is added as a suffix
- nouns

## POS tagging

In [4]:
!mecab -o ./output01_self.txt.mecab ./output01_self.txt
!mecab -o ./output01_target.txt.mecab ./output01_target.txt

## Morphological Analysis

In [5]:
def my_morphol(filename):
	sentences = []
	sentence = []
	MID = -1
	with open(filename, mode='r') as f:
		# WANT: headerに相当する最初の行をスキップ
		line = f.readline()
		while line != 'EOS\n':
			line = f.readline()
		line = f.readline() # EOS

		# WANT: MIDに相当する２行目をスキップ
		line = f.readline()
		while line != 'EOS\n':
			line = f.readline()
		previous_line = line # EOS

		for line in f:
			fields = line.split('\t')
			# 空行をスキップ
			if line == '\n' and len(fields) == 1:
				continue
			# 文頭 or 文中
			elif line != 'EOS\n':
				attr = fields[1].split(',')
				# # 変数名行
				# if previous_fields[0] == '\n':
				# 	continue
				# # MID行
				# elif fields[0] == 'MID':
				# 	continue
				# 文頭
				if previous_line == 'EOS\n' and attr[1] == '数':
					MID = int(fields[0])
					sentence.append(MID)
				# 文中
				else:
					morph = {'surface': fields[0], 'base': attr[6], 'pos': attr[0], 'pos1': attr[1]}
					sentence.append(morph)
			# 文末（EOS行）
			else:
				if MID > 0 and len(sentence) > 0:
					sentences.append(sentence)
					sentence = []
					MID = -1
				elif MID > 0 and len(sentence) == 0:
					sentences.append([])
					sentence = []
					MID = -1

			previous_line = line

	return sentences

In [6]:
# SELF
filename = './output01_self.txt.mecab'
sentences_self = my_morphol(filename)

In [7]:
sentences_self[1]

[2,
 {'surface': '相手', 'base': '相手', 'pos': '名詞', 'pos1': '一般'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': '悪気', 'base': '悪気', 'pos': '名詞', 'pos1': '一般'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': 'なかっ', 'base': 'ない', 'pos': '形容詞', 'pos1': '自立'},
 {'surface': 'たら', 'base': 'た', 'pos': '助動詞', 'pos1': '*'},
 {'surface': '仕方', 'base': '仕方', 'pos': '名詞', 'pos1': 'ナイ形容詞語幹'},
 {'surface': 'ない', 'base': 'ない', 'pos': '助動詞', 'pos1': '*'},
 {'surface': 'ので', 'base': 'ので', 'pos': '助詞', 'pos1': '接続助詞'},
 {'surface': '別け隔て', 'base': '別け隔て', 'pos': '名詞', 'pos1': '一般'},
 {'surface': 'なく', 'base': 'ない', 'pos': '形容詞', 'pos1': '自立'},
 {'surface': '接し', 'base': '接す', 'pos': '動詞', 'pos1': '自立'},
 {'surface': 'てる', 'base': 'てる', 'pos': '動詞', 'pos1': '非自立'},
 {'surface': 'つもり', 'base': 'つもり', 'pos': '名詞', 'pos1': '非自立'},
 {'surface': 'な', 'base': 'だ', 'pos': '助動詞', 'pos1': '*'},
 {'surface': 'ので', 'base': 'ので', 'pos': '助詞', 'pos1': '接続助詞'},
 {'su

In [8]:
# TARGET
filename = './output01_target.txt.mecab'
sentences_target = my_morphol(filename)

In [9]:
sentences_target[0]

[1,
 {'surface': '怒ら', 'base': '怒る', 'pos': '動詞', 'pos1': '自立'},
 {'surface': 'ない', 'base': 'ない', 'pos': '助動詞', 'pos1': '*'},
 {'surface': '優し', 'base': '優しい', 'pos': '形容詞', 'pos1': '自立'},
 {'surface': 'そう', 'base': 'そう', 'pos': '名詞', 'pos1': '接尾'},
 {'surface': 'で', 'base': 'だ', 'pos': '助動詞', 'pos1': '*'},
 {'surface': 'お人好し', 'base': 'お人好し', 'pos': '名詞', 'pos1': '一般'},
 {'surface': '仕事', 'base': '仕事', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surface': 'を', 'base': 'を', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': '抱え込む', 'base': '抱え込む', 'pos': '動詞', 'pos1': '自立'},
 {'surface': 'チェック', 'base': 'チェック', 'pos': '名詞', 'pos1': 'サ変接続'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': '細かい', 'base': '細かい', 'pos': '形容詞', 'pos1': '自立'},
 {'surface': '基本', 'base': '基本', 'pos': '名詞', 'pos1': '一般'},
 {'surface': '受け身', 'base': '受け身', 'pos': '名詞', 'pos1': '一般'},
 {'surface': '話', 'base': '話', 'pos': '名詞', 'pos1': '接尾'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'s

## Tokenization

In [10]:
def my_tokenization(sentences):
	sentences_token = []
	sentence_token = []
	for sentence in sentences:
		# sentence[0] = MID
		sentence_token.append(sentence[0])

		for morph in sentence[1:]:
			# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
			# Words for Traits
			if morph['pos'] == '形容詞' and morph['pos1'] == '自立':
				sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '形容動詞語幹':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == 'ナイ形容詞語幹':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])

			# Words for Behaviors
			elif morph['pos'] == '動詞' and morph['pos1'] == '自立':
				sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == 'サ変接続':
				if morph['base'] != '*\n':	# e.g, {'surface': '､', 'base': '*\n', 'pos': '名詞', 'pos1': 'サ変接続'}
					sentence_token.append(morph['base'])

			# Words for Stereotype etc
			elif morph['pos'] == '名詞' and morph['pos1'] == '一般':
				if morph['base'] != '*\n':	# e.g., {'surface': 'キャパオーバー', 'base': '*\n', 'pos': '名詞', 'pos1': '一般'},
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '固有名詞':
				if morph['base'] != '*\n':	# {'surface': 'k', 'base': '*\n', 'pos': '名詞', 'pos1': '固有名詞'}
					sentence_token.append(morph['base'])
			elif morph['pos'] == '名詞' and morph['pos1'] == '代名詞':
				if morph['base'] != '*\n':
					sentence_token.append(morph['base'])

		sentences_token.append(sentence_token)
		sentence_token = []

	return sentences_token

In [11]:
# SELF
sentences_token_self = my_tokenization(sentences_self)

In [12]:
sentences_token_self[0]

[1,
 '怒る',
 'ない',
 '道',
 '聞く',
 '世話好き',
 'リーダー',
 '肌',
 '計画',
 'する',
 '動く',
 '行動',
 'ある',
 '友達',
 'いる']

In [13]:
# TARGET
sentences_token_target = my_tokenization(sentences_target)

In [14]:
sentences_token_target[0]

[1, '怒る', '優しい', 'お人好し', '仕事', '抱え込む', 'チェック', '細かい', '基本', '受け身', '下手']

# Indices

## Overlapping magnitude

### An unoriented incidnece matrix A
- unique words in rows
- self and target in columns (self in the first column; target in the second column)
- When word w_i represented in i-th row, is used to describe the self, the corresponding element a_i1 is 1, otherwise, 0.

### A self-other overlap matrix
tranpose(A) * A

### Overlapping magnitude
n_12 / (n_11 + n_22 - n_12)

In [15]:
mid_list = []
wc_self = []
wc_target = []
wc_intercept = []
overlap_magnitude_list = []
for tokens_self, tokens_target in zip(sentences_token_self, sentences_token_target):
	
	try:
		tokens_self[0] == tokens_target[0] # compare the MIDs
	except:
		print('The MIDs do not match: the self mid is ' + str(tokens_self[0]) + ', while the target mid is ' + str(tokens_target[0]))
	else:

		if len(tokens_self) > 1 and len(tokens_target) > 1:
			# Create a dictionary
			d = {
				'word': tokens_self[1:] + tokens_target[1:],
				'cond': ['self'] * (len(tokens_self[1:])) + ['target'] * (len(tokens_target[1:]))
			}
			# Create a dataframe
			df = pd.DataFrame(data=d)
			# Create a crosstab
			ctab = pd.crosstab(df['word'], df['cond'])

			# Create an Unoriented Incidence Matrix
			ctab_bin = ctab > 0		# 0 for 0, otherwise (>0) 1
			incidence_matrix = ctab_bin * 1 # covnert (True, False) to (1, 0)
	
			# Create a Co-Membership Matrix
			comembership_matrix = incidence_matrix.T.dot(incidence_matrix)

			# Compute an Overlap Magnitude
			n11 = comembership_matrix.loc['self', 'self']
			n22 = comembership_matrix.loc['target', 'target']
			n12 = comembership_matrix.loc['self', 'target']
			overlap_magnitude = n12 / (n11 + n22 - n12)
		elif len(tokens_self) == 1 and len(tokens_target) > 1: # no words for self except for MID
			# Compute an Overlap Magnitude
			n11 = 0
			n22 = len(set(tokens_target[1:]))
			n12 = 0
			overlap_magnitude = n12 / (n11 + n22 - n12)
		elif len(tokens_self) > 1 and len(tokens_target) == 1: # no words for target except for MID
			# Compute an Overlap Magnitude
			n11 = len(set(tokens_self[1:]))
			n22 = 0
			n12 = 0
			overlap_magnitude = n12 / (n11 + n22 - n12)
		elif len(tokens_self) == 1 and len(tokens_target) == 1:
			# Compute an Overlap Magnitude
			n11 = 0
			n22 = 0
			n12 = 0
			overlap_magnitude = np.nan
		else:
			# will be replaced with NaN in a DataFrame
			overlap_magnitude = np.nan 

		mid_list.append(tokens_self[0])
		wc_self.append(n11)
		wc_target.append(n22)
		wc_intercept.append(n12)
		overlap_magnitude_list.append(overlap_magnitude)

In [16]:
overlap_magnitude_df = pd.DataFrame({
	'MID': mid_list,
	'wc_self': wc_self,
	'wc_target': wc_target,
	'wc_intercept': wc_intercept,
	'Overlap_score': overlap_magnitude_list})
overlap_magnitude_df = overlap_magnitude_df.set_index('MID')
len(overlap_magnitude_df)

138

In [17]:
overlap_magnitude_df.head()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,14,10,1,0.043478
2,15,12,3,0.125
3,28,0,0,0.0
4,13,15,3,0.12
5,28,18,6,0.15


In [18]:
overlap_magnitude_df.tail()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
136,27,19,6,0.15
137,1,0,0,0.0
138,13,1,0,0.0
139,20,13,2,0.064516
140,15,8,1,0.045455


## Word Vec Similarity (self vs. target)

### Preparing a word vec model
- [Word2Vec](https://code.google.com/archive/p/word2vec/)
- [GloVe](http://nlp.stanford.edu/projects/glove/)
- [fastText](https://drive.google.com/open?id=0ByFQ96A4DgSPNFdleG1GaHcxQzA)
[いますぐ使える単語埋め込みベクトルのリスト](https://qiita.com/Hironsan/items/8f7d35f0a36e0f99752c)

In [19]:
from gensim.models import Word2Vec
model_path = '../../Materials/word2vec.gensim.model'
model = Word2Vec.load(model_path)

In [20]:
# 確認１：類似語
model.wv.most_similar(positive=['Social'], topn=10)

[('Economic', 0.9120144248008728),
 ('Organization', 0.9098771810531616),
 ('science', 0.9062338471412659),
 ('Law', 0.9048188924789429),
 ('Studies', 0.9007666110992432),
 ('Education', 0.8942281007766724),
 ('Political', 0.8923444747924805),
 ('Society', 0.8902304172515869),
 ('Science', 0.8886737823486328),
 ('Medicine', 0.8862507939338684)]

In [21]:
# 確認２：分散表現
word_vec = model.wv[u'単語']
print(word_vec)
print(np.transpose(word_vec).shape)

[-0.0544568   0.13679808 -0.35749108  0.05034312 -0.018448    0.15091987
 -0.12394528 -0.09055351 -0.20597099 -0.1876517   0.1110284   0.07684731
 -0.07806271 -0.0162644  -0.18043248  0.10543583  0.19625992  0.05441505
 -0.41463816  0.29697278  0.11950846  0.08052836 -0.09025036  0.02078868
  0.16672397 -0.19404823  0.08641643  0.09545647 -0.06334688 -0.12846425
  0.05050173 -0.10663079  0.1275091   0.09031986  0.09797987  0.05163022
  0.0304911   0.02613543  0.17335036 -0.18157065  0.0181381   0.02991033
  0.24255605  0.07176003  0.03419382  0.13056698 -0.03153648 -0.09767581
  0.05309673  0.09953102]
(50,)


### Averaged word vec for self 

In [22]:
# Test
# A token
print('***The firs token: \n' + str(sentences_token_self[0][1]))
print('***Vector for the first token: \n' + str(model.wv[sentences_token_self[0][1]]))
# Tokens for a description for the self
print('***Tokens for the first description for the self: \n' + str(sentences_token_self[0][1:]))

***The firs token: 
怒る
***Vector for the first token: 
[ 0.13081324 -0.08809011  0.09587991 -0.10896447 -0.1243605   0.00587449
  0.12759636  0.12842664 -0.14187886 -0.1501565   0.19304368 -0.1680705
  0.0319627  -0.0188497   0.02146796  0.10404103 -0.01553827 -0.05345729
 -0.12333147  0.08823296  0.26104563  0.19263682  0.03365196  0.21341951
 -0.03339016 -0.1434582   0.1023384  -0.12956187  0.14000835  0.13419922
  0.06674863  0.0335009  -0.24297833  0.00332542 -0.02205168  0.20526797
 -0.00167527  0.18880299  0.17019303 -0.19389708  0.2300993   0.14236334
 -0.03916963  0.23734419  0.02310023 -0.16109885 -0.00754123  0.3246466
  0.17463765 -0.13177094]
***Tokens for the first description for the self: 
['怒る', 'ない', '道', '聞く', '世話好き', 'リーダー', '肌', '計画', 'する', '動く', '行動', 'ある', '友達', 'いる']


In [23]:
# Compute averaged vectors for SELF
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_self):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_self = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_self']).T
word_vec_df_self.set_index('MID', inplace=True)

KeyError: MID = 2, idx = 1,  "Key '別け隔て' not present"
KeyError: MID = 4, idx = 3,  "Key 'ゆるす' not present"
KeyError: MID = 5, idx = 4,  "Key '思い切る' not present"
KeyError: MID = 7, idx = 6,  "Key '出不精' not present"
KeyError: MID = 8, idx = 7,  "Key '言い逃れる' not present"
KeyError: MID = 18, idx = 17,  "Key '知らん顔' not present"
KeyError: MID = 18, idx = 17,  "Key '振り回せる' not present"
KeyError: MID = 39, idx = 38,  "Key '人込み' not present"
KeyError: MID = 41, idx = 40,  "Key 'ひねくれる' not present"
KeyError: MID = 52, idx = 51,  "Key '面倒い' not present"
KeyError: MID = 57, idx = 55,  "Key '悪しい' not present"
KeyError: MID = 67, idx = 65,  "Key 'したう' not present"
KeyError: MID = 97, idx = 95,  "Key '事新しい' not present"
KeyError: MID = 107, idx = 105,  "Key 'しれる' not present"
KeyError: MID = 110, idx = 108,  "Key '思いきる' not present"
KeyError: MID = 111, idx = 109,  "Key 'かまう' not present"
KeyError: MID = 111, idx = 109,  "Key '黙り込む' not present"
KeyError: MID = 115, idx = 113,  "Key 'あふる' not present

In [24]:
word_vec_df_self.head()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480..."
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098..."
3,"[0.079579644, 0.03545095, 0.07576721, 0.017408..."
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629..."
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159..."


In [25]:
word_vec_df_self.tail()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
136,"[0.09338969, -0.00031238527, 0.10654243, 0.003..."
137,"[-0.12244472, 0.08506945, 0.37974653, 0.122191..."
138,"[0.08418046, 0.01697837, 0.02567623, -0.053410..."
139,"[0.08695435, 0.030569127, 0.03894617, 0.054244..."
140,"[0.12781338, -0.007158957, 0.07934391, -0.0213..."


In [26]:
len(word_vec_df_self)

138

### Averaged word vec for target

In [27]:
# Compute averaged vectors for TARGET
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_target):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_target = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_target']).T
word_vec_df_target.set_index('MID', inplace=True)

KeyError: MID = 15, idx = 14,  "Key '考え事' not present"
KeyError: MID = 18, idx = 17,  "Key 'ねたむ' not present"
KeyError: MID = 39, idx = 38,  "Key '人込み' not present"
KeyError: MID = 39, idx = 38,  "Key '人込み' not present"
KeyError: MID = 41, idx = 40,  "Key '察す' not present"
KeyError: MID = 51, idx = 50,  "Key '仲良い' not present"
KeyError: MID = 66, idx = 64,  "Key '仲良い' not present"
KeyError: MID = 67, idx = 65,  "Key 'さておく' not present"
KeyError: MID = 69, idx = 67,  "Key '面倒い' not present"
KeyError: MID = 79, idx = 77,  "Key 'こわもて' not present"
KeyError: MID = 93, idx = 91,  "Key 'つられる' not present"
KeyError: MID = 95, idx = 93,  "Key 'しれる' not present"
KeyError: MID = 109, idx = 107,  "Key '断れる' not present"
KeyError: MID = 111, idx = 109,  "Key 'ムキになる' not present"
KeyError: MID = 111, idx = 109,  "Key '寝転ぶ' not present"
KeyError: MID = 111, idx = 109,  "Key '隣り合わせる' not present"
KeyError: MID = 127, idx = 125,  "Key '取り残す' not present"
KeyError: MID = 132, idx = 129,  "Key 'むくれる' no

In [28]:
word_vec_df_target.head()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
1.0,"[0.102810584, -0.01622817, 0.12231994, 0.00109..."
2.0,"[0.10208222, 0.010917697, 0.067205645, 0.06214..."
3.0,
4.0,"[0.038187366, -0.007282614, 0.14105798, 0.0177..."
5.0,"[0.1120254, -0.00926766, 0.12189267, 0.0372730..."


In [29]:
word_vec_df_target.tail()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
136.0,"[0.044927604, -0.060322974, 0.16509342, -0.020..."
137.0,
138.0,"[0.17474043, 0.22523512, 0.13636838, 0.0266636..."
139.0,"[0.10080279, 0.03757489, 0.17397055, -0.043149..."
140.0,"[0.19102705, 0.026352024, 0.06196414, 0.029856..."


In [30]:
len(word_vec_df_target)

138

### Merge the two DataFrames (self and target)

In [31]:
word_vec_df_both = pd.merge(word_vec_df_self, word_vec_df_target, on='MID', how='outer', indicator=True)

In [32]:
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both
3,"[0.079579644, 0.03545095, 0.07576721, 0.017408...",,both
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both


In [33]:
# Check
word_vec_df_both.loc[word_vec_df_both['_merge'] != 'both', :]

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Word Vec Similarity between Self and Target
cf. https://wakame-msds.com/similarity/
- Euclidean Distance: the curse of dimensionality
- Manhattan Distance: better than the Euclidean
- Cosine Similarity: does not take accoung or the magnitude of each vector (only directions)

In [34]:
type(word_vec_df_both['word_vec_avg_self'][1][0])

numpy.float32

In [35]:
# Remove NaN
word_vec_df_both = word_vec_df_both.dropna()
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both
6,"[0.040560987, 0.007932171, 0.108004756, 0.0216...","[0.08884796, 0.045296524, 0.11056251, 0.036384...",both


In [36]:
# Euclidean Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
import math

def euclidean_distance(x, y):
    if type(x) == np.ndarray and type(x == np.ndarray):
        return math.sqrt(sum(pow(xi - yi, 2) for xi, yi in zip(x, y)))

def euclidean_distance_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [euclidean_distance(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['euclidean_distance'] = euclidean_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

  if type(x) == np.ndarray and type(x == np.ndarray):


Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both,0.3922
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both,0.258167
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both,0.389469
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both,0.216277
6,"[0.040560987, 0.007932171, 0.108004756, 0.0216...","[0.08884796, 0.045296524, 0.11056251, 0.036384...",both,0.297282


In [37]:
# Manhattan Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def manhattan_distance(x, y):
    return sum(abs(xi - yi) for xi, yi in zip(x, y))

def manhattan_distance_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [manhattan_distance(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['manhattan_distance'] = manhattan_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both,0.3922,2.212569
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both,0.258167,1.495546
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both,0.389469,2.212853
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both,0.216277,1.35805
6,"[0.040560987, 0.007932171, 0.108004756, 0.0216...","[0.08884796, 0.045296524, 0.11056251, 0.036384...",both,0.297282,1.619236


In [38]:
# Cosine Similairty
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def square_rooted(x):
    return round(math.sqrt(sum([xi*xi for xi in x])), 3)

def cosine_similarity(x, y):
    numerator = sum(xi*yi for xi, yi in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator / float(denominator), 3)

def cosine_similarity_df(x_ser, y_ser):
    if type(x_ser) == pd.core.series.Series and type(y_ser) == pd.core.series.Series:
        return [cosine_similarity(x, y) for x, y in zip(x_ser, y_ser)]

word_vec_df_both['cosine_similarity'] = cosine_similarity_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both,0.3922,2.212569,0.759
2,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both,0.258167,1.495546,0.879
4,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both,0.389469,2.212853,0.688
5,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both,0.216277,1.35805,0.906
6,"[0.040560987, 0.007932171, 0.108004756, 0.0216...","[0.08884796, 0.045296524, 0.11056251, 0.036384...",both,0.297282,1.619236,0.833


# Transformation

## High and Low IOS groups
- High IOS group (5, 6, 7)
- Low IOS group (1, 2, and 3)

In [39]:
df = pd.read_csv('./output01_all.csv', header=0, index_col=0)
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_activeness_score,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023/05/19-12:49:28,2023/05/19-12:55:05,337.0,2,40,23,1,2,2,1,...,1,2,1,7,2,7,7,2,COMP,2
2,2023/05/19-12:56:08,2023/05/19-13:06:05,597.0,2,36,11,1,3,2,1,...,6,5,2,6,2,5,2,6,COMP,3
3,2023/05/19-13:05:26,2023/05/19-13:13:17,471.0,1,43,23,1,4,3,2,...,5,2,5,4,2,5,3,6,COMP,4
4,2023/05/19-13:06:19,2023/05/19-13:18:47,748.0,1,55,16,1,6,1,2,...,6,1,4,7,5,4,2,7,COMP,6
5,2023/05/19-13:02:30,2023/05/19-13:19:05,995.0,2,53,7,1,1,3,1,...,6,4,3,4,5,3,2,5,COMP,1


In [40]:
HighIOS_indices = df['IOS_score'] > 4
LowIOS_indices = df['IOS_score'] < 4

In [41]:
# df に IOS_group列を追加
df['IOS_group'] = ['NA'] * len(df)
df.loc[HighIOS_indices, ['IOS_group']] = 'HighIOS'
df.loc[LowIOS_indices, ['IOS_group']] = 'LowIOS'
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID,IOS_group
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2023/05/19-12:49:28,2023/05/19-12:55:05,337.0,2,40,23,1,2,2,1,...,2,1,7,2,7,7,2,COMP,2,LowIOS
2,2023/05/19-12:56:08,2023/05/19-13:06:05,597.0,2,36,11,1,3,2,1,...,5,2,6,2,5,2,6,COMP,3,LowIOS
3,2023/05/19-13:05:26,2023/05/19-13:13:17,471.0,1,43,23,1,4,3,2,...,2,5,4,2,5,3,6,COMP,4,
4,2023/05/19-13:06:19,2023/05/19-13:18:47,748.0,1,55,16,1,6,1,2,...,1,4,7,5,4,2,7,COMP,6,HighIOS
5,2023/05/19-13:02:30,2023/05/19-13:19:05,995.0,2,53,7,1,1,3,1,...,4,3,4,5,3,2,5,COMP,1,LowIOS


# Output

In [42]:
overlap_magnitude_df.head()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,14,10,1,0.043478
2,15,12,3,0.125
3,28,0,0,0.0
4,13,15,3,0.12
5,28,18,6,0.15


In [43]:
df.loc[:, ['IOS_score', 'IOS_group']].head()

Unnamed: 0_level_0,IOS_score,IOS_group
MID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,LowIOS
2,3,LowIOS
3,4,
4,6,HighIOS
5,1,LowIOS


In [44]:
# overlap_magnitude_df と df.loc[:, ['IOS_score', 'IOS_group']] をマージ（key=MID)
out_df = pd.merge(df.loc[:, ['IOS_score', 'IOS_group']], overlap_magnitude_df, on='MID')
# さらにword_vec_df_bothをマージ
out_df = pd.merge(out_df, word_vec_df_both, on='MID')

out_df.head()

Unnamed: 0_level_0,IOS_score,IOS_group,wc_self,wc_target,wc_intercept,Overlap_score,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2,LowIOS,14,10,1,0.043478,"[0.1058292, 0.029083064, 0.059157465, 0.029480...","[0.102810584, -0.01622817, 0.12231994, 0.00109...",both,0.3922,2.212569,0.759
2,3,LowIOS,15,12,3,0.125,"[0.048972588, 0.03272291, 0.11012402, -0.00098...","[0.10208222, 0.010917697, 0.067205645, 0.06214...",both,0.258167,1.495546,0.879
4,6,HighIOS,13,15,3,0.12,"[0.02119412, -0.0402195, 0.012135086, 0.015629...","[0.038187366, -0.007282614, 0.14105798, 0.0177...",both,0.389469,2.212853,0.688
5,1,LowIOS,28,18,6,0.15,"[0.06807982, 0.03903689, 0.106884375, 0.046159...","[0.1120254, -0.00926766, 0.12189267, 0.0372730...",both,0.216277,1.35805,0.906
6,5,HighIOS,12,11,3,0.15,"[0.040560987, 0.007932171, 0.108004756, 0.0216...","[0.08884796, 0.045296524, 0.11056251, 0.036384...",both,0.297282,1.619236,0.833


In [45]:
filename = './output02.csv'
out_df.to_csv(filename, index=True)

In [None]:
# 確認用
# sentences_token_self_df = pd.DataFrame(sentences_token_self).rename(columns={0: 'MID'})
# sentences_token_self_df.set_index('MID')
# sentences_token_self_df.to_csv('./sentences_token_self_df.csv', index=False)
# 
# sentences_token_target_df = pd.DataFrame(sentences_token_target).rename(columns={0: 'MID'})
# sentences_token_target_df.set_index('MID')
# sentences_token_target_df.to_csv('./sentences_token_target_df.csv', index=False)