# Setup

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import gensim

# Tokenization
We will focus on specific kinds of part-of-speech (POS), i.e.,
- Adjectives
- Verbs and nouns that form verbs when "suru" is added as a suffix
- nouns

## POS tagging

In [4]:
!mecab -o ./output01_self.txt.mecab ./output01_self.txt
!mecab -o ./output01_target.txt.mecab ./output01_target.txt

## Morphological Analysis

In [5]:
def my_morphol(filename, encoding=None):
	"""Parse a MeCab output file into one record per respondent.

	The file is read line by line. Every line before the first ``EOS`` line
	is treated as the header and skipped. After that, each block of lines
	terminated by ``EOS`` is one sentence: if its first token is tagged as a
	number (second feature field == '数'), that token is taken as the
	respondent id (MID) and the remaining tokens are stored as morpheme dicts.

	Parameters
	----------
	filename : str
		Path to the MeCab output file (``surface<TAB>feature,feature,...``
		lines with ``EOS`` sentence terminators).
	encoding : str or None, optional
		Encoding passed to ``open``. ``None`` (default) keeps the platform
		default, exactly as before.

	Returns
	-------
	list of list
		One list per sentence whose MID is > 0: ``[MID, morph, morph, ...]``
		where each ``morph`` is a dict with keys 'surface', 'base', 'pos' and
		'pos1'. Sentences that do not start with a numeric MID are dropped.

	Notes
	-----
	Lines are deliberately not stripped before the feature string is split,
	so when the seventh feature is the last one on the line the stored
	'base' keeps its trailing newline (e.g. ``'*\\n'``). Downstream code
	compares against that exact value.
	"""
	sentences = []
	sentence = []
	MID = -1
	in_header = True			# everything up to the first EOS is the header
	at_sentence_start = False	# True when the previous processed line was EOS
	with open(filename, mode='r', encoding=encoding) as f:
		for line in f:
			# End of sentence (EOS line); tolerate a missing final newline.
			if line.rstrip('\n') == 'EOS':
				if MID > 0:
					sentences.append(sentence)
				# Always start the next sentence from a clean state. Without
				# this, morphemes from a sentence that had no valid MID would
				# leak into the following record and push its MID away from
				# index 0.
				sentence = []
				MID = -1
				in_header = False
				at_sentence_start = True
				continue

			# Header tokens (before the first EOS) are ignored.
			if in_header:
				continue

			fields = line.split('\t')
			attr = fields[1].split(',')

			# A literal 'MID' token is skipped without changing the
			# sentence-start state.
			if fields[0] == 'MID':
				continue

			if at_sentence_start and attr[1] == '数':
				# Start of a sentence: the leading number is the MID.
				MID = int(fields[0])
				sentence.append(MID)
			else:
				# Inside a sentence: keep the fields used for tokenization.
				morph = {'surface': fields[0], 'base': attr[6], 'pos': attr[0], 'pos1': attr[1]}
				sentence.append(morph)
			at_sentence_start = False

	return sentences

In [6]:
# SELF
filename = './output01_self.txt.mecab'
sentences_self = my_morphol(filename)

In [7]:
sentences_self[0]

[2,
 {'surface': '愛想', 'base': '愛想', 'pos': '名詞', 'pos1': '一般'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': 'ない', 'base': 'ない', 'pos': '形容詞', 'pos1': '自立'}]

In [8]:
len(sentences_self)

88

In [9]:
# TARGET
filename = './output01_target.txt.mecab'
sentences_target = my_morphol(filename)

In [10]:
sentences_target[0]

[2]

In [11]:
len(sentences_target)

88

## Tokenization

In [12]:
def my_tokenization(sentences, pos_filter=(('名詞', '一般'), ('名詞', '固有名詞'), ('名詞', '代名詞'))):
	"""Reduce each parsed sentence to its MID followed by selected base forms.

	POS names follow
	https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf

	Parameters
	----------
	sentences : list of list
		Each item is ``[MID, morph, morph, ...]`` where ``morph`` is a dict
		with at least the keys 'base', 'pos' and 'pos1'.
	pos_filter : iterable of (pos, pos1) pairs, optional
		Morphemes whose ('pos', 'pos1') pair is listed here are kept. The
		default keeps the "stereotype" words only: general nouns, proper
		nouns and pronouns. Other categories can be enabled by passing extra
		pairs, for example
		  - traits: ('形容詞', '自立'), ('名詞', '形容動詞語幹'),
		    ('名詞', 'ナイ形容詞語幹')
		  - behaviors: ('動詞', '自立'), ('名詞', 'サ変接続')

	Returns
	-------
	list of list
		One ``[MID, base, base, ...]`` list per input sentence, in input order.

	Notes
	-----
	A morpheme whose 'base' is ``'*\\n'`` has no dictionary base form
	(e.g. {'surface': 'キャパオーバー', 'base': '*\\n', ...}) and is skipped for
	every category.
	"""
	wanted = {tuple(pair) for pair in pos_filter}

	sentences_token = []
	for sentence in sentences:
		# sentence[0] is the MID; the morphemes follow it.
		sentence_token = [sentence[0]]
		for morph in sentence[1:]:
			if (morph['pos'], morph['pos1']) not in wanted:
				continue
			if morph['base'] == '*\n':
				continue
			sentence_token.append(morph['base'])
		sentences_token.append(sentence_token)

	return sentences_token

In [13]:
# NOTE: Earlier draft of my_tokenization, kept commented out for reference only;
# nothing in this cell is executed. Unlike the active version, this draft also
# collects the trait (adjective) and behavior (verb) categories, and it falls
# back to the surface form when a general or proper noun has base == '*\n'.
#
# def my_tokenization(sentences):
# 	sentences_token = []
# 	sentence_token = []
# 	for sentence in sentences:
# 		# sentence[0] = MID
# 		sentence_token.append(sentence[0])
# 
# 		for morph in sentence[1:]:
# 			# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
# 			# Words for Traits
# 			if morph['pos'] == '形容詞' and morph['pos1'] == '自立': # pos = 形容詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '形容動詞語幹': # pos = 名詞, pos1 = 形容動詞語幹
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'ナイ形容詞語幹': # pos = 名詞, pos1 = ナイ形容詞語幹
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Behaviors
# 			elif morph['pos'] == '動詞' and morph['pos1'] == '自立': # pos = 動詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'サ変接続': # pos = 名詞, pos1 = サ変接続, 
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Stereotype etc
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '一般': # pos = 名詞, 一般
# 				if morph['base'] == '*\n': # e.g., {'surface': 'キャパオーバー', 'base': '*\n', 'pos': '名詞', 'pos1': '一般'},
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '固有名詞': # pos = 名詞, 固有名詞
# 				if morph['base'] == '*\n': # {'surface': 'k', 'base': '*\n', 'pos': '名詞', 'pos1': '固有名詞'}
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '代名詞': # pos = 名詞, 代名詞
# 				sentence_token.append(morph['base'])
# 
# 		sentences_token.append(sentence_token)
# 		sentence_token = []
# 
# 	return sentences_token

In [14]:
# SELF
sentences_token_self = my_tokenization(sentences_self)

In [15]:
sentences_token_self[0]

[2, '愛想']

In [16]:
# TARGET
sentences_token_target = my_tokenization(sentences_target)

In [17]:
sentences_token_target[0]

[2]

# Indices

## Overlapping magnitude

### An unoriented incidence matrix A
- unique words in rows
- self and target in columns (self in the first column; target in the second column)
- When the word w_i, represented in the i-th row, is used to describe the self, the corresponding element a_i1 is 1; otherwise it is 0. Likewise, a_i2 is 1 when w_i is used to describe the target, and 0 otherwise.

### A self-other overlap matrix
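transpose(A) * A, a 2 x 2 matrix whose elements are n_11 (number of unique words used for the self), n_22 (number of unique words used for the target), and n_12 (number of unique words used for both)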

### Overlapping magnitude
n_12 / (n_11 + n_22 - n_12)

In [18]:
# Per-respondent overlap between the words used for the self and for the target.
#   n11 = number of unique self words
#   n22 = number of unique target words
#   n12 = number of unique words used for both
#   overlap magnitude = n12 / (n11 + n22 - n12)   (NaN when there are no words at all)
# These counts are exactly the entries of transpose(A) * A for the binary
# word-by-condition incidence matrix A, so they are computed directly with sets.
mid_list = []
wc_self = []
wc_target = []
wc_intercept = []
overlap_magnitude_list = []

# zip() stops at the shorter list without complaint, so report a length mismatch.
if len(sentences_token_self) != len(sentences_token_target):
	print('Warning: the number of self records (' + str(len(sentences_token_self)) + ') differs from the number of target records (' + str(len(sentences_token_target)) + '); extra records are ignored.')

for tokens_self, tokens_target in zip(sentences_token_self, sentences_token_target):

	# The first element of each token list is the MID. A comparison never
	# raises, so the mismatch has to be tested explicitly; mismatching pairs
	# are reported and skipped.
	if tokens_self[0] != tokens_target[0]:
		print('The MIDs do not match: the self mid is ' + str(tokens_self[0]) + ', while the target mid is ' + str(tokens_target[0]))
		continue

	words_self = set(tokens_self[1:])
	words_target = set(tokens_target[1:])

	n11 = len(words_self)
	n22 = len(words_target)
	n12 = len(words_self & words_target)

	# Size of the union of the two word sets; zero only when both are empty.
	n_union = n11 + n22 - n12
	if n_union > 0:
		overlap_magnitude = n12 / n_union
	else:
		# will be stored as NaN in the DataFrame
		overlap_magnitude = np.nan

	mid_list.append(tokens_self[0])
	wc_self.append(n11)
	wc_target.append(n22)
	wc_intercept.append(n12)
	overlap_magnitude_list.append(overlap_magnitude)


In [19]:
overlap_magnitude_df = pd.DataFrame({
	'MID': mid_list,
	'wc_self': wc_self,
	'wc_target': wc_target,
	'wc_intercept': wc_intercept,
	'Overlap_score': overlap_magnitude_list})
overlap_magnitude_df = overlap_magnitude_df.set_index('MID')
len(overlap_magnitude_df)

88

In [20]:
overlap_magnitude_df.head()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,0,0,0.0
3,5,3,0,0.0
5,0,0,0,
8,0,0,0,
9,2,0,0,0.0


In [21]:
overlap_magnitude_df.tail()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
174,2,5,0,0.0
176,6,5,3,0.375
177,1,0,0,0.0
181,0,0,0,
182,6,15,2,0.105263


## Word Vec Similarity (self vs. target)

### Preparing a word vec model

In [22]:
from gensim.models import Word2Vec
model_path = '../../Materials/word2vec.gensim.model'
model = Word2Vec.load(model_path)

In [23]:
# 確認：類似語
model.wv.most_similar(positive=['Social'], topn=10)

[('Economic', 0.9120144248008728),
 ('Organization', 0.9098771810531616),
 ('science', 0.9062338471412659),
 ('Law', 0.9048188924789429),
 ('Studies', 0.9007666110992432),
 ('Education', 0.8942281007766724),
 ('Political', 0.8923444747924805),
 ('Society', 0.8902304172515869),
 ('Science', 0.8886737823486328),
 ('Medicine', 0.8862507939338684)]

In [24]:
# 確認：分散表現
word_vec = model.wv[u'単語']
print(word_vec)
print(np.transpose(word_vec).shape)

[-0.0544568   0.13679808 -0.35749108  0.05034312 -0.018448    0.15091987
 -0.12394528 -0.09055351 -0.20597099 -0.1876517   0.1110284   0.07684731
 -0.07806271 -0.0162644  -0.18043248  0.10543583  0.19625992  0.05441505
 -0.41463816  0.29697278  0.11950846  0.08052836 -0.09025036  0.02078868
  0.16672397 -0.19404823  0.08641643  0.09545647 -0.06334688 -0.12846425
  0.05050173 -0.10663079  0.1275091   0.09031986  0.09797987  0.05163022
  0.0304911   0.02613543  0.17335036 -0.18157065  0.0181381   0.02991033
  0.24255605  0.07176003  0.03419382  0.13056698 -0.03153648 -0.09767581
  0.05309673  0.09953102]
(50,)


### Averaged word vec for self

In [25]:
# Compute averaged vectors for SELF
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_self):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_self = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_self']).T
word_vec_df_self.set_index('MID', inplace=True)

KeyError: MID = 17, idx = 7,  "Key '別け隔て' not present"
KeyError: MID = 64, idx = 21,  "Key '出不精' not present"
KeyError: MID = 103, idx = 47,  "Key '知らん顔' not present"
KeyError: MID = 182, idx = 87,  "Key '人込み' not present"


In [26]:
word_vec_df_self.head()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
2.0,"[0.08338137, -0.06351369, 0.041429847, -0.0717..."
3.0,"[0.12970965, -0.02112623, -0.022533875, 0.0432..."
5.0,
8.0,
9.0,"[0.069289185, -0.061326534, 0.100008816, 0.039..."


In [27]:
word_vec_df_self.tail()

Unnamed: 0_level_0,word_vec_avg_self
MID,Unnamed: 1_level_1
174.0,"[0.18372181, 0.06282128, 0.11855986, -0.020985..."
176.0,"[0.051711638, -0.01296172, -0.023216402, 0.080..."
177.0,"[-0.02619045, 0.10630223, -0.21030313, -0.0440..."
181.0,
182.0,"[0.09903046, 0.0692933, -0.02205129, -0.058867..."


In [28]:
len(word_vec_df_self)

88

### Averaged word vec for target

In [29]:
# Compute averaged vectors for TARGET
mid_list = []
word_vec_avg_list = []
for i, tokens in enumerate(sentences_token_target):

    word_vec_arr = []
    for token in tokens[1:]:
        try:
            word_vec_arr.append(model.wv[token])
        except KeyError as e:
            print(f'KeyError: MID = {tokens[0]}, idx = {i}, ', e)

    if len(word_vec_arr) > 0:
        word_vec_arr = np.asarray(word_vec_arr)
        word_vec_avg = np.average(word_vec_arr, axis=0)
    else:
        print(f'Warning: MID = {tokens[0]}, idx = {i}, the word_vec_arr is empty, replaced by NaN.')
        word_vec_avg = np.nan

    mid_list.append(tokens[0])
    word_vec_avg_list.append(word_vec_avg)

word_vec_df_target = pd.DataFrame([mid_list, word_vec_avg_list], index=['MID', 'word_vec_avg_target']).T
word_vec_df_target.set_index('MID', inplace=True)

KeyError: MID = 72, idx = 25,  "Key '物おじ' not present"
KeyError: MID = 95, idx = 42,  "Key '考え事' not present"
KeyError: MID = 182, idx = 87,  "Key '人込み' not present"
KeyError: MID = 182, idx = 87,  "Key '人込み' not present"


In [30]:
word_vec_df_target.head()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
2.0,
3.0,"[0.10904028, -0.08473143, 0.056752145, 0.05782..."
5.0,
8.0,
9.0,


In [31]:
word_vec_df_target.tail()

Unnamed: 0_level_0,word_vec_avg_target
MID,Unnamed: 1_level_1
174.0,"[0.10767321, 0.014301893, 0.10745349, 0.005642..."
176.0,"[0.0326714, 0.04333663, -0.028047377, 0.038297..."
177.0,
181.0,
182.0,"[0.060379803, 0.04116907, -0.0798686, 0.035706..."


In [32]:
len(word_vec_df_target)

88

### Merge the two dataframes (self and target)

In [33]:
word_vec_df_both = pd.merge(word_vec_df_self, word_vec_df_target, on='MID', how='outer', indicator=True)

In [34]:
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,"[0.08338137, -0.06351369, 0.041429847, -0.0717...",,both
3.0,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both
5.0,,,both
8.0,,,both
9.0,"[0.069289185, -0.061326534, 0.100008816, 0.039...",,both


In [35]:
word_vec_df_both.loc[word_vec_df_both['_merge'] != 'both', :]

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Word Vec Similarity between Self and Target
cf. https://wakame-msds.com/similarity/
- Euclidean Distance: the curse of dimensionality
- Manhattan Distance: better than the Euclidean
- Cosine Similarity: does not take into account the magnitude of each vector (only directions)

In [36]:
# Remove NaN
word_vec_df_both = word_vec_df_both.dropna()
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both
11,"[0.044672675, -0.02641947, 0.1916748, 0.042756...","[0.04378919, 0.024327224, -0.09181204, 0.06682...",both
13,"[-0.0073869806, 0.10435431, -0.17571743, 0.071...","[0.1713229, 0.031187793, 0.0027970274, -0.0218...",both
17,"[0.006195314, -0.04341352, -0.065547384, -0.01...","[0.100749515, -0.034773186, 0.013341621, 0.099...",both
22,"[0.040373057, 0.014811024, 0.16814518, 0.08294...","[0.043547124, 0.005184115, 0.026600244, 0.0438...",both


In [37]:
# Euclidean Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
import math

def euclidean_distance(x, y):
    """Return the Euclidean distance between two vectors.

    Parameters
    ----------
    x, y : numpy.ndarray
        Vectors to compare; elements are paired with ``zip``.

    Returns
    -------
    float or None
        The distance, or ``None`` when either argument is not a
        ``numpy.ndarray`` (for example a NaN placeholder).
    """
    # Both arguments must be arrays. The previous check tested
    # ``type(x == np.ndarray)``, which is always truthy and never looked at y.
    if not (isinstance(x, np.ndarray) and isinstance(y, np.ndarray)):
        return None
    return math.sqrt(sum(pow(xi - yi, 2) for xi, yi in zip(x, y)))

def euclidean_distance_df(x_ser, y_ser):
    """Apply ``euclidean_distance`` to two Series, pair by pair.

    Returns a list with one distance per (x, y) pair, or ``None`` when either
    argument is not exactly a ``pandas.Series``.
    """
    if type(x_ser) is not pd.Series or type(y_ser) is not pd.Series:
        return None

    distances = []
    for x, y in zip(x_ser, y_ser):
        distances.append(euclidean_distance(x, y))
    return distances

word_vec_df_both['euclidean_distance'] = euclidean_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

  if type(x) == np.ndarray and type(x == np.ndarray):


Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both,0.581706
11,"[0.044672675, -0.02641947, 0.1916748, 0.042756...","[0.04378919, 0.024327224, -0.09181204, 0.06682...",both,0.980559
13,"[-0.0073869806, 0.10435431, -0.17571743, 0.071...","[0.1713229, 0.031187793, 0.0027970274, -0.0218...",both,0.980611
17,"[0.006195314, -0.04341352, -0.065547384, -0.01...","[0.100749515, -0.034773186, 0.013341621, 0.099...",both,0.870356
22,"[0.040373057, 0.014811024, 0.16814518, 0.08294...","[0.043547124, 0.005184115, 0.026600244, 0.0438...",both,0.629677


In [38]:
# Manhattan Distance
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def manhattan_distance(x, y):
    """Return the Manhattan (L1) distance between two sequences.

    Elements are paired with ``zip``; the absolute differences are summed
    starting from 0.
    """
    total = 0
    for xi, yi in zip(x, y):
        total = total + abs(xi - yi)
    return total

def manhattan_distance_df(x_ser, y_ser):
    """Apply ``manhattan_distance`` to two Series, pair by pair.

    Returns a list with one distance per (x, y) pair, or ``None`` when either
    argument is not exactly a ``pandas.Series``.
    """
    both_are_series = type(x_ser) is pd.Series and type(y_ser) is pd.Series
    if both_are_series:
        return list(map(manhattan_distance, x_ser, y_ser))
    return None

word_vec_df_both['manhattan_distance'] = manhattan_distance_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both,0.581706,3.300618
11,"[0.044672675, -0.02641947, 0.1916748, 0.042756...","[0.04378919, 0.024327224, -0.09181204, 0.06682...",both,0.980559,5.501461
13,"[-0.0073869806, 0.10435431, -0.17571743, 0.071...","[0.1713229, 0.031187793, 0.0027970274, -0.0218...",both,0.980611,5.779664
17,"[0.006195314, -0.04341352, -0.065547384, -0.01...","[0.100749515, -0.034773186, 0.013341621, 0.099...",both,0.870356,5.114403
22,"[0.040373057, 0.014811024, 0.16814518, 0.08294...","[0.043547124, 0.005184115, 0.026600244, 0.0438...",both,0.629677,3.41052


In [39]:
# Cosine Similarity
# https://ashukumar27.medium.com/similarity-functions-in-python-aa6dfe721035
def square_rooted(x):
    """Return the Euclidean norm of ``x`` rounded to three decimal places."""
    sum_of_squares = 0
    for xi in x:
        sum_of_squares = sum_of_squares + xi * xi
    norm = math.sqrt(sum_of_squares)
    return round(norm, 3)

def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors, rounded to 3 decimals.

    Parameters
    ----------
    x, y : sequence of numbers
        Vectors to compare; elements are paired with ``zip`` for the dot
        product.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)`` rounded to three decimal places, or
        NaN when either vector has zero norm (the direction is undefined).

    Notes
    -----
    The norms are used at full precision. Rounding them to three decimals
    before dividing distorts the result for short vectors and can turn a
    small non-zero norm into a zero denominator. Only the final value is
    rounded.
    """
    numerator = float(sum(xi * yi for xi, yi in zip(x, y)))
    norm_x = math.sqrt(sum(xi * xi for xi in x))
    norm_y = math.sqrt(sum(yi * yi for yi in y))
    denominator = norm_x * norm_y
    if denominator == 0:
        return float('nan')
    return round(numerator / denominator, 3)

def cosine_similarity_df(x_ser, y_ser):
    """Apply ``cosine_similarity`` to two Series, pair by pair.

    Returns a list with one similarity per (x, y) pair, or ``None`` when
    either argument is not exactly a ``pandas.Series``.
    """
    for candidate in (x_ser, y_ser):
        if type(candidate) is not pd.Series:
            return None

    pairs = zip(x_ser, y_ser)
    return [cosine_similarity(left, right) for left, right in pairs]

word_vec_df_both['cosine_similarity'] = cosine_similarity_df(word_vec_df_both["word_vec_avg_self"], word_vec_df_both["word_vec_avg_target"])
word_vec_df_both.head()

Unnamed: 0_level_0,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both,0.581706,3.300618,0.626
11,"[0.044672675, -0.02641947, 0.1916748, 0.042756...","[0.04378919, 0.024327224, -0.09181204, 0.06682...",both,0.980559,5.501461,0.352
13,"[-0.0073869806, 0.10435431, -0.17571743, 0.071...","[0.1713229, 0.031187793, 0.0027970274, -0.0218...",both,0.980611,5.779664,0.203
17,"[0.006195314, -0.04341352, -0.065547384, -0.01...","[0.100749515, -0.034773186, 0.013341621, 0.099...",both,0.870356,5.114403,0.384
22,"[0.040373057, 0.014811024, 0.16814518, 0.08294...","[0.043547124, 0.005184115, 0.026600244, 0.0438...",both,0.629677,3.41052,0.615


# Transformation
High IOS group (5, 6, 7) and Low IOS group (1, 2, and 3); a score of 4 belongs to neither group and is labeled 'NA'.

In [40]:
df = pd.read_csv('./output01_all.csv', header=0, index_col=0)
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_activeness_score,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2023/05/19-12:49:10,2023/05/19-12:52:07,00:02:57,1,40,7,1,5,2,2,...,2,2,5,5,3,3,3,4,COMP,5
3,2023/05/19-12:49:28,2023/05/19-12:55:05,00:05:37,2,40,23,1,2,2,1,...,1,2,1,7,2,7,7,2,COMP,2
5,2023/05/19-12:54:24,2023/05/19-12:57:31,00:03:07,1,46,18,1,2,3,1,...,4,4,3,5,4,4,3,4,COMP,2
8,2023/05/19-12:57:23,2023/05/19-13:00:14,00:02:51,2,39,13,1,7,4,2,...,7,1,7,5,7,4,7,1,COMP,14
9,2023/05/19-12:55:51,2023/05/19-13:01:43,00:05:52,1,49,13,1,1,4,2,...,3,5,3,2,2,4,3,5,COMP,8


In [41]:
HighIOS_indices = df['IOS_score'] > 4
LowIOS_indices = df['IOS_score'] < 4

In [42]:
# df にIOS_group列を追加
df['IOS_group'] = ['NA'] * len(df)
df.loc[HighIOS_indices, ['IOS_group']] = 'HighIOS'
df.loc[LowIOS_indices, ['IOS_group']] = 'LowIOS'
df.head()

Unnamed: 0_level_0,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,target_sex,...,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID,IOS_group
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2023/05/19-12:49:10,2023/05/19-12:52:07,00:02:57,1,40,7,1,5,2,2,...,2,5,5,3,3,3,4,COMP,5,HighIOS
3,2023/05/19-12:49:28,2023/05/19-12:55:05,00:05:37,2,40,23,1,2,2,1,...,2,1,7,2,7,7,2,COMP,2,LowIOS
5,2023/05/19-12:54:24,2023/05/19-12:57:31,00:03:07,1,46,18,1,2,3,1,...,4,3,5,4,4,3,4,COMP,2,LowIOS
8,2023/05/19-12:57:23,2023/05/19-13:00:14,00:02:51,2,39,13,1,7,4,2,...,1,7,5,7,4,7,1,COMP,14,HighIOS
9,2023/05/19-12:55:51,2023/05/19-13:01:43,00:05:52,1,49,13,1,1,4,2,...,5,3,2,2,4,3,5,COMP,8,LowIOS


# Output

In [43]:
overlap_magnitude_df.head()

Unnamed: 0_level_0,wc_self,wc_target,wc_intercept,Overlap_score
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,0,0,0.0
3,5,3,0,0.0
5,0,0,0,
8,0,0,0,
9,2,0,0,0.0


In [44]:
df.loc[:, ['IOS_score', 'IOS_group']].head()

Unnamed: 0_level_0,IOS_score,IOS_group
MID,Unnamed: 1_level_1,Unnamed: 2_level_1
2,5,HighIOS
3,2,LowIOS
5,2,LowIOS
8,7,HighIOS
9,1,LowIOS


In [45]:
# overlap_magnitude_df と df.loc[:, ['IOS_score', 'IOS_group']] をマージ（key=MID)
out_df = pd.merge(df.loc[:, ['IOS_score', 'IOS_group']], overlap_magnitude_df, on='MID')
# さらにword_vec_df_bothをマージ
out_df = pd.merge(out_df, word_vec_df_both, on='MID')

out_df.head()

Unnamed: 0_level_0,IOS_score,IOS_group,wc_self,wc_target,wc_intercept,Overlap_score,word_vec_avg_self,word_vec_avg_target,_merge,euclidean_distance,manhattan_distance,cosine_similarity
MID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,2,LowIOS,5,3,0,0.0,"[0.12970965, -0.02112623, -0.022533875, 0.0432...","[0.10904028, -0.08473143, 0.056752145, 0.05782...",both,0.581706,3.300618,0.626
11,2,LowIOS,1,6,0,0.0,"[0.044672675, -0.02641947, 0.1916748, 0.042756...","[0.04378919, 0.024327224, -0.09181204, 0.06682...",both,0.980559,5.501461,0.352
13,6,HighIOS,2,3,0,0.0,"[-0.0073869806, 0.10435431, -0.17571743, 0.071...","[0.1713229, 0.031187793, 0.0027970274, -0.0218...",both,0.980611,5.779664,0.203
17,3,LowIOS,5,2,0,0.0,"[0.006195314, -0.04341352, -0.065547384, -0.01...","[0.100749515, -0.034773186, 0.013341621, 0.099...",both,0.870356,5.114403,0.384
22,6,HighIOS,2,5,2,0.4,"[0.040373057, 0.014811024, 0.16814518, 0.08294...","[0.043547124, 0.005184115, 0.026600244, 0.0438...",both,0.629677,3.41052,0.615


In [46]:
# # overlap_magnitude_df と df.loc[:, ['MID', 'IOS_group']] をマージ（key=MID)
# out_df = pd.merge(overlap_magnitude_df, df.loc[:, ['MID', 'IOS_score', 'IOS_group']], on='MID')

filename = './output02.csv'
out_df.to_csv(filename, index=True)

In [47]:
# For verification: dump the token lists to CSV, one row per respondent.
# Column 0 of each frame is the MID. It is moved to the index and written out
# with index=True; with index=False the MID would be dropped from the file and
# the rows could no longer be traced back to respondents.
sentences_token_self_df = (
    pd.DataFrame(sentences_token_self)
    .rename(columns={0: 'MID'})
    .set_index('MID')
)
sentences_token_self_df.to_csv('./sentences_token_self_df.csv', index=True)

sentences_token_target_df = (
    pd.DataFrame(sentences_token_target)
    .rename(columns={0: 'MID'})
    .set_index('MID')
)
sentences_token_target_df.to_csv('./sentences_token_target_df.csv', index=True)