# Setup

In [1]:
import pandas as pd

In [2]:
import numpy as np

# Tokenization
We will focus on the following parts of speech (POS):
- Adjectives
- Verbs, and nouns that form verbs when "suru" is added as a suffix
- Nouns

## POS tagging

In [3]:
# Run MeCab on each input text file; the analysis is written to the path given after -o
# (these .mecab files are read back by my_morphol below).
!mecab -o ./output01_self.txt.mecab ./output01_self.txt
!mecab -o ./output01_target.txt.mecab ./output01_target.txt

## Morphological Analysis

In [4]:
def my_morphol(filename, encoding=None):
	"""Parse a MeCab output file into one token list per record.

	The file is read line by line. Lines are either ``EOS`` (end of one
	analysed input line) or ``surface<TAB>feature,feature,...``.

	* Everything up to and including the first ``EOS`` is treated as the
	  header record and skipped without being parsed.
	* After an ``EOS``, a first token whose second feature field is ``'数'``
	  is taken as the record ID (MID) and stored as an ``int`` at index 0.
	* Every other token becomes a dict with the keys ``surface``, ``base``
	  (7th feature field), ``pos`` (1st) and ``pos1`` (2nd).
	* Tokens whose surface is exactly ``'MID'`` are ignored.
	* At each ``EOS`` the record is emitted only if a MID greater than 0 was
	  seen; the buffer is cleared either way, so tokens from a line without
	  a MID can never end up in front of the next record's MID.

	Lines are not stripped before splitting: when the 7th feature field is
	the last one on the line, ``base`` keeps its trailing newline (e.g.
	``'*\\n'``). Downstream filtering compares against that exact value.

	Args:
		filename: Path of the MeCab output file.
		encoding: Text encoding passed to ``open``; ``None`` keeps the
			platform default.

	Returns:
		A list of records, each ``[MID, morph_dict, morph_dict, ...]``.

	Raises:
		IndexError: If a non-header token line has no tab or fewer than
			seven feature fields.
		ValueError: If the MID token cannot be converted with ``int``.
	"""
	sentences = []
	sentence = []
	MID = -1
	header_done = False		# True once the first EOS has been consumed
	at_record_start = False	# True while the next token directly follows an EOS
	with open(filename, mode='r', encoding=encoding) as f:
		for line in f:
			# End of record. Comparing the stripped line also accepts a final
			# 'EOS' that has no trailing newline.
			if line.rstrip('\r\n') == 'EOS':
				if MID > 0:
					sentences.append(sentence)
				# Always start from a clean buffer, even when nothing was emitted.
				sentence = []
				MID = -1
				header_done = True
				at_record_start = True
				continue

			# Header record: discard without parsing.
			if not header_done:
				continue

			fields = line.split('\t')
			# Stray 'MID' tokens are ignored and do not change the parser state.
			if fields[0] == 'MID':
				continue
			attr = fields[1].split(',')

			if at_record_start and attr[1] == '数':
				# First token of the record: the numeric record ID.
				MID = int(fields[0])
				sentence.append(MID)
			else:
				morph = {'surface': fields[0], 'base': attr[6], 'pos': attr[0], 'pos1': attr[1]}
				sentence.append(morph)
			at_record_start = False

	return sentences

In [5]:
# SELF: parse the MeCab output produced from output01_self.txt
filename = './output01_self.txt.mecab'
sentences_self = my_morphol(filename)

In [6]:
sentences_self[0]

[2,
 {'surface': '愛想', 'base': '愛想', 'pos': '名詞', 'pos1': '一般'},
 {'surface': 'が', 'base': 'が', 'pos': '助詞', 'pos1': '格助詞'},
 {'surface': 'ない', 'base': 'ない', 'pos': '形容詞', 'pos1': '自立'}]

In [7]:
len(sentences_self)

88

In [8]:
# TARGET: parse the MeCab output produced from output01_target.txt
filename = './output01_target.txt.mecab'
sentences_target = my_morphol(filename)

In [9]:
sentences_target[0]

[2]

In [10]:
len(sentences_target)

88

## Tokenization

In [11]:
def my_tokenization(sentences):
	"""Reduce each parsed record to its MID plus the base forms of selected words.

	Args:
		sentences: Records shaped like ``[MID, morph, morph, ...]`` where each
			morph is a dict with at least ``pos``, ``pos1`` and ``base``.

	Returns:
		One list per record: the MID followed by the ``base`` of every kept
		morph, in the original order.

	Selection rules
	(POS labels follow https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf):
		* 形容詞/自立 and 動詞/自立 are always kept.
		* 名詞 with pos1 in 形容動詞語幹, ナイ形容詞語幹, サ変接続, 一般,
		  固有名詞 or 代名詞 is kept unless its base is ``'*\\n'``
		  (e.g. {'surface': 'キャパオーバー', 'base': '*\\n', ...}).
	"""
	# Traits (adjectives) and behaviors (verbs): kept without inspecting the base form
	unfiltered_pairs = {('形容詞', '自立'), ('動詞', '自立')}
	# Noun sub-categories for traits, behaviors and stereotype words
	noun_subtypes = {
		'形容動詞語幹', 'ナイ形容詞語幹',	# traits
		'サ変接続',						# behaviors
		'一般', '固有名詞', '代名詞',		# stereotype etc.
	}

	tokenized = []
	for record in sentences:
		# record[0] is the MID
		kept = [record[0]]
		for m in record[1:]:
			pair = (m['pos'], m['pos1'])
			if pair in unfiltered_pairs:
				kept.append(m['base'])
			elif pair[0] == '名詞' and pair[1] in noun_subtypes and m['base'] != '*\n':
				kept.append(m['base'])
		tokenized.append(kept)

	return tokenized

In [12]:
# def my_tokenization(sentences):
# 	sentences_token = []
# 	sentence_token = []
# 	for sentence in sentences:
# 		# sentence[0] = MID
# 		sentence_token.append(sentence[0])
# 
# 		for morph in sentence[1:]:
# 			# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
# 			# Words for Traits
# 			if morph['pos'] == '形容詞' and morph['pos1'] == '自立': # pos = 形容詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '形容動詞語幹': # pos = 名詞, pos1 = 形容動詞語幹
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'ナイ形容詞語幹': # pos = 名詞, pos1 = ナイ形容詞語幹
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Behaviors
# 			elif morph['pos'] == '動詞' and morph['pos1'] == '自立': # pos = 動詞, pos1 = 自立
# 				sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == 'サ変接続': # pos = 名詞, pos1 = サ変接続, 
# 				sentence_token.append(morph['base'])
# 
# 			# Words for Stereotype etc
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '一般': # pos = 名詞, 一般
# 				if morph['base'] == '*\n': # e.g., {'surface': 'キャパオーバー', 'base': '*\n', 'pos': '名詞', 'pos1': '一般'},
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '固有名詞': # pos = 名詞, 固有名詞
# 				if morph['base'] == '*\n': # {'surface': 'k', 'base': '*\n', 'pos': '名詞', 'pos1': '固有名詞'}
# 					sentence_token.append(morph['surface'])
# 				else:
# 					sentence_token.append(morph['base'])
# 			elif morph['pos'] == '名詞' and morph['pos1'] == '代名詞': # pos = 名詞, 代名詞
# 				sentence_token.append(morph['base'])
# 
# 		sentences_token.append(sentence_token)
# 		sentence_token = []
# 
# 	return sentences_token

In [13]:
# SELF: keep the MID and the selected base forms of each parsed record
sentences_token_self = my_tokenization(sentences_self)

In [14]:
sentences_token_self[0]

[2, '愛想', 'ない']

In [15]:
# TARGET: keep the MID and the selected base forms of each parsed record
sentences_token_target = my_tokenization(sentences_target)

In [16]:
sentences_token_target[0]

[2]

# Indices
overlapping magnitude

## An unoriented incidence matrix A
- unique words in rows
- self and target in columns (self in the first column; target in the second column)
- When word w_i, represented in the i-th row, is used to describe the self, the corresponding element a_i1 is 1; otherwise it is 0.

## A self-other overlap matrix
transpose(A) * A, i.e. $A^\top A$

## Overlapping magnitude
$n_{12} / (n_{11} + n_{22} - n_{12})$, where $n_{ij}$ is the $(i, j)$ element of the self-other overlap matrix

In [17]:
# Per-record results, collected column-wise for the DataFrame built in the next cell.
mid_list = []
wc_self = []
wc_target = []
wc_intercept = []
overlap_magnitude_list = []

# zip() stops at the shorter list, so a length mismatch would silently drop records.
assert len(sentences_token_self) == len(sentences_token_target), \
	'self and target token lists differ in length'

for tokens_self, tokens_target in zip(sentences_token_self, sentences_token_target):

	# Index 0 of each token list is the MID; both lists must describe the same record.
	if tokens_self[0] != tokens_target[0]:
		print('The MIDs do not match: the self mid is ' + str(tokens_self[0]) + ', while the target mid is ' + str(tokens_target[0]))
		continue

	# Unique words per condition. These counts are exactly the entries of
	# transpose(A) * A for the binary word-by-condition incidence matrix A.
	words_self = set(tokens_self[1:])
	words_target = set(tokens_target[1:])
	n11 = len(words_self)					# unique words used for the self
	n22 = len(words_target)					# unique words used for the target
	n12 = len(words_self & words_target)	# unique words used for both

	# Overlap magnitude = shared words / all distinct words.
	# Undefined (NaN) when neither description contains a retained word.
	n_union = n11 + n22 - n12
	overlap_magnitude = n12 / n_union if n_union > 0 else np.nan

	mid_list.append(tokens_self[0])
	wc_self.append(n11)
	wc_target.append(n22)
	wc_intercept.append(n12)
	overlap_magnitude_list.append(overlap_magnitude)


In [18]:
# Assemble the per-record results. MID deliberately stays an ordinary column:
# it is the merge key later on and must survive to_csv(index=False).
overlap_magnitude_df = pd.DataFrame({
	'MID': mid_list,
	'wc_self': wc_self,
	'wc_target': wc_target,
	'wc_intercept': wc_intercept,
	'Overlap_score': overlap_magnitude_list})
len(overlap_magnitude_df)

88

In [19]:
overlap_magnitude_df.head()

Unnamed: 0,MID,wc_self,wc_target,wc_intercept,Overlap_score
0,2,2,0,0,0.0
1,3,14,10,1,0.043478
2,5,2,0,0,0.0
3,8,0,2,0,0.0
4,9,4,0,0,0.0


In [20]:
overlap_magnitude_df.tail()

Unnamed: 0,MID,wc_self,wc_target,wc_intercept,Overlap_score
83,174,5,11,0,0.0
84,176,9,9,4,0.285714
85,177,6,1,0,0.0
86,181,0,0,0,
87,182,19,38,6,0.117647


# Transformation
High IOS group (IOS score of 5, 6, or 7) and Low IOS group (IOS score of 1, 2, or 3); a score of 4 is labelled "Neither".

In [21]:
# Load output01_all.csv; its MID and IOS_score columns are used in the cells below.
df = pd.read_csv('./output01_all.csv')
df.head()

Unnamed: 0,MID,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,...,self_activeness_score,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID
0,2,2023/05/19-12:49:10,2023/05/19-12:52:07,177.0,1,40,7,1,5,2,...,2,2,5,5,3,3,3,4,COMP,5
1,3,2023/05/19-12:49:28,2023/05/19-12:55:05,337.0,2,40,23,1,2,2,...,1,2,1,7,2,7,7,2,COMP,2
2,5,2023/05/19-12:54:24,2023/05/19-12:57:31,187.0,1,46,18,1,2,3,...,4,4,3,5,4,4,3,4,COMP,2
3,8,2023/05/19-12:57:23,2023/05/19-13:00:14,171.0,2,39,13,1,7,4,...,7,1,7,5,7,4,7,1,COMP,14
4,9,2023/05/19-12:55:51,2023/05/19-13:01:43,352.0,1,49,13,1,1,4,...,3,5,3,2,2,4,3,5,COMP,8


In [22]:
# Boolean row masks: IOS scores strictly above 4 are "high", strictly below 4 are "low"
HighIOS_indices = df['IOS_score'].gt(4)
LowIOS_indices = df['IOS_score'].lt(4)

In [23]:
# Add an IOS_group column to df: start from the default label, then overwrite by mask
df['IOS_group'] = 'Neither'
df.loc[HighIOS_indices, 'IOS_group'] = 'HighIOS'
df.loc[LowIOS_indices, 'IOS_group'] = 'LowIOS'
df.head()

Unnamed: 0,MID,start,end,rt,self_sex,self_age,residence,participation,IOS_score,target_likedislike,...,self_sociability_score,target_tolerance_score,target_pleasantness_score,target_responsibility_score,target_carefulness_score,target_activeness_score,target_sociability_score,completion,stratumID,IOS_group
0,2,2023/05/19-12:49:10,2023/05/19-12:52:07,177.0,1,40,7,1,5,2,...,2,5,5,3,3,3,4,COMP,5,HighIOS
1,3,2023/05/19-12:49:28,2023/05/19-12:55:05,337.0,2,40,23,1,2,2,...,2,1,7,2,7,7,2,COMP,2,LowIOS
2,5,2023/05/19-12:54:24,2023/05/19-12:57:31,187.0,1,46,18,1,2,3,...,4,3,5,4,4,3,4,COMP,2,LowIOS
3,8,2023/05/19-12:57:23,2023/05/19-13:00:14,171.0,2,39,13,1,7,4,...,1,7,5,7,4,7,1,COMP,14,HighIOS
4,9,2023/05/19-12:55:51,2023/05/19-13:01:43,352.0,1,49,13,1,1,4,...,5,3,2,2,4,3,5,COMP,8,LowIOS


# Output

In [24]:
overlap_magnitude_df.head()

Unnamed: 0,MID,wc_self,wc_target,wc_intercept,Overlap_score
0,2,2,0,0,0.0
1,3,14,10,1,0.043478
2,5,2,0,0,0.0
3,8,0,2,0,0.0
4,9,4,0,0,0.0


In [25]:
df.loc[:, ['MID', 'IOS_group']].head()

Unnamed: 0,MID,IOS_group
0,2,HighIOS
1,3,LowIOS
2,5,LowIOS
3,8,HighIOS
4,9,LowIOS


In [26]:
# Merge overlap_magnitude_df with the MID, IOS_score and IOS_group columns of df (key = MID).
# validate='one_to_one' raises if a MID is duplicated on either side, so rows can
# never be silently multiplied by the join.
out_df = pd.merge(
	overlap_magnitude_df,
	df.loc[:, ['MID', 'IOS_score', 'IOS_group']],
	on='MID',
	how='inner',
	validate='one_to_one',
)

filename = './output02.csv'
out_df.to_csv(filename, index=False)

In [27]:
# For checking: dump the token lists to CSV, one record per row.
# Column 0 holds the MID; it is kept as a regular column so that
# to_csv(index=False) still writes it.
sentences_token_self_df = pd.DataFrame(sentences_token_self).rename(columns={0: 'MID'})
sentences_token_self_df.to_csv('./sentences_token_self_df.csv', index=False)

sentences_token_target_df = pd.DataFrame(sentences_token_target).rename(columns={0: 'MID'})
sentences_token_target_df.to_csv('./sentences_token_target_df.csv', index=False)

In [28]:
# Number of merged rows in each IOS group
out_df.groupby('IOS_group').size()

IOS_group
HighIOS    41
LowIOS     39
Neither     8
dtype: int64