# Setup

In [17]:
import pandas as pd

# Tokenization
We will focus on specific kinds of part-of-speech (POS), i.e.,
- adjectives
- verbs and nouns that form verbs when "suru" is added as a suffix
- nouns 

## POS tagging

In [18]:
# Run MeCab on the self and target text files; -o writes each analysis to the
# matching .mecab file, which is the input my_morphol() parses later on.
!mecab -o ./Results/output01_self.txt.mecab ./Results/output01_self.txt
!mecab -o ./Results/output01_target.txt.mecab ./Results/output01_target.txt
# !mecab -o ./test.txt.mecab ./test.txt

## Morphological Analysis

In [19]:
def my_morphol(filename, encoding=None):
	"""Parse a MeCab output file into sentences of morpheme dictionaries.

	Every token line is expected to have the form ``surface<TAB>f0,f1,...``.
	The comma-separated features at positions 0, 1 and 6 are stored as
	``pos``, ``pos1`` and ``base``. A line consisting of ``EOS`` closes the
	current sentence, so a sentence without usable tokens yields an empty list.

	Args:
		filename: Path of the MeCab output file.
		encoding: Text encoding passed to ``open``. ``None`` (the default)
			keeps the platform default; pass e.g. ``'utf-8'`` when the MeCab
			dictionary charset differs from the platform default.

	Returns:
		A list with one entry per sentence. Each entry is a list of dicts with
		the keys ``'surface'``, ``'base'``, ``'pos'`` and ``'pos1'``.
	"""
	sentences = []
	sentence = []
	with open(filename, mode='r', encoding=encoding) as f:
		for raw_line in f:
			# Strip the line terminator up front so that a final "EOS" without
			# a trailing newline is still recognised and a base form in the
			# last feature position never carries a stray "\n".
			line = raw_line.rstrip('\r\n')
			if line == 'EOS':
				sentences.append(sentence)
				sentence = []
				continue

			fields = line.split('\t')
			if len(fields) != 2 or fields[0] == '':
				continue

			attr = fields[1].split(',')
			# Skip tokens without a usable base form: either the feature list
			# is too short to contain one, or MeCab reported it as '*'.
			if len(attr) < 7 or attr[6] == '*':
				continue

			sentence.append({
				'surface': fields[0],
				'base': attr[6],
				'pos': attr[0],
				'pos1': attr[1],
			})

	# Keep tokens of a last sentence that was not terminated by "EOS".
	if sentence:
		sentences.append(sentence)

	return sentences

In [20]:
# filename = './test.txt.mecab'
# sentences = my_morphol(filename)
# print(sentences)

In [21]:
# SELF: parse the MeCab output written for output01_self.txt into one list of
# morpheme dicts per sentence (see my_morphol).
filename = './Results/output01_self.txt.mecab'
sentences_self = my_morphol(filename)
# for sentence in sentences_self:
# 	print(sentence)

In [22]:
# TARGET: parse the MeCab output written for output01_target.txt into one list
# of morpheme dicts per sentence (see my_morphol).
filename = './Results/output01_target.txt.mecab'
sentences_target = my_morphol(filename)
# for sentence in sentences_target:
# 	print(sentence)

## Tokenization

In [23]:
def my_tokenization(sentences):
	"""Reduce each sentence to the base forms of the morphemes of interest.

	Args:
		sentences: List of sentences, each a list of morpheme dicts providing
			the keys 'pos', 'pos1' and 'base'.

	Returns:
		A list parallel to ``sentences``; each entry lists, in order, the
		'base' value of every morpheme whose ('pos', 'pos1') pair is selected
		below. Sentences without a selected morpheme yield an empty list.
	"""
	# POS according to https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/NAIST-JDIC_manual.pdf
	kept_pos1_by_pos = {
		# Words for Traits
		'形容詞': ('自立',),
		# Words for Behaviors
		'動詞': ('自立',),
		'名詞': (
			'形容動詞語幹',    # Words for Traits
			'ナイ形容詞語幹',  # Words for Traits
			'サ変接続',        # Words for Behaviors
			'一般',            # Words for Stereotype etc
			'固有名詞',        # Words for Stereotype etc
			'代名詞',          # Words for Stereotype etc
		),
	}

	tokenized = []
	for morphs in sentences:
		bases = []
		for m in morphs:
			# Look at pos1 only once pos itself is one of the selected classes.
			allowed_pos1 = kept_pos1_by_pos.get(m['pos'])
			if allowed_pos1 is not None and m['pos1'] in allowed_pos1:
				bases.append(m['base'])
		tokenized.append(bases)

	return tokenized

In [24]:
# sentences_token = my_tokenization(sentences)
# print(sentences_token)

In [25]:
# SELF: keep only the base forms of the selected parts of speech, one token
# list per sentence (see my_tokenization).
sentences_token_self = my_tokenization(sentences_self)
# for sentence in sentences_token_self:
# 	print(sentence)

In [26]:
# TARGET: keep only the base forms of the selected parts of speech, one token
# list per sentence (see my_tokenization).
sentences_token_target = my_tokenization(sentences_target)
# for sentence in sentences_token_target:
# 	print(sentence)

# Indices
overlapping magnitude

## An unoriented incidence matrix A
- unique words in rows
- self and target in columns (self in the first column; target in the second column)
- When the word w_i, represented in the i-th row, is used to describe the self, the corresponding element a_i1 is 1; otherwise it is 0. Likewise, a_i2 is 1 when w_i is used to describe the target, and 0 otherwise.

## A self-other overlap matrix
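transpose(A) * A, a 2 x 2 matrix whose element n_jk is the number of unique words that appear in both column j and column k of A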

## Overlapping magnitude
n_12 / (n_11 + n_22 - n_12)

In [27]:
def _overlap_magnitude(tokens_self, tokens_target):
	"""Compute the overlap magnitude between two token lists.

	Builds a word x {self, target} unoriented incidence matrix A, forms the
	co-membership matrix transpose(A) * A and returns
	n12 / (n11 + n22 - n12), i.e. the number of distinct words used in both
	lists divided by the number of distinct words used in either list.

	Args:
		tokens_self: List of tokens for the self.
		tokens_target: List of tokens for the target.

	Returns:
		The overlap magnitude as a number, or the string 'NA' when either
		list is empty.
	"""
	if len(tokens_self) == 0 or len(tokens_target) == 0:
		return 'NA'

	# Long table: one row per token occurrence, labelled by its condition.
	token_table = pd.DataFrame(data={
		'word': tokens_self + tokens_target,
		'cond': ['self'] * len(tokens_self) + ['target'] * len(tokens_target),
	})
	word_counts = pd.crosstab(token_table['word'], token_table['cond'])

	# Unoriented incidence matrix: 1 if the word occurs at all, otherwise 0.
	incidence_matrix = (word_counts > 0) * 1  # convert (True, False) to (1, 0)

	# Co-membership matrix
	comembership_matrix = incidence_matrix.T.dot(incidence_matrix)

	n12 = comembership_matrix.loc['self', 'target']
	n11 = comembership_matrix.loc['self', 'self']
	n22 = comembership_matrix.loc['target', 'target']
	return n12 / (n11 + n22 - n12)


# One overlap magnitude per (self, target) sentence pair. All temporaries live
# inside the helper, so this cell no longer rebinds the notebook-level `df`.
overlap_magnitude_list = [
	_overlap_magnitude(tokens_self, tokens_target)
	for tokens_self, tokens_target in zip(sentences_token_self, sentences_token_target)
]


In [28]:
# overlap_magnitude_list

# Transformation
High IOS group (IOS scores 5, 6, and 7) and Low IOS group (IOS scores 1, 2, and 3); a score of 4 belongs to neither group.

In [29]:
# Load the results table; the cells below read its 'MID' and 'IOS_score' columns.
# NOTE(review): rows are assumed to be in the same order as the sentences in the
# self/target files, because the overlap scores are attached by position later
# on — confirm.
df = pd.read_csv('./Results/output01_all.csv')
# df.head()

In [30]:
# Boolean row masks: IOS scores above 4 form the high group, scores below 4 the
# low group; rows with a score of exactly 4 are in neither mask.
HighIOS_indices = df['IOS_score'].gt(4)
LowIOS_indices = df['IOS_score'].lt(4)

# Output

In [31]:
# out_df = pd.DataFrame({
# 	'SubID': ['NA','NA','NA','NA','NA','NA','NA','NA','NA','NA',
# 		'NA','NA','NA','NA','NA','NA','NA','NA','NA','NA'],
# 	'IOSScore': [1, 2, 3, 4, 5, 6, 7, 1, 2, 3,
# 			4, 5, 6, 7, 1, 2, 3, 4, 5, 6],
# 	'IOSGroup': ['H','H','H','H','H','H','H','H','H','H', 
# 			'L','L','L','L','L','L','L','L','L','L'],
# 	'OverlapScore': [71, 72, 72, 75, 78, 81, 82, 83, 89, 91,
# 		81, 81, 84, 88, 88, 89, 90, 90, 90, 91]})

In [32]:
# Assemble the output table: ID and IOS score from the results table, the
# overlap score per row, and a group label that defaults to 'Neither'.
out_df = df[['MID', 'IOS_score']].assign(
	Overlap_score=overlap_magnitude_list,
	IOS_group='Neither',
)
# Overwrite the default label for rows selected by the high / low masks.
out_df.loc[HighIOS_indices, 'IOS_group'] = 'HighIOS'
out_df.loc[LowIOS_indices, 'IOS_group'] = 'LowIOS'

filename = './Results/output02.csv'
out_df.to_csv(filename, index=False)