In [95]:
import os
import pandas as pd

In [96]:
# Directory holding one annotator's manual annotation files; the assembled
# training table is written back into the same directory.
data_path = './data/manual_annotate/jwydra_100_105'
output_path = os.path.join(data_path, 'training_data.csv')

# Stanza output for the dev set (read later for sentence text and tokens).
stanza_path = './data/stanza_annotate/dev_annotations.jsonl'

# Manual subject / verb / object annotation files.
subj_path, verb_path, obj_path = (
    os.path.join(data_path, file_name)
    for file_name in (
        'dev_annotations_subj.jsonl',
        'dev_annotations_verb.jsonl',
        'dev_annotations_obj.jsonl',
    )
)

# Manual sentence-level label files: active voice, simple verb, no hidden verb.
apv_path, scv_path, hv_path = (
    os.path.join(data_path, file_name)
    for file_name in (
        'dev_annotations_active.jsonl',
        'dev_annotations_simple_verb.jsonl',
        'dev_annotations_no_hidden_verb.jsonl',
    )
)

In [97]:
# Load the manual subject and object annotations, keeping only the join key
# and the annotation column. Verb annotations (verb_path) are not loaded or
# merged in this cell.
subj_df = pd.read_json(subj_path, lines=True, orient="records")[['sent-index', 'subj_manual']]
obj_df = pd.read_json(obj_path, lines=True, orient="records")[['sent-index', 'obj_manual']]

svo_df = subj_df.merge(obj_df, on='sent-index', how='outer')

# Each annotation is iterated as pairs whose second element is used as a
# position. The distance is |largest object position - smallest subject position|.
# NOTE(review): the outer merge can leave a missing annotation for a sentence
# present in only one file, in which case these lambdas would raise -- confirm
# both files cover the same sentences.
last_obj_pos = svo_df['obj_manual'].map(lambda pairs: max(pos for _, pos in pairs))
first_subj_pos = svo_df['subj_manual'].map(lambda pairs: min(pos for _, pos in pairs))
svo_df['svo_dist'] = (last_obj_pos - first_subj_pos).abs()

# Only the key and the derived distance seed the training table.
svo_df = svo_df[['sent-index', 'svo_dist']]
training_data = svo_df
svo_df.head()

Unnamed: 0,sent-index,svo_dist
0,359_428385-0-4-2,9
1,359_428385-0-4-3,5
2,359_428385-0-5-0,7
3,359_428385-0-6-0,11
4,359_428385-0-6-1,7


In [98]:
# Label encoding: 0 = 'passive', 1 = 'active', -1 = "<DON'T KNOW>"
apv_columns = ['sent-index', 'active_voice_manual']
apv_df = pd.read_json(apv_path, lines=True, orient="records")[apv_columns]

# Attach the voice label to the running training table, keyed on sentence id.
training_data = training_data.merge(apv_df, on='sent-index', how='outer')
apv_df.head()

Unnamed: 0,sent-index,active_voice_manual
0,359_428385-0-4-2,0
1,359_428385-0-4-3,1
2,359_428385-0-5-0,1
3,359_428385-0-6-0,1
4,359_428385-0-6-1,1


In [99]:
# Label encoding: 0 = 'not simplest form of verb', 1 = 'simple verb', -1 = "<DON'T KNOW>"
scv_columns = ['sent-index', 'simple_verb_manual']
scv_df = pd.read_json(scv_path, lines=True, orient="records")[scv_columns]

# Attach the simple-verb label to the running training table, keyed on sentence id.
training_data = training_data.merge(scv_df, on='sent-index', how='outer')
scv_df.head()

Unnamed: 0,sent-index,simple_verb_manual
0,359_428385-0-4-2,0
1,359_428385-0-4-3,1
2,359_428385-0-5-0,1
3,359_428385-0-6-0,1
4,359_428385-0-6-1,1


In [100]:
# Label encoding: 0 = 'hidden verb(s)', 1 = 'no hidden verbs', -1 = "<DON'T KNOW>"
hv_columns = ['sent-index', 'no_hidden_verb_manual']
hv_df = pd.read_json(hv_path, lines=True, orient="records")[hv_columns]

# Attach the hidden-verb label to the running training table, keyed on sentence id.
training_data = training_data.merge(hv_df, on='sent-index', how='outer')
hv_df.head()

Unnamed: 0,sent-index,no_hidden_verb_manual
0,359_428385-0-4-2,1
1,359_428385-0-4-3,1
2,359_428385-0-5-0,1
3,359_428385-0-6-0,1
4,359_428385-0-6-1,1


In [101]:
stanza_df = pd.read_json(stanza_path, lines=True, orient="records")

# Keep only the sentences that already appear in the training table.
# A single .loc selection followed by .copy() yields an independent frame, so
# adding a column below does not operate on a slice of stanza_df (the chained
# `df[mask][[cols]]` form followed by assignment is what triggers pandas'
# SettingWithCopyWarning).
is_annotated = stanza_df['sent-index'].isin(training_data['sent-index'])
sentences_df = stanza_df.loc[is_annotated, ['sent-index', 'sent', 'sentence_dict']].copy()

# Sentence length is the number of entries in the sentence's 'tokens' list.
sentences_df['sentence_len'] = sentences_df['sentence_dict'].apply(lambda x: len(x['tokens']))
# The token structure is only needed for the length; drop it without mutating in place.
sentences_df = sentences_df.drop(columns='sentence_dict')

# Sentence columns go on the left so they precede the label columns.
training_data = pd.merge(sentences_df, training_data, on='sent-index', how='outer')
sentences_df.head()

Unnamed: 0,sent-index,sent,sentence_len
100,359_428385-0-4-2,It has been announced that it will be picking ...,35
101,359_428385-0-4-3,"This year, the only new show is ""Mob City"", a ...",23
102,359_428385-0-5-0,AMCTV.com is an website provides special featu...,22
103,359_428385-0-6-0,This online feature allows viewers to see what...,25
104,359_428385-0-6-1,"Another the schedule allows users to do, is kn...",17


In [102]:
# Shorten the manual-label column names, add the length-normalised
# subject/object distance, and drop the helper length column -- as one chain
# that produces a new frame instead of mutating in place.
short_label_names = {'active_voice_manual': 'apv',
                     'simple_verb_manual': 'scv',
                     'no_hidden_verb_manual': 'hv'}

training_data = (
    training_data
    .rename(columns=short_label_names)
    .assign(svo_dist_norm=lambda frame: frame['svo_dist'] / frame['sentence_len'])
    .drop(columns='sentence_len')
)

# Persist the assembled table (the index is written as the first CSV column).
training_data.to_csv(output_path)
training_data.head()

Unnamed: 0,sent-index,sent,svo_dist,apv,scv,hv,svo_dist_norm
0,359_428385-0-4-2,It has been announced that it will be picking ...,9,0,0,1,0.257143
1,359_428385-0-4-3,"This year, the only new show is ""Mob City"", a ...",5,1,1,1,0.217391
2,359_428385-0-5-0,AMCTV.com is an website provides special featu...,7,1,1,1,0.318182
3,359_428385-0-6-0,This online feature allows viewers to see what...,11,1,1,1,0.44
4,359_428385-0-6-1,"Another the schedule allows users to do, is kn...",7,1,1,1,0.411765
