Author: Omar El Malki (omar.elmalki@epfl.ch)

### ConceptNet node extraction from the first 10 ROCStories

In [None]:
import pandas as pd
import os
from tqdm import tqdm
import ast

# Register tqdm's `progress_apply` on pandas objects (used further down in this notebook).
tqdm.pandas()

# Lift pandas' display limits so that notebook previews are not truncated.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Change the working directory; the relative "../generated/..." paths in the following cells are resolved from here.
os.chdir('../../src/')

In [None]:
# Read ROCStories into pandas DataFrame (tab-separated file, first line holds the column names)
# NOTE(review): this path says "concepnet-node-extraction" while every other path in this
# notebook says "conceptnet-node-extraction" -- confirm which directory actually exists.
roc_stories_path_csv = "../generated/concepnet-node-extraction/ROCStories_resolved_with_entities_first10.csv"
roc_stories_df = pd.read_csv(roc_stories_path_csv, sep='\t', header=0)

In [None]:
# Preview the first 10 rows of the loaded stories.
roc_stories_df.head(10)

#### Turn entity lists into strings before applying CoCo-Ex

In [None]:
def entities_to_string(entities):
    """
    Transform the string representation of a list of entities into a single string.

    Every entity, including the last one, is followed by ", ", so
    "['a', 'b']" becomes "a, b, " and "[]" becomes "".

    :param entities: Python literal (as a string) of a list of entity strings,
        e.g. "['Bob', 'the dog']"
    :return: str
    :raises ValueError: if ``entities`` is not a valid Python literal
        (``ast.literal_eval`` may also raise SyntaxError for malformed input)
    """
    entity_list = ast.literal_eval(entities)
    # Build the result in a single pass instead of repeated `+=` concatenation.
    return "".join(entity + ", " for entity in entity_list)

In [None]:
# Keep only the story id and the five `srl_entities*` columns.
# Take an explicit copy: new columns are added to this frame later on, and they should be
# written to an independent DataFrame rather than to a selection of `roc_stories_df`.
roc_stories_entities_df = roc_stories_df[
    ['storyid', 'srl_entities1', 'srl_entities2', 'srl_entities3', 'srl_entities4', 'srl_entities5']].copy()

In [None]:
# Preview the first 10 rows of the entity columns.
roc_stories_entities_df.head(10)

In [None]:
# Add one `entities_str_t<n>` column per `srl_entities<n>` column (n = 1..5),
# holding that column's entity list rendered as a single string.
for n in range(1, 6):
    source_column = f'srl_entities{n}'
    target_column = f'entities_str_t{n}'
    roc_stories_entities_df[target_column] = \
        roc_stories_entities_df[source_column].progress_apply(entities_to_string)

In [None]:
# Keep the story id together with the five entity-string columns.
entity_string_columns = ['storyid'] + [f'entities_str_t{i}' for i in range(1, 6)]
roc_stories_entities_string_df = roc_stories_entities_df[entity_string_columns]

In [None]:
# Write the entity strings as a tab-separated file without header or index.
# The file name carries the `_first10` suffix so that it is the same file the
# CoCo-Ex entity extraction step of this notebook reads as its input.
roc_stories_entities_string_df.to_csv(
    '../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_input_first10.csv',
    sep='\t', header=False, index=False)

In [None]:
# Preview the first 10 rows of the entity strings.
roc_stories_entities_string_df.head(10)

In [None]:
# Switch into the CoCo-Ex directory; the script names and "../../generated/..." paths
# in the following cells are relative to it.
os.chdir('../lib/CoCo-Ex/')

In [None]:
%%time
# Run the CoCo-Ex entity extraction script: it is given the input file and the output file as arguments.
entity_extraction_script_name = 'CoCo-Ex_entity_extraction.py'
input_csv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_input_first10.csv"
output_tsv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_output_first10.tsv"
extraction_exit_status = os.system(f'python3 {entity_extraction_script_name} {input_csv} {output_tsv}')
# os.system reports failure only through its return value; stop here so that later
# cells do not silently read a missing or stale output file.
if extraction_exit_status != 0:
    raise RuntimeError(
        f'{entity_extraction_script_name} failed with exit status {extraction_exit_status}')

In [None]:
# Load the entity extraction output for inspection.
# NOTE(review): header=0 treats the first line as column names, whereas the filtered
# output is read with header=None further down -- confirm this file really has a header row.
ezdf = pd.read_csv(output_tsv, sep='\t', header=0)

In [None]:
# Preview the first 10 rows of the extraction output.
ezdf.head(10)

In [None]:
# Output path, script name and option values for the CoCo-Ex overhead filter step.
output_filtered_tsv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_conceptnet_nodes_filtered_first10.tsv"
overhead_filter_script_name = 'CoCo-Ex_overhead_filter.py'
len_diff_tokenlevel = 1  # passed as --len_diff_tokenlevel
len_diff_charlevel = 10  # passed as --len_diff_charlevel
dice_coefficient = 0.8  # passed as --dice_coefficient

In [None]:
# Run the CoCo-Ex overhead filter on the extraction output, using the configured options.
filter_exit_status = os.system(f'python3 {overhead_filter_script_name} '
                               f'--inputfile {output_tsv} '
                               f'--outputfile {output_filtered_tsv} '
                               f'--len_diff_tokenlevel {len_diff_tokenlevel} '
                               f'--len_diff_charlevel {len_diff_charlevel} '
                               f'--dice_coefficient {dice_coefficient}')
# os.system reports failure only through its return value; stop here so that later
# cells do not silently read a missing or stale filtered file.
if filter_exit_status != 0:
    raise RuntimeError(
        f'{overhead_filter_script_name} failed with exit status {filter_exit_status}')

In [None]:
# Load the filtered output (tab-separated, no header row) and name its four columns.
extracted_nodes_columns = ['storyid', 'sentence_index', 'sentence', 'nodes']
roc_stories_extracted_nodes_df = pd.read_csv(
    output_filtered_tsv,
    sep='\t',
    header=None,
    names=extracted_nodes_columns,
)

In [None]:
# Display the extracted nodes.
roc_stories_extracted_nodes_df

In [None]:
# Reshape to one row per story, with one column of extracted nodes per sentence index,
# then turn the `storyid` index back into a regular column.
nodes_per_sentence_df = roc_stories_extracted_nodes_df.pivot(
    index='storyid', columns='sentence_index', values='nodes')
roc_stories_pivoted_df = nodes_per_sentence_df.reset_index()

In [None]:
# Remove the name that the pivot leaves on the columns axis.
roc_stories_pivoted_df.columns.name = None

In [None]:
# Rename the columns: the story id followed by one `cn_nodes<n>` column per pivoted sentence index.
# NOTE(review): assumes the pivot produced exactly five sentence-index columns, in order --
# any other column count makes this assignment fail.
roc_stories_pivoted_df.columns = ['storyid', 'cn_nodes1', 'cn_nodes2', 'cn_nodes3', 'cn_nodes4', 'cn_nodes5']

In [None]:
# Display the pivoted result.
roc_stories_pivoted_df