Author: Omar El Malki (omar.elmalki@epfl.ch)

### ConceptNet node extraction from the first 10 ROCStories

In [1]:
import ast
import os
import subprocess

import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration; the `progress_apply` call used further
# down in this notebook depends on it.
tqdm.pandas()

# Lift pandas' display limits so rendered DataFrames show full cell contents,
# every row and every column.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Turn off the chained-assignment (SettingWithCopy) check for this notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Make src/ the working directory; the relative paths in the following cells
# are resolved from there.
# NOTE(review): the path is relative to the current working directory, so
# re-running this cell without restarting the kernel moves somewhere else.
os.chdir('../../src/')

In [3]:
# Load the ROCStories sample (tab-separated, first line is the header) into a DataFrame.
roc_stories_path_csv = "../generated/semantic-role-labeling/ROCStories_resolved_with_entities_first10.csv"
roc_stories_df = pd.read_csv(
    roc_stories_path_csv,
    header=0,
    sep='\t',
)

#### Turn entity lists into strings before applying CoCo-Ex

In [4]:
def entities_to_string(entities):
    """
    Transform a list of entities into a single string.

    Every entity is followed by ", " (the last one included), so a non-empty
    list yields a string with a trailing separator and an empty list yields "".

    :param entities: text of a Python list literal of entity strings
        (e.g. "['Tom', 'a guest']"), or an already-parsed list/tuple of strings
    :return: str
    """
    if isinstance(entities, (list, tuple)):
        # Already parsed; nothing to evaluate.
        parsed_entities = entities
    else:
        # The lists arrive as text (read from CSV); literal_eval parses the
        # literal without executing arbitrary code.
        parsed_entities = ast.literal_eval(entities)
    return "".join(entity + ", " for entity in parsed_entities)

In [5]:
# Keep only the story id and the five per-sentence SRL entity columns.
# The explicit .copy() makes this an independent frame, so the columns added to
# it later are not written into a slice of roc_stories_df.
roc_stories_entities_df = roc_stories_df[
    ['storyid', 'srl_entities1', 'srl_entities2', 'srl_entities3', 'srl_entities4', 'srl_entities5']
].copy()

In [6]:
# Display the story ids together with their raw per-sentence entity lists.
roc_stories_entities_df

Unnamed: 0,storyid,srl_entities1,srl_entities2,srl_entities3,srl_entities4,srl_entities5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,"['David', 'David had put on a lot of weight recently', 'David', 'a lot of weight']","['David', 'David habits', 'David', 'David', 'the reason']","['David', ""David'd been eating too much fast food lately"", ""David'd"", 'too much fast food']","['David', 'going to burger places', 'David', 'to burger places', 'David', 'a vegetarian diet']","['to feel much better', 'David', 'much better']"
1,0beabab2-fb49-460e-a6e6-f35a202e3348,"['Tom', 'a very short temper']","['a guest', 'Tom very angry']","['Tom', 'a hole in the wall of Tom house']","[""Tom 's guest"", 'afraid', ""Tom 's guest""]","['Tom', 'on Tom couch', 'Tom', 'with regret about Tom actions']"
2,87da1a22-df0b-410c-b186-439700b70ba6,"['Marcus', 'clothing']","['All of Marcus clothes', 'either too formal or too casual']","['Marcus', 'to buy a pair of khakis', 'Marcus', 'a pair of khakis']",[],"['Marcus', 'happy to have the right clothes for the event', 'Marcus', 'the right clothes for the event']"
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,"['Bobby', 'Bill should buy a trailer and haul a trailer with Bill car', 'Bill', 'buy a trailer', 'Bill', 'a trailer', 'Bill', 'a trailer']","['Bill', 'a truck would be better for what Bill needed', 'a truck', 'better for what Bill needed', 'what', 'Bill']","['Bobby', 'two vehicles were much more expensive', 'two vehicles', 'much more expensive']","['Bill', 'in Bill ways']","['Bill', 'buying a truck', 'Bill', 'a truck']"
4,c71bb23b-7731-4233-8298-76ba6886cee1,"['John', 'a pastor with a very bad memory']","['John', 'to memorize John sermons many days in advance', 'John', 'John sermons', 'John']","['John', 'to learn to sing to overcome John handicap', 'John', 'to sing', 'John', 'John', 'John handicap']","['John', 'all John sermons', 'into music', 'John', 'John', 'all his sermons']","['John congregation', 'delighted', 'so', 'John']"
5,4d7b022e-25d2-4300-a9b0-24ab35f4045b,"[""Melody 's parents"", 'Melody', 'with a trip to the big aquarium']","['Melody', 'a nap']","[""Melody 's parents"", 'Melody', 'energetic and excited']","['Melody', 'sharks , tropical fish and many others']","['Melody and Melody family', 'home']"
6,8036c905-f23e-4976-83a1-85d679b5e0c2,"['The math teacher', 'a pop quiz', 'class']","['some students', 'The math teacher', 'passing out a pop quiz', 'The math teacher', 'a pop quiz']","['I', 'my pencil', 'I', 'to work', 'I']",['I'],"['I', 'I', 'confident', 'I', 'my pencil']"
7,77338898-07d4-4143-8451-284540c8b082,"['My first girlfriend', 'i']",['My first girlfriend'],"['we', 'each other']","['My first girlfriend', 'with me']","['we', ""we could n't be apart so My first girlfriend moved in with me"", 'we', 'apart', 'My first girlfriend']"
8,110fafd1-2bb7-4ffe-aac7-475706165d41,"['I', 'Charlie Horse', 'I', 'four years old']","['Charlie Horse', 'a brown stuffed horse', 'I']","['Charlie Horse', 'my best friend', 'Charlie Horse', 'at the head of my bed']","['I', 'Charlie Horse', 'next to me', 'I', 'Charlie Horse', 'soft fur']","['I', 'to listen to my radio as I fell asleep cuddling Charlie Horse', 'I', 'to my radio', 'I', 'asleep', 'I', 'Charlie Horse']"
9,13573c2e-5eed-40eb-bbe5-ed259b5c76a6,"['Laura', 'corn']","['Laura', 'to grow some in Laura backyard', 'Laura', 'some']","['them', 'The whole process of growing them', 'Laura very excited']","['Laura', 'that them required too much water', 'them', 'too much water']","['Laura', 'Laura corn garden idea']"


In [7]:
# For each of the five sentence positions, derive an 'entities_str_t<n>' column
# holding the flattened string form of the matching 'srl_entities<n>' column.
for n in range(1, 6):
    roc_stories_entities_df[f'entities_str_t{n}'] = (
        roc_stories_entities_df[f'srl_entities{n}'].progress_apply(entities_to_string)
    )

100%|██████████| 10/10 [00:00<00:00, 13277.32it/s]
100%|██████████| 10/10 [00:00<00:00, 12314.46it/s]
100%|██████████| 10/10 [00:00<00:00, 14433.26it/s]
100%|██████████| 10/10 [00:00<00:00, 15380.65it/s]
100%|██████████| 10/10 [00:00<00:00, 11161.00it/s]


In [8]:
# Keep the story id plus the five flattened entity-string columns, in sentence order.
roc_stories_entities_string_df = roc_stories_entities_df[
    ['storyid'] + [f'entities_str_t{position}' for position in range(1, 6)]
]

In [9]:
# Export the entity strings as a tab-separated file without header or index;
# the same file is passed to the CoCo-Ex entity extraction script further down.
entity_extraction_input_path = (
    '../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_input_first10.csv')
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs(os.path.dirname(entity_extraction_input_path), exist_ok=True)
# header=False is the documented way to omit the header row (None only worked
# because it is falsy).
roc_stories_entities_string_df.to_csv(entity_extraction_input_path, sep='\t', header=False, index=False)

In [10]:
# Display the flattened entity strings that were written to the extraction input file.
roc_stories_entities_string_df

Unnamed: 0,storyid,entities_str_t1,entities_str_t2,entities_str_t3,entities_str_t4,entities_str_t5
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,"David, David had put on a lot of weight recently, David, a lot of weight,","David, David habits, David, David, the reason,","David, David'd been eating too much fast food lately, David'd, too much fast food,","David, going to burger places, David, to burger places, David, a vegetarian diet,","to feel much better, David, much better,"
1,0beabab2-fb49-460e-a6e6-f35a202e3348,"Tom, a very short temper,","a guest, Tom very angry,","Tom, a hole in the wall of Tom house,","Tom 's guest, afraid, Tom 's guest,","Tom, on Tom couch, Tom, with regret about Tom actions,"
2,87da1a22-df0b-410c-b186-439700b70ba6,"Marcus, clothing,","All of Marcus clothes, either too formal or too casual,","Marcus, to buy a pair of khakis, Marcus, a pair of khakis,",,"Marcus, happy to have the right clothes for the event, Marcus, the right clothes for the event,"
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,"Bobby, Bill should buy a trailer and haul a trailer with Bill car, Bill, buy a trailer, Bill, a trailer, Bill, a trailer,","Bill, a truck would be better for what Bill needed, a truck, better for what Bill needed, what, Bill,","Bobby, two vehicles were much more expensive, two vehicles, much more expensive,","Bill, in Bill ways,","Bill, buying a truck, Bill, a truck,"
4,c71bb23b-7731-4233-8298-76ba6886cee1,"John, a pastor with a very bad memory,","John, to memorize John sermons many days in advance, John, John sermons, John,","John, to learn to sing to overcome John handicap, John, to sing, John, John, John handicap,","John, all John sermons, into music, John, John, all his sermons,","John congregation, delighted, so, John,"
5,4d7b022e-25d2-4300-a9b0-24ab35f4045b,"Melody 's parents, Melody, with a trip to the big aquarium,","Melody, a nap,","Melody 's parents, Melody, energetic and excited,","Melody, sharks , tropical fish and many others,","Melody and Melody family, home,"
6,8036c905-f23e-4976-83a1-85d679b5e0c2,"The math teacher, a pop quiz, class,","some students, The math teacher, passing out a pop quiz, The math teacher, a pop quiz,","I, my pencil, I, to work, I,","I,","I, I, confident, I, my pencil,"
7,77338898-07d4-4143-8451-284540c8b082,"My first girlfriend, i,","My first girlfriend,","we, each other,","My first girlfriend, with me,","we, we could n't be apart so My first girlfriend moved in with me, we, apart, My first girlfriend,"
8,110fafd1-2bb7-4ffe-aac7-475706165d41,"I, Charlie Horse, I, four years old,","Charlie Horse, a brown stuffed horse, I,","Charlie Horse, my best friend, Charlie Horse, at the head of my bed,","I, Charlie Horse, next to me, I, Charlie Horse, soft fur,","I, to listen to my radio as I fell asleep cuddling Charlie Horse, I, to my radio, I, asleep, I, Charlie Horse,"
9,13573c2e-5eed-40eb-bbe5-ed259b5c76a6,"Laura, corn,","Laura, to grow some in Laura backyard, Laura, some,","them, The whole process of growing them, Laura very excited,","Laura, that them required too much water, them, too much water,","Laura, Laura corn garden idea,"


In [11]:
# Switch into the CoCo-Ex directory so its scripts can be invoked by file name;
# the '../../generated/...' paths in the following cells are relative to it.
# NOTE(review): relative chdir — re-running this cell without restarting the
# kernel changes the working directory again.
os.chdir('../lib/CoCo-Ex/')

In [12]:
# Run the CoCo-Ex entity extraction script on the exported entity strings.
entity_extraction_script_name = 'CoCo-Ex_entity_extraction.py'
input_csv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_input_first10.csv"
output_tsv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_entity_extraction_output_first10.tsv"
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# extraction stops the notebook instead of letting later cells read a stale or
# missing output file. The exit status remains the value displayed by the cell.
subprocess.run(
    ['python3', entity_extraction_script_name, input_csv, output_tsv],
    check=True,
).returncode

0

In [13]:
# Load the raw CoCo-Ex extraction output (tab-separated, header on the first line).
ezdf = pd.read_csv(
    output_tsv,
    header=0,
    sep='\t',
)

In [14]:
# Preview the first ten candidate phrase/node rows of the extraction output.
ezdf.head(10)

Unnamed: 0,###SENT-ID,SENT,PHRASE,PHRASE-TYPE,NODE,NODE-LEMMATIZED,EXACT-MATCH,EXACT-MATCH-LEMMAS,EXACT-MATCH-NOSTOPS,LEN-DIFF-TOKEN,LEN-DIFF-CHAR,DICE,DICE-LEMMAS,DICE-NOSTOPS,JACCARD,JACCARD-LEMMAS,JACCARD-NOSTOPS,WMD,WMD-LEMMAS,WMD-NOSTOPS,MED,MED-LEMMAS,MED-NOSTOPS,COS,COS-LEMMAS,COS-NOSTOPS
0,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot of weight,NP,lots weight,,False,False,True,2,4,0.333333,0.666667,1.0,,,,,,,,,,,,
1,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot of weight,NP,lift lot of weight,lift lot of weight,False,False,False,0,3,0.75,0.75,0.8,,,,,,,,,,,,
2,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot of weight,NP,pull lot of weight,pull lot of weight,False,False,False,0,3,0.75,0.75,0.8,,,,,,,,,,,,
3,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,divide into lots,,False,False,False,1,11,0.0,0.4,0.666667,,,,,,,,,,,,
4,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,lot of books,,False,False,False,1,7,0.4,0.4,0.666667,,,,,,,,,,,,
5,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,lot of plants,,False,False,False,1,8,0.4,0.4,0.666667,,,,,,,,,,,,
6,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,lot smaller than,,False,False,False,1,11,0.4,0.4,0.666667,,,,,,,,,,,,
7,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,hold lot of material,hold lot of material,False,False,False,2,15,0.333333,0.333333,0.5,,,,,,,,,,,,
8,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,lose lot of money,lose lot of money,False,False,False,2,12,0.333333,0.333333,0.5,,,,,,,,,,,,
9,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd_sentence1,"david, david had put on a lot of weight recently, david, a lot of weight,",a lot,NP,fishing lot rights,,False,False,False,1,13,0.4,0.4,0.5,,,,,,,,,,,,


In [15]:
# Settings for the CoCo-Ex overhead filter step: output path, script name and
# the three threshold values handed to the script's command-line options of the
# same names.
# NOTE(review): whether each threshold is an upper or lower bound is decided
# inside CoCo-Ex_overhead_filter.py — confirm there before tuning.
output_filtered_tsv = "../../generated/conceptnet-node-extraction/ROCStories_resolved_entities_conceptnet_nodes_filtered_first10.tsv"
overhead_filter_script_name = 'CoCo-Ex_overhead_filter.py'
len_diff_tokenlevel = 1
len_diff_charlevel = 10
dice_coefficient = 0.8

In [16]:
# Run the CoCo-Ex overhead filter on the extraction output.
# The arguments are passed as a list (no shell parsing), and check=True raises
# CalledProcessError on a non-zero exit status so a failed filter run stops the
# notebook instead of letting the next cell read a stale or missing file.
# The exit status remains the value displayed by the cell.
subprocess.run(
    [
        'python3', overhead_filter_script_name,
        '--inputfile', output_tsv,
        '--outputfile', output_filtered_tsv,
        '--len_diff_tokenlevel', str(len_diff_tokenlevel),
        '--len_diff_charlevel', str(len_diff_charlevel),
        '--dice_coefficient', str(dice_coefficient),
    ],
    check=True,
).returncode

0

In [17]:
# Load the filtered nodes; the file has no header row, so the four column
# names are supplied here.
roc_stories_extracted_nodes_df = pd.read_csv(
    output_filtered_tsv,
    names=['storyid', 'sentence_index', 'sentence', 'nodes'],
    header=None,
    sep='\t',
)

In [18]:
# Display the filtered ConceptNet nodes per (story, sentence).
# NOTE(review): in the rendered output below, story ids and sentences look
# shifted against the input — e.g. the row labelled 110fafd1.../sentence5 holds
# "laura, corn,", which is sentence 1 of story 13573c2e... in the exported
# entity strings. Possibly triggered by the empty entity field of story
# 87da1a22... (sentence 4); verify the CoCo-Ex output alignment before relying
# on these nodes.
roc_stories_extracted_nodes_df

Unnamed: 0,storyid,sentence_index,sentence,nodes
0,0beabab2-fb49-460e-a6e6-f35a202e3348,sentence1,"tom, a very short temper,",[temper][tom]
1,0beabab2-fb49-460e-a6e6-f35a202e3348,sentence2,"a guest, tom very angry,",[guest]
2,0beabab2-fb49-460e-a6e6-f35a202e3348,sentence3,"tom, a hole in the wall of tom house,",[hole][house][tom][wall]
3,0beabab2-fb49-460e-a6e6-f35a202e3348,sentence4,"tom 's guest, afraid, tom 's guest,",[guest][tom]
4,0beabab2-fb49-460e-a6e6-f35a202e3348,sentence5,"tom, on tom couch, tom, with regret about tom actions,",[couch][regret][action][tom]
5,110fafd1-2bb7-4ffe-aac7-475706165d41,sentence1,"charlie horse, a brown stuffed horse, i,",[charlie horse][horse]
6,110fafd1-2bb7-4ffe-aac7-475706165d41,sentence2,"charlie horse, my best friend, charlie horse, at the head of my bed,",[friend][charlie horse][bed][head][horse]
7,110fafd1-2bb7-4ffe-aac7-475706165d41,sentence3,"i, charlie horse, next to me, i, charlie horse, soft fur,",[horse][fur][charlie horse][soft fur]
8,110fafd1-2bb7-4ffe-aac7-475706165d41,sentence4,"i, to listen to my radio as i fell asleep cuddling charlie horse, i, to my radio, i, asleep, i, charlie horse,",[horse][charlie horse][radio]
9,110fafd1-2bb7-4ffe-aac7-475706165d41,sentence5,"laura, corn,",[laura][corn]


In [19]:
# Reshape from one row per (story, sentence) to one row per story, with one
# column of nodes per sentence index; storyid becomes a regular column again.
roc_stories_pivoted_df = (
    roc_stories_extracted_nodes_df
    .pivot(index='storyid', columns='sentence_index', values='nodes')
    .reset_index()
)

In [20]:
# Drop the name of the column index (left over from pivoting on
# 'sentence_index') so it does not appear as a label above the row index.
roc_stories_pivoted_df.columns.name = None

In [24]:
# Relabel the columns positionally: the story id followed by cn_nodes1..cn_nodes5.
roc_stories_pivoted_df.columns = ['storyid'] + [f'cn_nodes{position}' for position in range(1, 6)]

In [25]:
# Display the final table: one row per story, one ConceptNet-node column per sentence.
roc_stories_pivoted_df

Unnamed: 0,storyid,cn_nodes1,cn_nodes2,cn_nodes3,cn_nodes4,cn_nodes5
0,0beabab2-fb49-460e-a6e6-f35a202e3348,[temper][tom],[guest],[hole][house][tom][wall],[guest][tom],[couch][regret][action][tom]
1,110fafd1-2bb7-4ffe-aac7-475706165d41,[charlie horse][horse],[friend][charlie horse][bed][head][horse],[horse][fur][charlie horse][soft fur],[horse][charlie horse][radio],[laura][corn]
2,13573c2e-5eed-40eb-bbe5-ed259b5c76a6,[laura][backyard],[laura],[laura][water],[laura][corn][idea][garden],
3,2d16bcd6-692a-4fc0-8e7c-4a6f81d9efa9,[better][bill][truck],[vehicle],[bill][ways],[bill][truck],[memory][john][pastor]
4,4d7b022e-25d2-4300-a9b0-24ab35f4045b,[nap][melody],[parent][melody],[fish][shark's][sharks],[family][melody],[pop quiz][teacher][quiz][pop][math teacher][math]
5,77338898-07d4-4143-8451-284540c8b082,,,[girlfriend],[girlfriend],[charlie horse][year][horse][four year]
6,8036c905-f23e-4976-83a1-85d679b5e0c2,[pop quiz][teacher][quiz][pop][math teacher][math][student],[pencil],,[confident][pencil],[girlfriend]
7,87da1a22-df0b-410c-b186-439700b70ba6,,[clothe],[pair],[clothe][event],[car][bill][car bill][trailer]
8,8bbe6d11-1e2e-413c-bf81-eaea05f4f1bd,[weight][lot][david],[david][reason][habit],[david][food],[vegetarian diet][david][vegetarian][place][diet][burger],
9,c71bb23b-7731-4233-8298-76ba6886cee1,[day][advance][many days][sermon][john],[john][handicap],[sermon][john][music],[congregation][john],[trip][parent][aquarium][melody]
