# loading and exploring cpnet

## loading graph

In [6]:
!head data/conceptnet-en.csv
!wc -l data/conceptnet-en.csv

Antonym	0	1	1.0
Antonym	12_hour_clock	24_hour_clock	1.0
Antonym	24_hour_clock	12_hour_clock	1.0
Antonym	5	3	1.0
Antonym	a.c	d.c	1.0
Antonym	a.m	afternoon	1.0
Antonym	a.m	p.m	3.464
Antonym	a.m	pm	1.0
Antonym	ab_extra	ab_intra	1.0
Antonym	ab_intra	ab_extra	1.0
3423004 data/conceptnet-en.csv


In [56]:
!cat data/conceptnet-en.csv | grep soccer

AtLocation	advertisements	soccer_game	1.0
AtLocation	angry_mothers	soccer_game	1.0
AtLocation	ball	soccer_field	2.0
AtLocation	ball	soccer_game	2.0
AtLocation	balls	soccer_game	3.464
AtLocation	beer	soccer_game	1.0
AtLocation	black_and_white_soccer_ball	soccer_game	1.0
AtLocation	bleachers	soccer_game	1.0
AtLocation	blood	soccer_game	1.0
AtLocation	cheering_parents	soccer_game	1.0
AtLocation	children	soccer_game	1.0
AtLocation	coach	soccer_game	1.0
AtLocation	drunken_fans	soccer_game	1.0
AtLocation	fans	soccer_game	2.0
AtLocation	field	soccer_game	1.0
AtLocation	football_players	soccer_game	1.0
AtLocation	goal	soccer_game	1.0
AtLocation	goalie	soccer_game	1.0
AtLocation	goalies	soccer_game	2.0
AtLocation	grass	soccer_game	1.0
AtLocation	hot_dog	soccer_game	1.0
AtLocation	keeper	soccer_game	1.0
AtLocation	knee_pads	soccer_game	1.0
AtLocation	net	soccer_game	1.0
AtLocation	parent	soccer_game	2.0
AtLocation	parents	soccer_game	1.0
AtLocation	people_wearing_socks	soccer_game	1.0
AtLocation

RelatedTo	dugout	soccer	1.0
RelatedTo	early_bath	soccer	1.0
RelatedTo	field	soccer	0.521
RelatedTo	field	soccer_space	0.152
RelatedTo	five_side	soccer	1.0
RelatedTo	flop	soccer	1.0
RelatedTo	foosball	table_soccer	1.0
RelatedTo	football	american_soccer	0.172
RelatedTo	football	europe_soccer	0.171
RelatedTo	football	soccer	0.343
RelatedTo	football	soccer	1.0
RelatedTo	football_hooligan	soccer	1.0
RelatedTo	football_hooliganism	soccer	1.0
RelatedTo	football_player	soccer	1.0
RelatedTo	footer	soccer	1.0
RelatedTo	footy	soccer	1.0
RelatedTo	forward	soccer	1.0
RelatedTo	futsal	soccer	1.0
RelatedTo	game	or_soccer	0.196
RelatedTo	game	soccer	0.196
RelatedTo	goal	soccer	2.063
RelatedTo	goal	soccer_field	0.219
RelatedTo	goal	soccer_score	0.795
RelatedTo	goal	soccer_touchdown	0.105
RelatedTo	goal	soccer_win	0.146
RelatedTo	goal_post	soccer	1.0
RelatedTo	gridiron_football	soccer_football	1.0
RelatedTo	ground	soccer	1.0
RelatedTo	hockey_mom	soccer_mom	1.0
RelatedTo	juke	soccer	1.0
RelatedTo	keepie_

## processing cpnet file and building a nx graph + rel & concept vocabularies

In [42]:
from tqdm import tqdm 
import networkx as nx

relations = {}
concepts = {}
cpnet_graph = nx.DiGraph()

with open('./data/conceptnet-en.csv', 'r') as f:
    lines = f.readlines()
    for line in tqdm(lines):
        rel, c1, c2, w = line.strip().split('\t')
        rev_rel = f"{rel}_reverse"
        
        if rel not in relations:
            relations[rel] = f"REL_{len(relations)}"
        if rev_rel not in relations:
            relations[rev_rel] = f"REL_{len(relations)}"
        if c1 not in concepts:
            concepts[c1] = f"CSPT_{len(concepts)}"
        if c2 not in concepts:
            concepts[c2] = f"CSPT_{len(concepts)}"
        
        cpnet_graph.add_edge(concepts[c1], concepts[c2], weight=float(w), rel_type=relations[rel])
        cpnet_graph.add_edge(concepts[c2], concepts[c1], weight=float(w), rel_type=relations[rev_rel])

100%|██████████| 3423004/3423004 [00:24<00:00, 141364.29it/s]


In [43]:
inv_rels = {v:k for k,v in relations.items()}
inv_concepts = {v:k for k,v in concepts.items()}

In [44]:
list(set(relations.keys()))[:5], len(relations.keys())

(['capital', 'MadeOf', 'language', 'Entails_reverse', 'CreatedBy_reverse'], 94)

## find shortest path between two concepts

In [80]:
assert 'highway' in concepts
assert 'atlas' in concepts


def print_readable_all_path(graph, src, target):
    src_repr = concepts[src]
    target_repr = concepts[target]
    results = nx.all_simple_paths(graph, src_repr, target_repr, cutoff=2)
    
    for res in results:
        steps = []
        steps.append(src)
        prev_c = src_repr
        for c in res:
            if c == prev_c:
                continue
            cur_rel = graph.edges[prev_c, c]
            steps.append(f"--{inv_rels[cur_rel['rel_type']]}({cur_rel['weight']})-->")
            steps.append(inv_concepts[c])
            prev_c = c
        print(' '.join(steps))
        

def print_readable_shortest_path(graph, src, target):
    steps = []
    src_repr = concepts[src]
    target_repr = concepts[target]
    res = nx.shortest_path(graph, src_repr, target_repr)
    steps.append(src)
    prev_c = src_repr
    for c in res:
        if c == prev_c:
            continue
        cur_rel = graph.edges[prev_c, c]
        steps.append(f"--{inv_rels[cur_rel['rel_type']]}({cur_rel['weight']})-->")
        steps.append(inv_concepts[c])
        prev_c = c
    return ' '.join(steps)
        

#print_readable_shortest_path(cpnet_graph, 'people', 'populated_areas')
# print_readable_all_path(cpnet_graph, 'forest', 'natural_habitat')
print_readable_all_path(cpnet_graph, 'fox', 'city')

fox --AtLocation(1.0)--> back_yard --AtLocation(1.0)--> city
fox --AtLocation(1.0)--> bar --RelatedTo(0.27)--> city
fox --AtLocation(1.0)--> book --IsA_reverse(0.5)--> city
fox --AtLocation(1.0)--> nature --DistinctFrom(0.173)--> city
fox --AtLocation(1.0)--> new_york --RelatedTo(1.0)--> city
fox --AtLocation(1.0)--> warren --RelatedTo(1.0)--> city
fox --HasContext(1.0)--> soccer --HasContext_reverse(1.0)--> city
fox --RelatedTo(0.323)--> forest --Antonym_reverse(0.23)--> city
fox --RelatedTo(0.135)--> house --AtLocation(2.0)--> city
fox --RelatedTo(0.146)--> houses --RelatedTo_reverse(0.428)--> city
fox --RelatedTo(0.62)--> small --Antonym_reverse(0.14)--> city
fox --RelatedTo(0.263)--> small_dog --AtLocation(2.0)--> city
fox --RelatedTo(0.122)--> smaller --RelatedTo_reverse(0.171)--> city
fox --language(0.5)--> english_language --language_reverse(0.5)--> city


In [30]:
cpnet_graph.edges['CSPT_1', 'CSPT_5']

KeyError: 'CSPT_5'

In [32]:
cpnet_graph['CSPT_1']

AtlasView({'CSPT_0': {'weight': 1.0, 'rel_type': 'REL_65'}, 'CSPT_32059': {'weight': 1.0, 'rel_type': 'REL_4'}, 'CSPT_73566': {'weight': 1.0, 'rel_type': 'REL_15'}, 'CSPT_180550': {'weight': 1.0, 'rel_type': 'REL_15'}, 'CSPT_10340': {'weight': 0.25, 'rel_type': 'REL_25'}, 'CSPT_378685': {'weight': 2.0, 'rel_type': 'REL_26'}, 'CSPT_378686': {'weight': 1.0, 'rel_type': 'REL_26'}, 'CSPT_378784': {'weight': 1.0, 'rel_type': 'REL_27'}, 'CSPT_378815': {'weight': 2.0, 'rel_type': 'REL_27'}, 'CSPT_2729': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_814875': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_10488': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_245458': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_14931': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_8750': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_19734': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_18071': {'weight': 1.0, 'rel_type': 'REL_30'}, 'CSPT_1053': {'weight': 1.0, 'rel_type': 'REL_71'}, 'CSPT_867809': {'weight': 1.0, 'rel

In [36]:
cpnet_graph.edges['CSPT_1', 'CSPT_32059']

{'weight': 1.0, 'rel_type': 'REL_4'}

In [1]:
from datasets import load_dataset

ds = load_dataset('trivia_qa', 'rc.web')

ds

OSError: Not enough disk space. Needed: 13.48 GiB (download: 2.48 GiB, generated: 11.00 GiB, post-processed: Unknown size)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey'],
        num_rows: 1140
    })
})