In [1]:
import pandas as pd

Read in data and take a quick look at the data.

In [18]:
net = pd.read_csv("conceptnet-assertions-5.7.0-en.csv")
net.shape 

(2543196, 4)

In [19]:
net.head()

Unnamed: 0,relation,start,end,weight
0,antonym,ab_extra,ab_intra,1.0
1,antonym,ab_intra,ab_extra,1.0
2,antonym,abactinal,actinal,1.0
3,antonym,abandonment,acquisition,1.0
4,antonym,abapical,apical,1.0


data cleaning 1: fill NA

In [20]:
net[net.isnull().T.any().T] # rows with NaN values

Unnamed: 0,relation,start,end,weight
11532,antonym,,antinull,1.000
11533,antonym,,non_null,1.000
229345,hascontext,,computing,1.000
235214,hascontext,,genetics,1.000
235215,hascontext,,mathematics,1.000
235217,hascontext,,computing,1.000
235218,hascontext,,statistics,1.000
515769,isa,,flatbread,1.000
653722,relatedto,absent_referent,,1.000
693496,relatedto,ambitwistor,,1.000


It seems that the original csv treats "null" as NaN.

In [22]:
net = net.fillna('null')

data cleaning 2: drop duplicates

In [97]:
net.drop_duplicates(subset = ['relation', 'start', 'end'], inplace = True)

In [98]:
net.shape

(2273085, 4)

data cleaning 3: remove self-edges

In [125]:
net = net.drop(net.loc[net['start'] == net['end']].index)

In [126]:
net.shape

(2235061, 4)

count the number of nodes

In [127]:
nodes = set(net['start'])
len(nodes)

585349

In [128]:
list(nodes)[:10]

['impressor',
 'cholesterogenesis',
 'right_eleventh_rib',
 'choochkie',
 'neodiapsida',
 'loose_coupling',
 'geonastic',
 'emblazoned_on_eagle',
 'bleated',
 'possessive_marker']

We now know that our original ConceptNet has 585349 nodes and 2273085 edges.
Let's move on to the next step: we need to split all the phrase concepts.

In [129]:
phrases = []
for n in nodes:
    if '_' in n:
        phrases.append(n)

In [130]:
len(phrases)

173437

In [131]:
phrases.sort(key = lambda x: x.count('_'), reverse = True)

In [132]:
phrases[:10]

['it_is_easier_for_camel_to_go_through_eye_of_needle_than_for_rich_man_to_enter_into_kingdom_of_god',
 'give_man_fish_and_you_feed_him_for_day_teach_man_to_fish_and_you_feed_him_for_lifetime',
 'if_it_looks_like_duck_swims_like_duck_and_quacks_like_duck_then_it_probably_is_duck',
 'is_that_gun_in_your_pocket_or_are_you_just_pleased_to_see_me',
 'if_you_find_yourself_in_hole_first_thing_to_do_is_stop_digging',
 'there_may_be_snow_on_rooftop_but_there_is_fire_in_furnace',
 'early_to_bed_early_to_rise_makes_man_healthy_wealthy_and_wise',
 'what_does_that_have_to_do_with_price_of_tea_in_china',
 'one_hair_of_woman_can_draw_more_than_hundred_pair_of_oxen',
 'take_care_of_pennies_and_pounds_will_take_care_of_themselves']

In [134]:
net.loc[net['start'] == phrases[0]] # the out neighbor of 'it_is_..._of_god'

Unnamed: 0,relation,start,end,weight
1388347,relatedto,it_is_easier_for_camel_to_go_through_eye_of_ne...,camel_through_eye_of_needle,1.0


In [135]:
phrases_count = map(lambda x: x.count('_'), phrases)

In [136]:
from collections import Counter

In [137]:
Counter(phrases_count)

Counter({20: 1,
         18: 1,
         16: 1,
         13: 1,
         12: 1,
         11: 4,
         10: 6,
         9: 13,
         8: 29,
         7: 44,
         6: 136,
         5: 641,
         4: 2851,
         3: 13038,
         2: 35919,
         1: 120751})

We have 173437 phrasal concepts. 120751 of them consist of two words. The rest consist of three or more words.

Here we simply split each phrasal concepts, and replace '_' with an edge of label 'p' (meaning phrase)

In [172]:
from collections import defaultdict

In [170]:
# one iteration to extract all in-neighbors and out-neighbors of phrases
p_in_dict = defaultdict(list)
p_out_dict = defaultdict(list)
for _,row in net.iterrows():
    if ('_' in row['end']):
        p_in_dict[row['end']].append((row['start'], row['relation'])) 
    if ('_' in row['start']):
        p_out_dict[row['start']].append((row['end'], row['relation'])) 

In [183]:
count = 0
add_dict = defaultdict(list) # the additional edges
for p in phrases:
    p_split = p.split('_')
    
    # the underline in the phrase is substituted by len(p_split) - 1 edges with label 'p'
    add_dict['relation'].extend(['p' for i in p_split[1:]]) 
    add_dict['start'].extend(p_split[:-1])
    add_dict['end'].extend(p_split[1:])
    
    # the in neighbors and out neighbors of phrases
    in_nei, in_rel = list(zip(*p_in_dict[p])) if len(p_in_dict[p]) != 0 else ([], [])
    out_nei, out_rel = list(zip(*p_out_dict[p])) if len(p_out_dict[p]) != 0 else ([], [])
    
    # the last word of each in-neighbor is directed to the first word of the phrase
    add_dict['relation'].extend(in_rel)
    add_dict['start'].extend([w.split('_')[-1] for w in in_nei])
    add_dict['end'].extend([p_split[0] for i in in_rel])
    
    # the last word of the phrase is directed to the first word of each out-neighbor
    add_dict['relation'].extend(out_rel)
    add_dict['start'].extend([p_split[-1] for i in out_rel])
    add_dict['end'].extend([w.split('_')[0] for w in out_nei])   

In [184]:
add_df = pd.DataFrame.from_dict(add_dict)

Use similar data cleaning tools.

In [188]:
add_df['weight'] = 1.0
add_df.drop_duplicates(inplace = True)
print(add_df.shape)
add_df = add_df.drop(add_df.loc[add_df['start'] == add_df['end']].index)
print(add_df.shape)

(766376, 4)
(753832, 4)


In [189]:
new_net = net.append(add_df, ignore_index=True)

In [194]:
new_net.drop_duplicates(subset = ['relation', 'start', 'end'], inplace = True)

In [195]:
new_net.shape

(2955994, 4)

In [196]:
len(set(new_net['start']))

601623

In [198]:
new_net.to_csv('new_concept_net.csv')