In [89]:
import requests
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import pprint
from itertools import islice, combinations
import networkx as nx
import matplotlib.pyplot as plt
import pylab as plt
from networkx.drawing.nx_agraph import graphviz_layout, to_agraph

pp = pprint.PrettyPrinter(indent=4)

%matplotlib inline

In [98]:
# Example
obj = requests.get('http://api.conceptnet.io/c/en/n\'t').json()
print(obj.keys())

print(len(obj['edges']))

pp.pprint(obj['edges'])

dict_keys(['@context', '@id', 'edges'])
6
[   {   '@id': "/a/[/r/HasPrerequisite/,/c/en/judge/,/c/en/n't/]",
        '@type': 'Edge',
        'dataset': '/d/conceptnet/4/en',
        'end': {   '@id': "/c/en/n't",
                   '@type': 'Node',
                   'label': "n't",
                   'language': 'en',
                   'term': "/c/en/n't"},
        'license': 'cc:by/4.0',
        'rel': {   '@id': '/r/HasPrerequisite',
                   '@type': 'Relation',
                   'label': 'HasPrerequisite'},
        'sources': [   {   '@id': '/and/[/s/activity/omcs/omcs1_possibly_free_text/,/s/contributor/omcs/zenith/]',
                           '@type': 'Source',
                           'activity': '/s/activity/omcs/omcs1_possibly_free_text',
                           'contributor': '/s/contributor/omcs/zenith'}],
        'start': {   '@id': '/c/en/judge',
                     '@type': 'Node',
                     'label': 'judge someone',
                     '

In [73]:
sentence = """At eight o'clock on Thursday morning Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)

net_dict = {}
pairs = []
for t in tokens:
    if t in token_response:
        pass
    response = requests.get('http://api.conceptnet.io/c/en/'+t).json()
    for i, edge in enumerate(response['edges']):
        #print(edge['start']['label'], edge['rel']['label'], edge['end']['label'])
        pairs.append((edge['start']['label'].lower(), edge['end']['label'].lower()))
        
G = nx.DiGraph()
# G.add_edges_from(pairs)
# G.add_edges_from(known)
G.add_edges_from(pairs)
#nx.draw(G, with_labels=True)
A = to_agraph(G)
A.layout('dot')
A.draw('graph.pdf')

In [25]:
# data
df = pd.DataFrame({'temperature': [100, 90, 0], 
                   'humidity': [10, 90, 20], 
                   'continent': ['africa', 'south america', 'asia'], 
                   'biome': ['desert', 'rainforest', 'tundra']})

# descriptions
descriptions = {
    'temperature': "The temperature of the biome in degrees Fahrenheit.",
    'humidity': "Atmospheric moisture; a quantity representing the amount of water vapor in the atmosphere.",
    'continent': "Any of the world's main continuous expanses of land (Africa, Antarctica, Asia, Australia, Europe, North America, South America).",
    'biome': "A large naturally occurring community of flora and fauna occupying a major habitat; for example, rainforest or tundra."
}

# corpus
corpus = "Deserts are hot and have low humidity. There are deserts in Africa and South America. Rainforests are hot and have high humidity. There are rainforests in South America and Africa. The tundra is very cold and dry. Russia, a country in Asia, is mostly tundra."

In [31]:
tokens = nltk.word_tokenize(corpus)

# build pairs using conceptnet
net_dict = {}
for t in tokens:
    if t.lower() in net_dict.keys():
        pass
    response = requests.get('http://api.conceptnet.io/c/en/'+t).json()
    for i, edge in enumerate(response['edges']):
        net_dict[edge['start']['label'].lower()] = edge['end']['label'].lower()
        
for key, value in descriptions.items():
    text = key + value
    tokens = nltk.word_tokenize(text)
    for t in tokens:
        if t.lower() in net_dict.keys():
            pass
        response = requests.get('http://api.conceptnet.io/c/en/'+t).json()
        for i, edge in enumerate(response['edges']):
            net_dict[edge['start']['label'].lower()] = edge['end']['label'].lower()
        
# remove pairs that don't link to anything
pairs = [(key, value) for key, value in net_dict.items() if key in net_dict.values() and value in net_dict.keys()]
        
G = nx.DiGraph()
# G.add_edges_from(pairs)
# G.add_edges_from(known)
G.add_edges_from(pairs)
#nx.draw(G, with_labels=True)
A = to_agraph(G)
A.layout('dot')
A.draw('graph.png')

In [32]:
# what happens if we use the descriptions provided by a generic dataset?
features = [
    "CRIM per capita crime rate by town",
    "ZN proportion of residential land zoned for lots over 25,000 sq.ft.",
    "INDUS proportion of non-retail business acres per town",
    "CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)",
    "NOX nitric oxides concentration (parts per 10 million)",
    "RM average number of rooms per dwelling",
    "AGE proportion of owner-occupied units built prior to 1940",
    "DIS weighted distances to five Boston employment centres",
    "RAD index of accessibility to radial highways",
    "TAX full-value property-tax rate per $10,000",
    "PTRATIO pupil-teacher ratio by town",
    "B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town",
    "LSTAT % lower status of the population",
    "MEDV Median value of owner-occupied homes in $1000’s"
]

net_dict = {}
for f in features:
    tokens = nltk.word_tokenize(f)
    for t in tokens:
        if t.lower() in net_dict.keys():
            pass
        response = requests.get('http://api.conceptnet.io/c/en/'+t).json()
        for i, edge in enumerate(response['edges']):
            net_dict[edge['start']['label'].lower()] = edge['end']['label'].lower()
        
# remove pairs that don't link to anything
#pairs = [(key, value) for key, value in net_dict.items()]
pairs = [(key, value) for key, value in net_dict.items() if key in net_dict.values() and value in net_dict.keys()]
        
G = nx.DiGraph()
G.add_edges_from(pairs)
A = to_agraph(G)
A.layout('dot')
A.draw('boston.png')




In [33]:
pairs = [(key, value) for key, value in net_dict.items() if key in net_dict.values() and value in net_dict.keys()]
        
G = nx.DiGraph()
G.add_edges_from(pairs)
A = to_agraph(G)
A.layout('dot')
A.draw('boston.png')

## Conclusion

I think I need to start by identifying the concepts that must end up being mapped. Otherwise the search is going to be too expensive.

1. Create a list of the features and the words associated with each.
2. Build a small network of some set depth for each feature.
3. For each word in a features network, get the relation to words in every other feature.

This still seems flawed because it is very brute force and doesn't guarantee features will be connected. Even if they are connected, it is likely by extraneous relations.

I probably need to study graph search algorithms...

In [35]:
# queries
response = requests.get('http://api.conceptnet.io/query?node=/c/en/dog&other=/c/en/bark').json()
for i, edge in enumerate(response['edges']):
    print(edge['start']['label'], edge['rel']['label'], edge['end']['label'])

dog CapableOf bark
dog RelatedTo bark
bark RelatedTo dog
are a dog HasSubevent bark
bark RelatedTo dog


In [124]:
# given two words, continue to query conceptnet until two words are connected
a = 'mountain'
b = 'land'

related = [a]
searched = []
pairs = []

searching = True
while searching:
    
    # get next item to search
    item = related.pop(0)
    print(item)
    searched.append(item)
    
    # query conceptnet
    response = requests.get('http://api.conceptnet.io/c/en/{}'.format('_'.join(nltk.word_tokenize(item.lower())))).json()
    
    # add new nodes to related if not in searched or related
    for edge in response['edges']:
        for node in [edge['start']['label'], edge['end']['label']]:
            if node not in related and node not in searched:
                related.append(node)
        pairs.append((edge['start']['label'].lower(), edge['end']['label'].lower()))
        
    # stopping conditions
    if b in related:
        print('Success!')
        searching = False
    
    elif len(searched)>=1000:
        print('{} not found after {} iterations'.format(b, len(searched)))
        searching = False
              
G = nx.DiGraph()
G.add_edges_from(pairs)
A = to_agraph(G)
A.layout('dot')
A.draw('{}-{}.pdf'.format(a,b))

mountain
Success!


In [63]:
pairs = [(key, value) for key, value in net_dict.items()] + [(value, key) for key, value in net_dict.items()]
        
G = nx.DiGraph()
G.add_edges_from(pairs)
A = to_agraph(G)
A.layout('dot')
A.draw('graph.png')

## Next Steps

1. Figure out how to link more complex concepts. (ie not just a word)
2. Once a full graph linking features is constructed, what next?
    1. Identify input and output nodes.
    2. Identify prior weights.
    3. Train model?
    
How do I make sure the logic of the corpus is included? Right now I'm only relying on ConceptNet for structure, but not for weighting.

- map relations to mathematical operations?

## Graphing a Bag of Words

### Method 1

1. For each word in set(words), get all edges, save off opposite nodes
2. For word in set(words), if word in node set, that word is done
3. Repeat 1 & 2 until set(words) is empty.

In [140]:
corpus = "coffee tea"
corpus = corpus.lower()
corpus = nltk.word_tokenize(corpus)

edges = []
not_found = set(corpus)
found = set()
searching = set(corpus)
next_search = set()

while True:
    
    word = searching.pop()
    
    print('word:', word)
    
    response = requests.get('http://api.conceptnet.io/c/en/{}'.format('_'.join(nltk.word_tokenize(word)))).json()
    
    for edge in response['edges']:
        start = edge['start']['label'].lower()
        end = edge['end']['label'].lower()
        not_word = set((start, end)).difference([word])
        next_search.update(not_word)
        found.update(not_word)
        edges.append((start, end))
        
    print('found:', found)
    print('searching:', searching)
    print('next_search:', next_search)
    
    if len(searching)==0:
        not_found = not_found.difference(found)
        print('not_found:', not_found)
        if len(not_found)==0:
            print('Success!')
            break
        elif len(found)>=1000:
            print('Did not find all words after {} iterations'.format(len(found)))
            break
        searching = next_search
        next_search = set()
        
    print()

word: tea
found: {'sun tea', 'a beverage', 'green tea', 'ice tea', 'oolong', 'a cupboard', 'cuppa', 'united kingdom', 'herb tea', 'tea bag', 'herb', 'teas', 'a drink', 'beverage', 'irish breakfast', 'cambric tea', 'caffeine', 'meal', 'caffein', 'black tea'}
searching: {'coffee'}
next_search: {'sun tea', 'a beverage', 'green tea', 'ice tea', 'oolong', 'a cupboard', 'cuppa', 'united kingdom', 'herb tea', 'tea bag', 'herb', 'teas', 'a drink', 'beverage', 'irish breakfast', 'cambric tea', 'caffeine', 'meal', 'caffein', 'black tea'}

word: coffee
found: {'coffees', 'a can', 'green tea', 'a cup', 'united kingdom', 'herb', 'tea bag', 'irish breakfast', 'a coffee shop', 'teas', 'a drink', 'a mug', 'a internet cafe', 'caffeine', 'meal', 'oolong', 'black tea', 'café', 'a sugar', 'a stimulant', 'served hot', 'the office', 'good in the morning', 'made from coffee beans', 'cafe noir', 'a beverage', 'ice tea', 'cafe au lait', 'a cupboard', 'cuppa', 'sugar', 'herb tea', 'arabian coffee', 'beverage', 

found: {'all-mains', 'coffees', 'a can', 'bachelor of medicine', 'argy-bargy', 'assurance', 'herb', 'irish breakfast', '107951392-n', 'グリーンティー', 'angelica', 'a internet cafe', 'apple of peru', 'meal', 'basil', 'served hot', 'made from coffee beans', 'andryala', '煎茶', 'cafe au lait', 'al-muhajiroun', 'agrimonia', 'arterial road', 'alumroot', 'appro', 'beverage', 'a container', 'chá verde', 'グリーン ティー', 'asparagus', 'bar', 'arnica', 'arum', 'amaranth', 'ballup', 'teh hijau', 'teas', 'a drink', 'abyssinian banana', 'caffeine', 'a stimulant', 'cafe noir', '緑茶', 'reino unido', 'achillea', 'a cupboard', 'sugar', 'arabian coffee', 'backbencher', 'american ginseng', 'coffee', 'bagatelle', 'caffein', 'anglo-saxon', 'vihreä tee', 'green tea', 'acanthus', 'tea', 'inggris raya', 'united kingdom', 'a mug', 'a sugar', '綠茶', 'a beverage', 'herb tea', 'q484083', 'alpine coltsfoot', 'hyson', 'admiralty', 'sun tea', 'cambric tea', 'anchusa', 'asclepiad', 'ชาเขียว', 'cafe', 'te verd', 'a cup', 'banger', '

found: {'all-mains', 'tea bagging', 'tea bags', 'ティーバッグ', 'coffees', 'a can', 'bachelor of medicine', 'argy-bargy', 'assurance', 'herb', 'irish breakfast', 'tepose', '107951392-n', 'グリーンティー', 'angelica', 'a internet cafe', 'apple of peru', 'meal', 'basil', 'served hot', 'made from coffee beans', 'andryala', '煎茶', 'cafe au lait', 'al-muhajiroun', 'agrimonia', 'arterial road', 'teepussi', 'alumroot', 'appro', 'beverage', 'a container', 'chá verde', 'ถุงใส่ใบชา', 'グリーン ティー', 'asparagus', 'bar', 'arnica', 'arum', 'amaranth', 'ballup', 'teh hijau', 'teas', 'a drink', 'abyssinian banana', 'tea bagged', 'caffeine', 'teabag', 'a stimulant', 'cafe noir', '緑茶', 'reino unido', 'achillea', 'a cupboard', 'sugar', 'arabian coffee', 'backbencher', 'american ginseng', 'coffee', 'bagatelle', 'caffein', 'anglo-saxon', 'vihreä tee', 'green tea', 'acanthus', 'tea', 'inggris raya', 'united kingdom', 'a mug', 'bag', 'ティー バッグ', '104403943-n', 'bosseta de te', 'a sugar', '綠茶', 'a beverage', 'kantong teh', 'he

found: {'all-mains', 'tea bagging', 'tea bags', 'ティーバッグ', 'coffees', 'a can', 'bachelor of medicine', 'argy-bargy', 'assurance', 'herb', 'irish breakfast', 'tepose', '107951392-n', 'グリーンティー', 'angelica', 'a internet cafe', 'apple of peru', 'meal', 'basil', 'served hot', 'made from coffee beans', 'andryala', '煎茶', 'cafe au lait', 'al-muhajiroun', 'agrimonia', 'arterial road', 'teepussi', 'alumroot', 'appro', 'beverage', 'a container', 'chá verde', 'ถุงใส่ใบชา', 'グリーン ティー', 'asparagus', 'bar', 'arnica', 'arum', 'amaranth', 'ballup', 'teh hijau', 'teas', 'a drink', 'abyssinian banana', 'tea bagged', 'caffeine', 'teabag', 'a stimulant', 'cafe noir', '緑茶', 'reino unido', 'achillea', 'a cupboard', 'sugar', 'arabian coffee', 'backbencher', 'american ginseng', 'coffee', 'bagatelle', 'caffein', 'anglo-saxon', 'vihreä tee', 'green tea', 'acanthus', 'tea', 'inggris raya', 'united kingdom', 'a mug', 'bag', 'ティー バッグ', '104403943-n', 'bosseta de te', 'a sugar', '綠茶', 'a beverage', 'kantong teh', 'he

found: {'all-mains', 'tea bagging', 'tea bags', 'ティーバッグ', 'coffees', 'a can', 'bachelor of medicine', 'kofeiini', 'argy-bargy', 'assurance', 'oolongs', 'herb', 'irish breakfast', 'kofein', 'tepose', '107951392-n', 'グリーンティー', 'angelica', 'a internet cafe', 'apple of peru', 'meal', 'basil', 'served hot', 'kafeina', 'bite', 'made from coffee beans', 'herbata czerwona', 'andryala', '煎茶', 'cafe au lait', 'al-muhajiroun', 'agrimonia', 'arterial road', 'teepussi', 'alumroot', 'ingredients', 'appro', 'beverage', 'a container', 'chá verde', 'ถุงใส่ใบชา', 'mess', 'کافئین', '烏竜', 'グリーン ティー', 'asparagus', 'bar', 'arnica', 'arum', 'dinner', 'kofeina', 'amaranth', 'ballup', 'a plate', 'cornmeal', 'thé oolong', 'teh hijau', 'teas', 'a drink', 'caféine', 'foodstuff', 'abyssinian banana', 'tea bagged', 'caffeine', 'farina', 'teabag', 'a stimulant', 'cafe noir', '緑茶', 'reino unido', 'achillea', 'cafeína', 'a cupboard', 'sugar', '烏龍茶', 'arabian coffee', 'backbencher', 'american ginseng', 'カフェイン', 'coffee

found: {'café chantant', 'coffees', 'bachelor of medicine', 'argy-bargy', 'teh hitam', 'assurance', 'herb', 'apple of peru', 'a internet cafe', 'booth', '煎茶', 'agrimonia', 'arterial road', 'alumroot', 'ブラックティー', 'appro', 'chá verde', 'ถุงใส่ใบชา', 'کافئین', 'グリーン ティー', 'asparagus', 'bar', 'dinner', 'a plate', 'kofeina', 'amaranth', 'ballup', 'soochong', 'teas', 'abyssinian banana', 'caffeine', 'teabag', 'cafe noir', 'reino unido', '烏龍茶', 'arabian coffee', 'кафе', 'coffee', 'bagatelle', 'keep you awake', 'mele', 'anglo-saxon', 'café goer', 'tea', 'inggris raya', 'caffeina', 'cafeina', 'buffet', 'banquet', 'a mug', 'ティー バッグ', '104403943-n', 'bosseta de te', 'darjeeling', '綠茶', 'herb tea', 'q484083', 'ブラック ティー', 'a fast-food restaurant', 'admiralty', 'sun tea', 'teh oolong', 'cambric tea', 'ถุงชา', 'asclepiad', 'te negre', 'cafe', 'ชาอูหลง', 'a cup', 'thé noir', 'مقهى', 'قهوة', 'banger', 'asparagus fern', 'tea bag', '紅茶', 'breakfast', 'green_tea', 'admass', 'café', 'oolong', 'black tea', 

found: {'café chantant', 'coffees', 'bachelor of medicine', 'argy-bargy', 'teh hitam', 'assurance', 'herb', 'apple of peru', 'a internet cafe', 'booth', '煎茶', 'agrimonia', 'arterial road', 'alumroot', 'ブラックティー', 'appro', 'chá verde', 'ถุงใส่ใบชา', 'کافئین', 'グリーン ティー', 'asparagus', 'bar', 'dinner', 'a plate', 'kofeina', 'amaranth', 'ballup', 'soochong', 'teas', 'abyssinian banana', 'caffeine', 'teabag', 'cafe noir', 'reino unido', '烏龍茶', 'arabian coffee', 'кафе', 'coffee', 'bagatelle', 'keep you awake', 'mele', 'anglo-saxon', 'café goer', 'tea', 'inggris raya', 'caffeina', 'cafeina', 'buffet', 'banquet', 'a mug', 'ティー バッグ', '104403943-n', 'bosseta de te', 'darjeeling', '綠茶', 'herb tea', 'q484083', 'ブラック ティー', 'a fast-food restaurant', 'admiralty', 'sun tea', 'teh oolong', 'cambric tea', 'ถุงชา', 'asclepiad', 'te negre', 'cafe', 'ชาอูหลง', 'a cup', 'thé noir', 'مقهى', 'قهوة', 'banger', 'asparagus fern', 'tea bag', '紅茶', 'breakfast', 'green_tea', 'admass', 'café', 'oolong', 'black tea', 

found: {'café chantant', 'coffees', 'bachelor of medicine', 'argy-bargy', 'teh hitam', 'assurance', 'herb', 'apple of peru', 'a internet cafe', 'booth', '煎茶', 'agrimonia', 'arterial road', 'alumroot', 'ブラックティー', 'appro', 'chá verde', 'ถุงใส่ใบชา', 'کافئین', 'グリーン ティー', 'asparagus', 'bar', 'dinner', 'a plate', 'kofeina', 'amaranth', 'ballup', 'soochong', 'teas', 'abyssinian banana', 'caffeine', 'teabag', 'not all coffee', 'cafe noir', 'reino unido', '烏龍茶', 'arabian coffee', 'кафе', 'coffee', 'bagatelle', 'keep you awake', 'mele', 'anglo-saxon', 'café goer', 'tea', 'inggris raya', 'caffeina', 'cafeina', 'buffet', 'banquet', 'a mug', 'ティー バッグ', '104403943-n', 'bosseta de te', 'darjeeling', '綠茶', 'herb tea', 'q484083', 'ブラック ティー', 'a fast-food restaurant', 'admiralty', 'sun tea', 'teh oolong', 'cambric tea', 'ถุงชา', 'asclepiad', 'te negre', 'cafe', 'ชาอูหลง', 'a cup', 'thé noir', 'مقهى', 'قهوة', 'banger', 'asparagus fern', 'tea bag', '紅茶', 'breakfast', 'green_tea', 'admass', 'café', 'oolo

found: {'café chantant', 'coffees', 'bachelor of medicine', 'argy-bargy', 'teh hitam', 'assurance', 'herb', 'apple of peru', 'a internet cafe', 'booth', '煎茶', 'agrimonia', 'arterial road', 'alumroot', 'ブラックティー', 'appro', 'chá verde', 'ถุงใส่ใบชา', 'کافئین', 'グリーン ティー', 'asparagus', 'bar', 'dinner', 'a plate', 'kofeina', 'amaranth', 'ballup', 'soochong', 'teas', 'abyssinian banana', 'caffeine', 'teabag', 'not all coffee', 'cafe noir', 'reino unido', '烏龍茶', 'arabian coffee', 'кафе', 'coffee', 'bagatelle', 'keep you awake', 'mele', 'anglo-saxon', 'café goer', 'tea', 'inggris raya', 'caffeina', 'cafeina', 'buffet', 'banquet', 'a mug', 'ティー バッグ', '104403943-n', 'bosseta de te', 'darjeeling', '綠茶', 'herb tea', 'q484083', 'ブラック ティー', 'a fast-food restaurant', 'admiralty', 'sun tea', 'teh oolong', 'cambric tea', 'ถุงชา', 'asclepiad', 'te negre', 'cafe', 'ชาอูหลง', 'a cup', 'thé noir', 'مقهى', 'قهوة', 'banger', 'asparagus fern', 'tea bag', '紅茶', 'breakfast', 'green_tea', 'admass', 'café', 'oolo

KeyboardInterrupt: 

The above code fails for multiple reasons. Primarily, if you get all children of A, then search each child, the resulting set will likely circle back to A. This does not imply any connection to B, so a connected graph is not constructed.

### Method 2

```python
g = dict()
for token in corpus:
    g[token] = set()
    
    edges = request(conceptnet+token)
    
    for edge in edges:
        node = not token
        g[token].update(node)
        
```

### Method 3

Are there built in methods in networkx for checking if a graph is connected?

https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.connectivity.edge_augmentation.is_k_edge_connected.html#networkx.algorithms.connectivity.edge_augmentation.is_k_edge_connected

In [2]:
G = nx.barbell_graph(10, 0)
nx.is_k_edge_connected(G, k=1)

True

In [6]:
# given two words, continue to query conceptnet until two words are connected
a = 'us'
b = 'country'

related = [a]
searched = []
pairs = []

searching = True
while searching:
    
    # get next item to search
    item = related.pop(0)
    print(item)
    searched.append(item)
    
    # query conceptnet
    response = requests.get('http://api.conceptnet.io/c/en/{}'.format('_'.join(nltk.word_tokenize(item.lower())))).json()
    
    # add new nodes to related if not in searched or related
    for edge in response['edges']:
        for node in [edge['start']['label'], edge['end']['label']]:
            if node not in related and node not in searched:
                related.append(node)
        pairs.append((edge['start']['label'].lower(), edge['end']['label'].lower()))
        
    # stopping conditions
    if b in related:
        print('Success!')
        searching = False
    
    elif len(searched)>=1000:
        print('{} not found after {} iterations'.format(b, len(searched)))
        searching = False
              
G = nx.Graph()
G.add_edges_from(pairs)
print(nx.is_k_edge_connected(G, k=1))
A = to_agraph(G)
A.layout('dot')
A.draw('{}-{}.pdf'.format(a,b))

us
after school special
aggie
beltway
blow dryer
broken arrow
chee chee
coin
confectioners sugar
democratic
fed
guidette
guido
guinea
heinie
hide and go seek
hill
US
United States
Success!
False


Awesome! So now I can just run this check after each iteration of expansion. Only catch, is I need to find a way to handle the weird nodes that pop up lacking the literal query word.

In [9]:
response = requests.get('http://api.conceptnet.io/c/en/coffee').json()
response['edges']

[{'@id': '/a/[/r/IsA/,/c/en/coffee/,/c/en/stimulant/]',
  '@type': 'Edge',
  'dataset': '/d/conceptnet/4/en',
  'end': {'@id': '/c/en/stimulant',
   '@type': 'Node',
   'label': 'a stimulant',
   'language': 'en',
   'term': '/c/en/stimulant'},
  'license': 'cc:by/4.0',
  'rel': {'@id': '/r/IsA', '@type': 'Relation', 'label': 'IsA'},
  'sources': [{'@id': '/and/[/s/activity/omcs/omcs1_possibly_free_text/,/s/contributor/omcs/albedo/]',
    '@type': 'Source',
    'activity': '/s/activity/omcs/omcs1_possibly_free_text',
    'contributor': '/s/contributor/omcs/albedo'},
   {'@id': '/and/[/s/activity/omcs/omcs1_possibly_free_text/,/s/contributor/omcs/rossjesse/]',
    '@type': 'Source',
    'activity': '/s/activity/omcs/omcs1_possibly_free_text',
    'contributor': '/s/contributor/omcs/rossjesse'},
   {'@id': '/and/[/s/activity/omcs/vote/,/s/contributor/omcs/20q_1231409081/]',
    '@type': 'Source',
    'activity': '/s/activity/omcs/vote',
    'contributor': '/s/contributor/omcs/20q_1231409

Looks like I have 2 options:

1. Save off label and @id in dict; use @id to generate graph and label to create human-readable form. Kind of a pain.
2. Parse @id. This may not work if the format is not consistent.```"/".split(response['edges'][i]['@id'][j])[-1]```

Yup... It's not consistent. Option 1 it is...

## Plan

Build a net connecting all words in a corpus.

```python
while disconnected:
    for each word in corpus:
        get response
        append pairs to net list if not in net list
    check for connectivity
```

In [97]:
corpus = "the biomes. a temperature humidity continent isn't"
corpus = corpus.lower()
tokens = nltk.word_tokenize(corpus)

lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]

tokens = [token for token in tokens if token not in stopwords.words('english')]

print(tokens)

edges = []
labels = dict()
G = nx.Graph()

searching = set('/c/en/'+w for w in tokens)
next_search = set()

iterations = 0
while not nx.is_k_edge_connected(G, k=1):
    
    edge_id = searching.pop()
    
    print('edge:', edge_id)
    
    try:
        response = requests.get('http://api.conceptnet.io'+edge_id).json()
    except Exception as e:
        print(e)
        continue
    
    for edge in response['edges']:
        
        start_id = '/'.join(edge['start']['@id'].split('/')[:4])
        end_id = '/'.join(edge['end']['@id'].split('/')[:4])
        if 'http' in start_id or 'http' in end_id:
            print('found a link. skipping', edge['start']['@id'], edge['end']['@id'])
            continue
        edges.append((start_id, end_id))
        
        if start_id not in labels.keys():
            next_search.update([start_id])
        if end_id not in labels.keys():
            next_search.update([end_id])
        labels[start_id] = edge['start']['label']
        labels[end_id] = edge['end']['label']
    
    if len(searching)==0:
        iterations += 1
        G.add_edges_from(edges)
        A = to_agraph(G)
        A.layout('dot')
        A.draw('graph{}.pdf'.format(iterations))
        if iterations>=10:
            print('Did not find all words after {} iterations'.format(iterations))
            break
        searching = next_search
        next_search = set()
        
    print()
    
# cleanup graph (remove extraneous nodes)
G2 = nx.Graph()
for edge in combinations(tokens, 2):
    print(edge)
    print(list(islice(nx.shortest_simple_paths(G, '/c/en/'+edge[0], '/c/en/'+edge[1]), 1)))
    G2.add_path(list(islice(nx.shortest_simple_paths(G, '/c/en/'+edge[0], '/c/en/'+edge[1]), 1))[0])
    
# output the final graph in a nice format
A = to_agraph(G2)
A.layout('dot')
A.draw('final.pdf')

['biome', '.', 'temperature', 'humidity', 'continent', "n't"]
edge: /c/en/temperature

edge: /c/en/humidity

edge: /c/en/n't
found a link. skipping /c/en/n't http://en.wiktionary.org/wiki/n't

edge: /c/en/.

edge: /c/en/continent

edge: /c/en/biome

edge: /c/fi/biomi
found a link. skipping /c/fi/biomi http://en.wiktionary.org/wiki/biomi

edge: /c/ms/komunitas_biotik_utama

edge: /c/fa/تری

edge: /c/pt/temperatura


KeyboardInterrupt: 

Removing 'extraneous' nodes isn't working because I need to remove extraneous paths, not just nodes. Following the logic I'm using, only the outer nodes will be removed. This means that in a chain of nodes extending out from a corpus node will only get one shorter, leaving the rest of the chain behind.

## Conclusion

I now have a working prototype that can build a simple graph from a set of words. Next I need to be able to input a more complex corpus and do the same. This means I need to isolate the "important" words in the corpus (so I'm not searching for connections to 'a' and 'the' etc). That should be relatively simple with NLTK; if there isn't a function already, I can probably just select only nouns, verbs, adjectives and adverbs.

This looks like a good place to start: https://www.quora.com/How-can-I-extract-keywords-from-a-document-using-NLTK

Looks like I've sort of accomplished this, but there are definitely some flaws.

1. The list of stopwords isn't very long. Words like "this" would still be searched.
2. I need to handle punctuation.