This notebook relies on sampling from the `sample.ipynb` notebook. We will extract metadata descriptions, then fit models on them.

In [1]:
# Import collection metadata
import pickle
from pyCMR.pyCMR import CMR
# NOTE: unpickling can execute arbitrary code, so only load 'metadata.p' if it
# comes from a trusted source (presumably a run of the sampling notebook).
# The context manager closes the file handle as soon as loading finishes.
with open('metadata.p', 'rb') as metadata_file:
    long_names, metadata = pickle.load(metadata_file)

We now need to make a small corpus. First naive strategy: concatenate all the leaves of the structure in random order.

NOTE: this discards important information along the paths to the leaves.

In [2]:
# source: https://stackoverflow.com/questions/12507206/python-recommended-way-to-walk-complex-dictionary-structures-imported-from-json

# This code turns a dictionary into a list of paths to leaves
def dict_generator(indict, pre=None):
    """Walk a nested dict/list structure and yield one path (a list) per leaf.

    Args:
        indict: The structure to walk. A dict is descended into; any other
            value is treated as a leaf.
        pre: Keys already traversed to reach ``indict``. It is copied, never
            mutated.

    Yields:
        list: The traversed keys followed by the leaf value as the last
        element. Lists and tuples are expanded element by element under their
        key. Keys of enclosing containers are prepended on descent, so
        ancestors appear nearest-first: ``{'a': {'b': {'c': 1}}}`` yields
        ``['b', 'a', 'c', 1]``.
    """
    pre = pre[:] if pre else []
    if isinstance(indict, dict):
        for key, value in indict.items():
            if isinstance(value, dict):
                for d in dict_generator(value, [key] + pre):
                    yield d
            elif isinstance(value, (list, tuple)):
                for v in value:
                    for d in dict_generator(v, [key] + pre):
                        yield d
            else:
                yield pre + [key, value]
    else:
        # A non-dict value (e.g. a string stored inside a list) is itself a
        # leaf. Wrap it in a path so consumers can always take path[-1];
        # yielding the bare value made path[-1] the last character of a string.
        yield pre + [indict]

In [3]:
# source: https://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-float
def is_number(s):
    """Return True if ``s`` can be converted by ``float()``, otherwise False.

    Values that ``float()`` rejects with a TypeError (such as ``None`` or a
    list) are reported as non-numeric instead of raising. Note that strings
    like 'nan' and 'inf' are accepted by ``float()`` and therefore count as
    numbers here.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

In [4]:
# WARNING: Naive implementation
def struct2Doc(cln):
    """Flatten one collection's metadata into a single bag-of-words string.

    Every leaf value of ``cln`` is collected. Leaves that are empty, numeric
    or URLs are dropped; in the rest, each run of characters that are neither
    whitespace nor word characters is replaced by a space and the text is
    lowercased. The leaves are then joined in random order.

    Args:
        cln: A JSON-serialisable nested structure describing one collection.

    Returns:
        The cleaned leaves joined by single spaces.
    """
    # Imported locally: no import of `shuffle` is visible in this notebook,
    # so the bare `shuffle(leaves)` call raised NameError.
    import random

    # Round-trip through JSON to get plain dict/list/scalar objects
    hierarchy = json.loads(json.dumps(cln))

    leaves = []
    for path in dict_generator(hierarchy):
        # Tolerate a bare scalar being yielded instead of a path list
        leaf = path[-1] if isinstance(path, list) else path

        # Drop leaves that carry no useful text
        if leaf == '' or leaf is None:
            continue
        if is_number(leaf):
            continue
        if validators.url(leaf):
            continue

        # TODO: Extend
        leaf = re.sub(r'[^\s\w]+', ' ', leaf)
        leaves.append(leaf.lower())

    # Shuffle so that incidental proximity of leaves to each other is not taken into account
    random.shuffle(leaves)
    return ' '.join(leaves)

# Just from looking at the result it seems we should filter out:
#  - Empty leaves
#  - Numbers
#  - URLs and emails (only URLs are filtered so far)
#  - Dates?
    

In [5]:
import re

# WARNING: This may be too reliant on structured data

# Structured output
def struct2Sentence(cln):
    """Turn one collection's metadata into a list of "path sentences".

    Each leaf of ``cln`` produces one sentence made of the keys on its path
    followed by the leaf value. Leaves that are empty, numeric or URLs are
    skipped. In every sentence, each run of characters that are neither
    whitespace nor word characters is replaced by a space, and the result is
    lowercased.

    Args:
        cln: A JSON-serialisable nested structure describing one collection.

    Returns:
        A list of cleaned sentence strings, one per retained leaf.
    """
    # Round-trip through JSON to get plain dict/list/scalar objects
    hierarchy = json.loads(json.dumps(cln))

    sentences = []
    for path in dict_generator(hierarchy):
        # Tolerate a bare scalar being yielded instead of a path list;
        # joining a bare string would otherwise space out its characters.
        if not isinstance(path, list):
            path = [path]
        leaf = path[-1]

        # Drop leaves that carry no useful text
        if leaf == '' or leaf is None:
            continue
        if is_number(leaf):
            continue
        if validators.url(leaf):
            continue

        sentence = ' '.join(path)

        # Replace punctuation runs with a space, then lowercase
        sentence = re.sub(r'[^\s\w]+', ' ', sentence)
        sentences.append(sentence.lower())

    return sentences

# Just from looking at the result it seems we should filter out:
#  - Empty leaves
#  - Numbers
#  - URLs and emails (only URLs are filtered so far)
#  - Dates?
    

In [8]:
import json
import validators
# Convert the first collection into its list of path sentences and print it
cln = metadata[0]
document = struct2Sentence(cln)
print(document)

[u'concept id c1000000000 cddis', u'instrument instruments platform platforms collection shortname doris receiver', u'platform platforms collection shortname cryosat 2', u'platform platforms collection type  ', u'platform platforms collection longname cryosat 2', u'instrument instruments platform platforms collection shortname doris beacon', u'platform platforms collection shortname ground stations', u'platform platforms collection type  ', u'platform platforms collection longname ground stations', u'instrument instruments platform platforms collection shortname doris receiver', u'platform platforms collection shortname hy 2a', u'platform platforms collection type  ', u'platform platforms collection longname haiyang 2a', u'instrument instruments platform platforms collection shortname doris receiver', u'platform platforms collection shortname jason 1', u'platform platforms collection type  ', u'platform platforms collection longname jason 1', u'instrument instruments platform platforms

The next step is to convert collections in-bulk to documents

In [9]:
import multiprocessing

# Parallelize for speed
pool = multiprocessing.Pool()
try:
    sentences = pool.map(struct2Sentence, metadata)
finally:
    # Always shut the worker processes down; the pool was previously left open
    pool.close()
    pool.join()

sentences[0]

[u'concept id c1000000000 cddis',
 u'collection lastupdate 2012 05 31t00 00 00',
 u'collection description the doppler orbitography by radiopositioning integrated on satellite  doris  was developed by the centre national d etudes spatiales  cnes  with cooperation from other french government agencies  the system was developed to provide precise orbit determination and high accuracy location of ground beacons for point positioning  doris is a dual frequency doppler system that has been included as an experiment on various space missions such as topex poseidon  spot 2   3   4  and  5  envisat  and jason satellites  unlike many other navigation systems  doris is based on an uplink device  the receivers are on board the satellite with the transmitters are on the ground  this creates a centralized system in which the complete set of observations is downloaded by the satellite to the ground center  from where they are distributed after editing and processing  an accurate measurment is made o

In [10]:
def flattenGenerator(listOfLists):
    """Lazily yield every element of every inner iterable, in order."""
    flattened = (element for inner in listOfLists for element in inner)
    for element in flattened:
        yield element

sentenceGen = flattenGenerator(sentences)
# Built-in next() works on Python 2.6+ and 3; the .next() method is Python 2 only
next(sentenceGen)

u'concept id c1000000000 cddis'

For word2vec we need a single flat list of sentences, each one split into a list of words.

In [11]:
# Tokenize every sentence of every collection into a list of words.
# split() with no argument splits on any run of whitespace and drops empty
# strings, so tabs/newlines left in the text cannot end up inside (or as) a token,
# which split(' ') allowed.
sentences_flat = [item.split() for sublist in sentences for item in sublist]

Now we can examine document similarity between collection metadata.

In [6]:
from gensim.models import word2vec
# No `import logging` is visible elsewhere in this notebook, so import it here
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

ImportError: No module named gensim.models

Try word2vec on sentences

In [13]:
# Train a word2vec model on the tokenized sentences
model = word2vec.Word2Vec(
    sentences_flat,
    size=100,
    window=5,
    min_count=5,
    workers=7,
)

2018-02-21 21:43:13,182 : INFO : collecting all words and their counts
2018-02-21 21:43:13,186 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-21 21:43:13,236 : INFO : PROGRESS: at sentence #10000, processed 81270 words, keeping 4422 word types
2018-02-21 21:43:13,285 : INFO : PROGRESS: at sentence #20000, processed 167911 words, keeping 6558 word types
2018-02-21 21:43:13,332 : INFO : PROGRESS: at sentence #30000, processed 246934 words, keeping 8342 word types
2018-02-21 21:43:13,385 : INFO : PROGRESS: at sentence #40000, processed 336864 words, keeping 8924 word types
2018-02-21 21:43:13,431 : INFO : PROGRESS: at sentence #50000, processed 417518 words, keeping 10722 word types
2018-02-21 21:43:13,485 : INFO : PROGRESS: at sentence #60000, processed 509788 words, keeping 12744 word types
2018-02-21 21:43:13,542 : INFO : PROGRESS: at sentence #70000, processed 600369 words, keeping 14036 word types
2018-02-21 21:43:13,599 : INFO : PROGRESS: at sente

In [5]:
#model.save('word2vec_structured.m')
from gensim.models import word2vec
# load() is a classmethod of the Word2Vec class; the word2vec module itself
# does not provide a load() function
model = word2vec.Word2Vec.load('simple_model.m')

ImportError: No module named gensim

In [15]:
# Look up the vector for 'rain'; it is not displayed because only the last
# expression of a cell is shown
model.wv['rain']
# Show the words the model ranks as most similar to 'rainfall'
model.wv.most_similar('rainfall')

2018-02-21 21:44:02,982 : INFO : precomputing L2-norms of word weight vectors


[(u'measurement', 0.6895172595977783),
 (u'volume', 0.635983943939209),
 (u'tropical', 0.6309189796447754),
 (u'monoxide', 0.6228217482566833),
 (u'tropospheric', 0.6224027276039124),
 (u'mission', 0.6212568879127502),
 (u'dioxide', 0.6178737878799438),
 (u'component', 0.6072835922241211),
 (u'no2', 0.6064711213111877),
 (u'hydrogen', 0.6028136610984802)]

In [16]:
# All words known to the model, and their vectors in the same order
vocab = list(model.wv.vocab)
# Index through model.wv (as the similarity lookup does) rather than through
# the model object itself, whose item access is deprecated in gensim
X = model.wv[vocab]

In [19]:
from sklearn.manifold import TSNE
# t-SNE is stochastic; fix the seed so the 2-D embedding is reproducible between runs
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

In [1]:
import pickle
#pickle.dump(X_tsne, open('X_tsne.p', 'wb'))
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
# The context manager closes the file handle as soon as loading finishes.
with open('X_tsne.p', 'rb') as tsne_file:
    X_tsne = pickle.load(tsne_file)

In [2]:
import pandas as pd
# One row per vocabulary word with its two embedding coordinates.
# `vocab` must already exist in the kernel (it is built from model.wv.vocab).
df = pd.DataFrame(
    X_tsne,
    index=vocab,
    columns=['x', 'y'],
)

NameError: name 'vocab' is not defined

In [69]:
import matplotlib.patheffects as path_effects
# Reload the already-imported module (presumably a leftover development aid;
# `reload` is a builtin on Python 2 only)
reload(path_effects)

<module 'matplotlib.patheffects' from '/anaconda2/envs/earthdata/lib/python2.7/site-packages/matplotlib/patheffects.pyc'>

In [70]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 6), dpi=150)
ax = fig.add_subplot(1, 1, 1)

ax.scatter(df['x'], df['y'], c='cyan', alpha=0.25)
ax.set_facecolor('black')

# Label only every 160th word so the plot stays readable. Counting starts at 3
# so the same words are selected as with the previous hand-rolled counter
# (initialised to 2 and incremented before the check).
for idx, (word, pos) in enumerate(df.iterrows(), 3):
    if idx % 160 != 0:
        continue

    # NOTE(review): the recorded run of this cell failed with
    # "ValueError: 'vertices' must be a 2D list or array with shape Nx2".
    # Presumably a label without drawable glyphs (e.g. a whitespace-only
    # token) combined with path effects triggers it - TODO confirm.
    # Such labels would be invisible anyway, so skip them.
    if not word.strip():
        continue

    # Pass plain (x, y) numbers instead of the whole row Series
    ax.annotate(word, (pos['x'], pos['y']), color='white', fontweight='bold',
                path_effects=[path_effects.SimpleLineShadow()])

plt.show()



ValueError: 'vertices' must be a 2D list or array with shape Nx2

<matplotlib.figure.Figure at 0x1a33f3c550>

In [59]:
# Save the figure to a file named 'MINE' (no extension is given)
fig.savefig('MINE')

Time for doc2vec

In [None]:
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument

# We need to feed it labeled sentences: every sentence is tagged with the
# index and the long name of the collection it came from
doc_sentences = []
for idx, sentence_list in enumerate(sentences):
    ln = metadata[idx]['Collection']['LongName']

    for sentence in sentence_list:
        # split() splits on any whitespace run and drops empty strings
        ls = TaggedDocument(words=sentence.split(), tags=[unicode(idx), ln])
        doc_sentences.append(ls)



In [None]:
# Preview the first tagged sentence
doc_sentences[0]

In [None]:
# Train doc2vec on the tagged path sentences (this rebinds `model`)
model = doc2vec.Doc2Vec(
    doc_sentences,
    size=100,
    window=8,
    min_count=5,
    workers=7,
)

In [None]:
# Persist the model trained on the structured sentences
model.save('doc2vec_structured.m')

In [None]:
# Now let's see which documents are the most similar to a chosen document (here: 200)

model.docvecs.most_similar(200)

In [None]:
# Show the short name of the collection used as the query above
print(metadata[200]['Collection']['ShortName'])

This works, but what would have happened if we had used a different representation of documents that captured less of the structure?

In [None]:
# NOTE(review): `documents` is first assigned in the cell that follows this one,
# so on a fresh kernel run top-to-bottom this line raises NameError.
# Consider moving this preview after that cell.
documents[0]

In [None]:
import multiprocessing

# Parallelize for speed
pool = multiprocessing.Pool()
try:
    documents = pool.map(struct2Doc, metadata)
finally:
    # Always shut the worker processes down; the pool was previously left open
    pool.close()
    pool.join()

In [None]:
from gensim.models import doc2vec
from gensim.models.doc2vec import TaggedDocument

# We need to feed it labeled documents: every document is tagged with the
# index and the long name of its collection
doc_sentences2 = []
for idx, document in enumerate(documents):
    ln = metadata[idx]['Collection']['LongName']
    # split() splits on any whitespace run and drops empty strings
    sentence = document.split()
    td = TaggedDocument(words=sentence, tags=[unicode(idx), ln])
    doc_sentences2.append(td)

In [None]:
# Train doc2vec on the shuffled bag-of-words documents
model_simplified = doc2vec.Doc2Vec(
    doc_sentences2,
    size=100,
    window=8,
    min_count=5,
    workers=7,
)

In [None]:
# Persist the model trained on the simplified documents
model_simplified.save('doc2vec_simple.m')

Let's see how the results from this simpler document representation compare with those from the structured one.

In [None]:
# Query the simplified model for the documents most similar to document 200
model_simplified.docvecs.most_similar(200)

In [None]:
# Same query against the model trained on the structured sentences
model.docvecs.most_similar(200)

Looks about the same.

Now let's use LDA on the data. First we have to make a corpus.

In [None]:
from gensim import corpora

# One token list per collection document; split() splits on any whitespace
# run and drops empty strings
plain_sentences = [document.split() for document in documents]
# Map every distinct token to an integer id
dictionary = corpora.Dictionary(plain_sentences)
# Bag-of-words representation of each document
corpus = [dictionary.doc2bow(sentence) for sentence in plain_sentences]

In [None]:
# Bind the `gensim` package name: the imports visible in this notebook are all
# of the form `from gensim... import ...`, which does not define `gensim` itself
import gensim.models.ldamodel
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=100, id2word=dictionary)

In [None]:
# Print 10 of the fitted topics with their highest-weighted words
lda.print_topics(10)

topic #25 (0.010): 0.045*"fertilizer" + 0.018*"and" + 0.016*"image" + 0.015*"the" + 0.014*"data" + 0.010*"00z" + 0.009*"p" + 0.009*"daac" + 0.009*"for" + 0.008*"soil"

Clearly this needs a lot more fine-tuning and removal of common words.

In [None]:
# Persist the LDA model fitted on the simplified documents
lda.save('lda_simple.m')