In [2]:
DATA_FILE       = '../data/nasa.p'
PREFIX          = 'nasa/nasa_d2v_'

def extract_title(datum):
    return datum['Collection']['ShortName'] + ' ' + datum['Collection']['LongName']

In [3]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
import time
from datetime import datetime as dt
import json
import pickle
import random
from os.path import join
from pathlib import Path
import logging

from cleaning.serialize import struct2sentence
import stdlog
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

2018-05-08 16:47:35,445 : INFO : 'pattern' package not found; tag filters are not available for English


In [5]:
# Load pickled dataset in entirety
long_names, metadata = pickle.load(open(DATA_FILE, 'rb'))

In [6]:
import multiprocessing

# Parallelize serialization of data into sentences
pool = multiprocessing.Pool()
sentences_2d = pool.map(struct2sentence, metadata)

In [7]:
# We need to feed it labeled sentences
doc_sentences = []
for idx, sentence_list in enumerate(sentences_2d):
    
    # This is dataset specific
    ln = extract_title(metadata[idx])
    
    for sentence in sentence_list:
        words = list(filter(None, sentence.split(' ')))
        ls = TaggedDocument(words=words, tags=[str(idx), ln])
        doc_sentences.append(ls)

In [19]:
# Create and train a new model
model = Doc2Vec(doc_sentences, size=100, window=8, min_count=5, workers=7)

2018-05-08 16:35:17,954 : INFO : collecting all words and their counts
2018-05-08 16:35:17,954 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-08 16:35:18,041 : INFO : PROGRESS: at example #10000, processed 81182 words (943698/s), 4421 word types, 146 tags
2018-05-08 16:35:18,124 : INFO : PROGRESS: at example #20000, processed 167233 words (1052366/s), 6483 word types, 245 tags
2018-05-08 16:35:18,216 : INFO : PROGRESS: at example #30000, processed 246946 words (873045/s), 8344 word types, 305 tags
2018-05-08 16:35:18,308 : INFO : PROGRESS: at example #40000, processed 336871 words (985680/s), 8925 word types, 425 tags
2018-05-08 16:35:18,394 : INFO : PROGRESS: at example #50000, processed 417503 words (950176/s), 10728 word types, 583 tags
2018-05-08 16:35:18,485 : INFO : PROGRESS: at example #60000, processed 509774 words (1024970/s), 12750 word types, 722 tags
2018-05-08 16:35:18,577 : INFO : PROGRESS: at example #70000, processed 600283 words

In [8]:
#model.save(PREFIX + 'basic.m')
model = Doc2Vec.load(PREFIX + 'basic.m')

2018-05-08 16:47:52,510 : INFO : loading Doc2Vec object from nasa/nasa_d2v_basic.m
2018-05-08 16:47:52,597 : INFO : loading vocabulary recursively from nasa/nasa_d2v_basic.m.vocabulary.* with mmap=None
2018-05-08 16:47:52,598 : INFO : loading trainables recursively from nasa/nasa_d2v_basic.m.trainables.* with mmap=None
2018-05-08 16:47:52,599 : INFO : loading wv recursively from nasa/nasa_d2v_basic.m.wv.* with mmap=None
2018-05-08 16:47:52,599 : INFO : loading docvecs recursively from nasa/nasa_d2v_basic.m.docvecs.* with mmap=None
2018-05-08 16:47:52,600 : INFO : loaded nasa/nasa_d2v_basic.m


Investigation of model performance

In [9]:
# Now lets see which is the most similiar to a chosen document
model.docvecs.most_similar(300) 

2018-05-08 16:47:56,045 : INFO : precomputing L2-norms of doc weight vectors


[('NACP VPRM NEE Parameters Optimized to North American Flux Tower Sites, 2000-2006',
  0.9999034404754639),
 ('MODIS NDVI Data, Smoothed and Gap-filled, for the Conterminous US: 2000-2015',
  0.929338276386261),
 ('224', 0.9284223318099976),
 ('CARVE: CH4, CO2, and CO Atmospheric Concentrations, CARVE Tower, Alaska, 2012-2014',
  0.9039781093597412),
 ('530', 0.9038670063018799),
 ('373', 0.9015885591506958),
 ('Airborne Multi-angle Imaging SpectroRadiometer measurements taken over Wisconsin and the ARM/CART site in Oklahoma. (AIRMISR_WISCONSIN_2000)',
  0.9012858867645264),
 ('948', 0.899218738079071),
 ('SNF Leaf Optical Properties: Cary-14', 0.8980725407600403),
 ('FLUXNET Canada Research Network - Canadian Carbon Program Data Collection, 1993-2014',
  0.8970130681991577)]

In [10]:
print(metadata[300]['Collection']['LongName'])

MISR Level 3 Component Global Land Regional public Product covering a day
