In [5]:
DATA_FILE       = '../data/nasa.p'
PREFIX          = 'nasa/nasa_lda_'

In [6]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [18]:
import time
from datetime import datetime as dt
import json
import pickle
import random
from os.path import join
from pathlib import Path
import logging

from cleaning.serialize import struct2sentence
import stdlog
from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [8]:
# Load pickled dataset in entirety
long_names, metadata = pickle.load(open(DATA_FILE, 'rb'))

In [9]:
import multiprocessing

# Parallelize serialization of data into sentences
pool = multiprocessing.Pool()
sentences_2d = pool.map(struct2sentence, metadata)

In [11]:
sentences = [list(filter(None, item.split(' '))) for sublist in sentences_2d for item in sublist]

In [25]:
# Create a corpus
dictionary = corpora.Dictionary(sentences)
corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

2018-05-08 16:55:17,687 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-05-08 16:55:17,833 : INFO : adding document #10000 to Dictionary(4421 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 16:55:17,977 : INFO : adding document #20000 to Dictionary(6483 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 16:55:18,132 : INFO : adding document #30000 to Dictionary(8344 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 16:55:18,294 : INFO : adding document #40000 to Dictionary(8925 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 16:55:18,450 : INFO : adding document #50000 to Dictionary(10728 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 16:55:18,618 : INFO : adding document #60000 to Dictionary(12750 unique tokens: ['c1000000000', 'cddis', 'concept', 'id', 'collection']...)
2018-05-08 

In [26]:
# Create and train a new model
lda = LdaModel(corpus=corpus, num_topics=40, id2word=dictionary)

2018-05-08 16:55:27,216 : INFO : using symmetric alpha at 0.025
2018-05-08 16:55:27,217 : INFO : using symmetric eta at 0.025
2018-05-08 16:55:27,224 : INFO : using serial LDA version on this node
2018-05-08 16:55:27,380 : INFO : running online (single-pass) LDA training, 40 topics, 1 passes over the supplied corpus of 243409 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2018-05-08 16:55:27,382 : INFO : PROGRESS: pass 0, at document #2000/243409
2018-05-08 16:55:28,037 : INFO : merging changes from 2000 documents into a model of 243409 documents
2018-05-08 16:55:28,184 : INFO : topic #31 (0.025): 0.149*"collection" + 0.047*"cddis" + 0.043*"platforms" + 0.043*"platform" + 0.034*"00" + 0.032*"longname" + 0.023*"archivecenter" + 0.021*"shortname" + 0.017*"contacts" + 0.017*"product"
2018-05-08 16:55:28,186 : INFO : topic #20 (0.025): 0.119*"collection" + 0.075*"contact" + 0.075*"cont

In [27]:
lda.save(PREFIX + 'basic.m')
#model = LdaModel.load(PREFIX + 'basic.m')

2018-05-08 16:56:58,217 : INFO : saving LdaState object under nasa/nasa_lda_basic.m.state, separately None
2018-05-08 16:56:58,270 : INFO : saved nasa/nasa_lda_basic.m.state
2018-05-08 16:56:58,289 : INFO : saving LdaModel object under nasa/nasa_lda_basic.m, separately ['expElogbeta', 'sstats']
2018-05-08 16:56:58,290 : INFO : storing np array 'expElogbeta' to nasa/nasa_lda_basic.m.expElogbeta.npy
2018-05-08 16:56:58,302 : INFO : not storing attribute dispatcher
2018-05-08 16:56:58,302 : INFO : not storing attribute state
2018-05-08 16:56:58,303 : INFO : not storing attribute id2word
2018-05-08 16:56:58,305 : INFO : saved nasa/nasa_lda_basic.m


Investigation of model performance

In [28]:
# Now lets see which is the most similiar to a chosen document
lda.print_topics(10)

2018-05-08 16:57:03,842 : INFO : topic #32 (0.025): 0.357*"shortname" + 0.242*"collection" + 0.062*"30t00" + 0.041*"02t00" + 0.011*"mgg" + 0.009*"hazard_images_database" + 0.007*"05t00" + 0.006*"aa" + 0.006*"3" + 0.005*"buoys"
2018-05-08 16:57:03,843 : INFO : topic #15 (0.025): 0.072*"the" + 0.062*"e" + 0.057*"r" + 0.047*"s" + 0.043*"set" + 0.039*"a" + 0.032*"u" + 0.029*"d" + 0.024*"to" + 0.024*"of"
2018-05-08 16:57:03,844 : INFO : topic #39 (0.025): 0.139*"02" + 0.111*"collection" + 0.102*"2018" + 0.080*"2015" + 0.062*"29" + 0.058*"lastupdate" + 0.055*"inserttime" + 0.048*"09" + 0.044*"28" + 0.025*"13t21"
2018-05-08 16:57:03,846 : INFO : topic #9 (0.025): 0.118*"gov" + 0.094*"contacts" + 0.094*"contact" + 0.081*"email" + 0.081*"organizationemails" + 0.077*"collection" + 0.057*"nasa" + 0.029*"research" + 0.027*"technique" + 0.026*"gsfc"
2018-05-08 16:57:03,848 : INFO : topic #17 (0.025): 0.174*"science" + 0.168*"earth" + 0.163*"collection" + 0.161*"sciencekeywords" + 0.161*"sciencekeyw

[(32,
  '0.357*"shortname" + 0.242*"collection" + 0.062*"30t00" + 0.041*"02t00" + 0.011*"mgg" + 0.009*"hazard_images_database" + 0.007*"05t00" + 0.006*"aa" + 0.006*"3" + 0.005*"buoys"'),
 (15,
  '0.072*"the" + 0.062*"e" + 0.057*"r" + 0.047*"s" + 0.043*"set" + 0.039*"a" + 0.032*"u" + 0.029*"d" + 0.024*"to" + 0.024*"of"'),
 (39,
  '0.139*"02" + 0.111*"collection" + 0.102*"2018" + 0.080*"2015" + 0.062*"29" + 0.058*"lastupdate" + 0.055*"inserttime" + 0.048*"09" + 0.044*"28" + 0.025*"13t21"'),
 (9,
  '0.118*"gov" + 0.094*"contacts" + 0.094*"contact" + 0.081*"email" + 0.081*"organizationemails" + 0.077*"collection" + 0.057*"nasa" + 0.029*"research" + 0.027*"technique" + 0.026*"gsfc"'),
 (17,
  '0.174*"science" + 0.168*"earth" + 0.163*"collection" + 0.161*"sciencekeywords" + 0.161*"sciencekeyword" + 0.160*"categorykeyword" + 0.000*"ellipsoid" + 0.000*"usages" + 0.000*"hydraulic" + 0.000*"directionality"'),
 (24,
  '0.130*"collection" + 0.097*"datasetid" + 0.095*"revisiondate" + 0.073*"2017" +