In [1]:
from IPython.lib import kernel

kernel.get_connection_file()

'/run/user/1000/jupyter/kernel-cf852721-0121-4e17-8bd8-b00f921f02d9.json'

In [2]:
from gensim.corpora import Dictionary
from utilities.LabeledDocuments import LabeledDocument
from gensim import corpora
from os.path import isfile

# inspired by http://radimrehurek.com/gensim/tut1.html#corpus-formats

class MyCorpus():
    def __init__(self, path='samples/', \
                 dictFile='data/MMCorpus/econstor.dict', \
                 corpusFile='data/MMCorpus/corpus.mm', \
                 worker = 4):
        self.path = path
        self.worker = worker
        
        if isfile(dictFile):
            self.dictionary = Dictionary.load(dictFile)
        else:
            labeledDocs = LabeledDocument(wd=self.path, workers=self.worker)
            
            # because LabeledDocument yields gensim.models.doc2vec.LabeledSentence
            # objects and we are only interested in the words property
            docGen = (d.words for d in labeledDocs)
            dictionary = Dictionary(iter(docGen))
            dictionary.save(dictFile)
            self.dictionary = dictionary
            
        if isfile(corpusFile):
            self.corpus = corpora.MmCorpus(corpusFile)
        else:
            corpora.MmCorpus.serialize(corpusFile, self)
            self.corpus = corpora.MmCorpus(corpusFile)
            
    
    def __getitem__(self, name):
        if name == 'dictionary':
            return self.dictionary
        elif name == 'corpus':
            return self.corpus
        else:
            raise AttributeError('This object has no attribute {}.'.format(name))
    
    def __len__(self):
        return len(self.corpus)
    
    def __iter__(self):
        labeledDocs = LabeledDocument(wd=self.path, workers=self.worker)
        for d in labeledDocs:
            yield self.dictionary.doc2bow(d.words)

In [3]:
from gensim.models.ldamulticore import LdaMulticore

LDAFile = 'data/LDA/LDAModel.lda'
numTopics = 50
lda = ""
if isfile(LDAFile):
    lda = LdaMulticore.load(LDAFile)
else:
    myCorpus = MyCorpus(path='data/samples')
    lda = LdaMulticore(myCorpus, id2word=myCorpus.dictionary,\
                       num_topics=numTopics, workers=4, passes=10, \
                       batch=True)
    lda.save(LDAFile)

INFO:gensim.utils:loading Dictionary object from data/MMCorpus/econstor.dict
INFO:gensim.corpora.indexedcorpus:loaded corpus index from data/MMCorpus/corpus.mm.index
INFO:gensim.matutils:initializing corpus reader from data/MMCorpus/corpus.mm
INFO:gensim.matutils:accepted corpus with 5005 documents, 874774 features, 7669943 non-zero entries
INFO:gensim.models.ldamodel:using symmetric alpha at 0.02
INFO:gensim.models.ldamodel:using symmetric eta at 0.02
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamulticore:running batch LDA training, 50 topics, 10 passes over the supplied corpus of 5005 documents, updating every 5005 documents, evaluating every ~5005 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamulticore:training LDA model using 4 processes
INFO:gensim.models.ldamulticore:PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/5005, outstanding queue size 1
INFO:gensim.models.ldamulticore:PROGRE

In [4]:
myCorpus = MyCorpus(path='data/samples')

INFO:gensim.utils:loading Dictionary object from data/MMCorpus/econstor.dict
INFO:gensim.corpora.indexedcorpus:loaded corpus index from data/MMCorpus/corpus.mm.index
INFO:gensim.matutils:initializing corpus reader from data/MMCorpus/corpus.mm
INFO:gensim.matutils:accepted corpus with 5005 documents, 874774 features, 7669943 non-zero entries


In [5]:
'print words of all topics'
words = []
for topic in range(1,numTopics):
    words = [myCorpus.dictionary[word[0]] for word in lda.get_topic_terms(topic)]
    print(words)

['effect', 'offshoring', 'para', 'cost', 'variable', 'task', 'group', 'datum', 'trade', 'breastfeeding']
['cost', 'immigrant', 'price', 'time', 'year', 'economic', 'market', 'reform', 'emission', 'state']
['treatment', 'subject', 'experiment', 'group', 'decision', 'participant', 'game', 'economic', 'individual', 'behavior']
['market', 'bank', 'policy', 'economic', 'development', 'government', 'state', 'service', 'also', 'company']
['city', 'region', 'urban', 'spatial', 'local', 'area', 'water', 'population', 'space', 'level']
['firm', 'loan', 'sector', 'table', 'result', 'variable', 'effect', 'cost', 'economic', 'export']
['test', 'assumption', 'function', 'estimator', 'rating', 'theorem', 'model', 'result', 'distribution', 'lemma']
['table', 'column', 'waste', 'coin', 'feedstock', 'cost', 'area', 'european', 'price', 'value']
['country', 'migration', 'immigrant', 'migrant', 'labmy', 'economic', 'immigration', 'population', 'education', 'employment']
['bank', 'banking', 'financial', 'r

In [6]:
'print most frequent words'
from itertools import chain
words = []
for topic in range(1,numTopics):
    words.append([myCorpus.dictionary[word[0]] for word in lda.get_topic_terms(topic)])

words = chain(*words)
wordCount = {}
for word in words:
    if word in wordCount:
        wordCount[word] += 1
    else:
        wordCount[word] = 1
        
sortedWords = sorted(wordCount.items(), key=lambda x: x[1])
for a in reversed(sortedWords):
    print(a)

('effect', 24)
('economic', 23)
('market', 13)
('model', 10)
('country', 9)
('price', 9)
('table', 9)
('result', 9)
('cost', 8)
('policy', 8)
('rate', 8)
('year', 7)
('datum', 7)
('variable', 7)
('level', 6)
('time', 5)
('individual', 5)
('case', 5)
('group', 5)
('education', 5)
('income', 5)
('financial', 4)
('also', 4)
('firm', 4)
('bank', 4)
('that', 4)
('risk', 3)
('treatment', 3)
('value', 3)
('service', 3)
('with', 3)
('product', 3)
('area', 3)
('school', 3)
('immigrant', 3)
('equilibrium', 3)
('social', 3)
('thi', 3)
('household', 3)
('change', 3)
('asset', 3)
('state', 3)
('climate', 2)
('research', 2)
('trade', 2)
('government', 2)
('which', 2)
('care', 2)
('growth', 2)
('loan', 2)
('trust', 2)
('function', 2)
('health', 2)
('child', 2)
('network', 2)
('game', 2)
('labor', 2)
('increase', 2)
('city', 2)
('employment', 2)
('emission', 2)
('crop', 2)
('distribution', 2)
('region', 2)
('student', 2)
('study', 2)
('test', 2)
('development', 2)
('population', 2)
('energy', 2)
('fro

In [7]:
from os.path import isdir
import os
from collections import namedtuple
import json

class ConceptCorpus():
    def __init__(self, topicModel, path='samples'):
        if isdir(path):
            self.path = path
        else:
            raise AttributeError("{} is not a directory".format(path))
        
        self.topicModel = topicModel
        self.keywords = set()
        
        for f in self._fileIter():
            if 'subject' in f:
                self.keywords.update(f['subject'])
        self.keyword = list(self.keyword)

        
    def _fileIter():
        '''
        iterates over all files in self.path. return their content as python object.
        '''
        for f in os.scandir(self.path):
            if f.is_file():
                with open(f.name) as fd:
                    yield json.load(fd)
        
    def __getitem__(self, name):
        if name == keywords:
            return self.keywords
        else:
            raise AttributeError('This object has no attribute {}.'.format(name))
        
        
    def __iter__(self):
        for content in self._fileIter():
            if 'subject' in content and 'plaintext' in f:
                topicDistribution = lda.get_document_topics( \
                                        myCorpus.dictionary.doc2bow(f['plaintext'].split()))
                # return all that in a namedtuple

In [8]:
def fitConcepts(alpha=.5, workers=4, passes=25)

SyntaxError: invalid syntax (<ipython-input-8-b9190031ae62>, line 1)