In [4]:
import os
import sys
import nltk
import itertools 

In [5]:
# Make the project checkout importable by adding it to sys.path
# (presumably so the `lisc` package imported in the next cell can be found).
# The location can be overridden with the DDCO_PATH environment variable; the
# original hard-coded location is kept as the fallback so existing setups
# keep working unchanged.
my_path = os.environ.get(
    'DDCO_PATH',
    '/Users/vassiki/Desktop/nhw17/DataDrivenCognitiveOntology/',
)
# Guard against piling up duplicate entries when the cell is re-run.
if my_path not in sys.path:
    sys.path.append(my_path)

In [6]:
# Import core functions
from lisc.base import Base
from lisc.core.db import SCDB
from lisc.data import Data
from lisc.data_all import DataAll

In [7]:
# Add data to path
# Create the lisc SCDB object that holds the project's paths.
# NOTE(review): the meaning of the `False` argument is not visible here — confirm against SCDB.
db = SCDB(False)
# The data folder is `dat` under the current working directory, so the
# notebook has to be started from the directory that contains it.
db.project_path = os.path.join(os.getcwd(), 'dat')
# Presumably derives the remaining paths from project_path, so it has to run
# after the assignment above — TODO confirm.
db.gen_paths()

In [8]:
# Using a single case, for autobiographical recall
# Need associated .json file to run this!
# Create the Data object for the term and load it using the `db` paths object.
# NOTE(review): presumably load() reads the .json file mentioned above from
# one of the paths held by `db` — confirm against lisc.
dat = Data('autobiographical recall')
dat.load(db)

In [9]:
def get_corpus(ds):
    """
    Flatten the words of all articles into one long list and make a corpus out of it.

    Articles whose word list is None are skipped.

    TO DO: Generalize to making a corpus of authors, or any metric the author wants

    Parameters
    ----------
    ds : Data structure returned by lisc when a query is submitted.
        Must provide `n_articles` and an indexable `words`, where each entry
        is either an iterable of tokens or None.

    Returns
    -------
    Corpus : nltk.Text
        The tokens of every article, concatenated in article order.
    """
    numArticles = ds.n_articles
    # Format inside the call: `print(...) % x` only works as a Python 2 print
    # statement and raises a TypeError on Python 3.
    print('Iterating over %d articles for extracting words' % numArticles)

    # `range` exists on both Python 2 and 3; `is not None` is the identity
    # check and does not invoke a custom __ne__ on the word containers.
    aPreCorpus = [ds.words[AIdx] for AIdx in range(numArticles)
                  if ds.words[AIdx] is not None]

    PreText = list(itertools.chain.from_iterable(aPreCorpus))

    return nltk.Text(PreText)
        

def get_common_words(Corpus,NumWords):
    """
    Print a table of the most common words in the concatenated article text
    corpus, followed by the total word count.

    Nothing is returned; the result is only written to standard output.

    Parameters
    ----------
    Corpus: generated from keyword arguments from cognitive atlas
        (any sized iterable of string tokens, e.g. an nltk.Text)
    NumWords: Top N words to print

    Raises
    ------
    AssertionError
        If NumWords is larger than the number of tokens in Corpus.
    """
    assert NumWords <= len(Corpus), 'NumWords exceeds the number of words in Corpus'

    fdist = nltk.FreqDist(Corpus)
    common_words = fdist.most_common(NumWords)
    header = [('Term', 'Frequency'), ('-' * len('Term'), '-' * len('Frequency'))]
    rows = header + common_words

    # Column width is driven by the longest term (first column) plus one space.
    width = max(len(row[0]) for row in rows) + 1
    # Named row_format so the builtin `format` is not shadowed.
    row_format = ('%%-%ds' % width) * len(rows[0])
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('\n'.join(row_format % tuple(row) for row in rows))
    print('\n Total Word Count : %d' % len(Corpus))
    

def get_similarity(Corpus,ds,keyword='memory'):
    """
    Print and return the most similar words, determined by neighborhood in corpus.

    The keyword is tokenized and every token is looked up separately in an
    nltk.text.ContextIndex built from the lower-cased corpus tokens.

    Parameters
    ----------
    Corpus : concatenated list of abstracts (iterable of string tokens)
    ds : output of lisc; not used by this function, kept so existing calls
        keep working
    keyword : query; may contain several words

    Returns
    -------
    list
        The similar words found for each query token, concatenated in token order.

    NOTE: There is a bug in text.similarity in NLTK2.0, we want to use this function to return similar
    words in a ranked fashion, so the ContextIndex is queried directly here.
    """
    idx = nltk.text.ContextIndex([word.lower() for word in Corpus])
    simWords = []
    for word in nltk.word_tokenize(keyword):
        # The index only contains lower-cased tokens, so the query token has
        # to be lower-cased as well or capitalised keywords never match.
        matches = idx.similar_words(word.lower())
        simWords.append(matches)
        print('\n'.join(matches))
    return list(itertools.chain.from_iterable(simWords))

In [16]:
# See how many terms from Cognitive Atlas exist in given Corpus, don't run this now
CA_words = ['abstract reasoning','attention','arousal']
def CA_word_presence(Corpus,CA_terms):
    """
    Print and return the Cognitive Atlas (CA) terms that are present in the corpus.

    TO DO: Generalize to entering tasks

    Parameters
    ----------
    Corpus : concatenated list of abstracts (anything supporting `in`)
    CA_terms : list of CA terms to look for

    Returns
    -------
    list
        The entries of CA_terms found in Corpus, in their original order.

    NOTE(review): membership is tested per corpus element; if the corpus holds
    single-word tokens, multi-word terms such as 'abstract reasoning' cannot
    match — confirm whether phrase matching is wanted.
    """
    # Filter the terms that were passed in. The earlier version filtered the
    # module-level CA_words and called an un-imported `compress`.
    PresentTerms = [term for term in CA_terms if term in Corpus]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('\n'.join(PresentTerms))

    return PresentTerms
   

### Scratchpad

In [None]:
# Flatten the words of all loaded 'autobiographical recall' articles into one corpus.
Corpus = get_corpus(dat)

In [None]:
# Print the 10 most frequent corpus terms and the total word count.
get_common_words(Corpus,10)

In [None]:
# Collect the words get_similarity reports as similar to each token of the query phrase.
sim = get_similarity(Corpus,dat,'autobiographical recall')

In [None]:
def test_neighbors(Corpus,keyword)
    
    plain_list = list(Corpus)
    max_instances = 10

    indices = [i for i, x in enumerate(plain_list) if x == "p300"]

    check_first = indices[:max_instances]

    for i in range(max_instances):
        print('Occurence %d of p300 preceded by %s and followed by %s.') \
        %(check_first[i],plain_list[i-1],plain_list[i+1])

### Visualization with Gensim

In [27]:
import gensim
from gensim import corpora
import pandas as pd
from gensim.models import LdaModel
import pyLDAvis.gensim

In [28]:
# Collect the word list of every article that has one; articles whose words
# entry is None are skipped.
numArticles = dat.n_articles
# `range` works on both Python 2 and 3, and `is not None` is the identity check.
textList = [dat.words[AIdx] for AIdx in range(numArticles)
            if dat.words[AIdx] is not None]
        

In [29]:
# Build a gensim Dictionary (token -> integer id) from the per-article word
# lists and save it to disk; it is loaded again further down for pyLDAvis.
dictionary = corpora.Dictionary(textList)
dictionary.save('dictionary.dict')

In [30]:
# Convert every article into a bag-of-words vector using the dictionary, then
# serialize the vectors to 'corpus.mm' (Matrix Market format).
doc_term_matrix = [dictionary.doc2bow(doc) for doc in textList]
corpora.MmCorpus.serialize('corpus.mm', doc_term_matrix)

In [31]:
# Fit a 3-topic LDA model on the bag-of-words corpus and save it to disk.
Lda = gensim.models.ldamodel.LdaModel

# LDA fitting is stochastic; a fixed random_state makes the fitted topics
# reproducible across kernel restarts.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50,
               random_state=0)

ldamodel.save('topic.model')

In [32]:
# Reload the saved model and print the 4 highest-weighted words of 2 topics.
# NOTE(review): the model is trained with num_topics=3, so one topic is not
# shown here — confirm this is intended.
loading = LdaModel.load('topic.model')
print(loading.print_topics(num_topics=2, num_words=4))

[(0, u'0.026*"autobiographical" + 0.021*"memory" + 0.015*"recall" + 0.010*"memories"'), (1, u'0.018*"mood" + 0.012*"induction" + 0.009*"study" + 0.009*"j"')]


In [33]:
# Turn on inline display of pyLDAvis visualizations in the notebook.
pyLDAvis.enable_notebook()

In [34]:
# Reload the three artifacts written to disk earlier in the notebook:
# d   - the token dictionary ('dictionary.dict')
# c   - the serialized bag-of-words corpus ('corpus.mm')
# lda - the trained LDA model ('topic.model')
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('topic.model')

In [35]:
# Prepare the pyLDAvis visualization data from the model, corpus and dictionary.
data = pyLDAvis.gensim.prepare(lda, c, d)

In [36]:
# Bare expression: the notebook shows the prepared visualization as this cell's output.
data