# Load up Gensim

In [1]:
import logging
# Configure root logging: timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities

### Import a basic corpus

In [2]:
# Toy corpus used throughout this notebook: nine short titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## From Strings to Vectors [1]

### Remove common words and tokenize

In [3]:
# Common words that are dropped from every document during tokenization.
stoplist = set('for a of the and to in'.split())

# Lowercase each document, split it on whitespace and keep only the non-stopwords.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

### Remove words that appear only once

In [4]:
from collections import defaultdict

# First pass: tally how often each token occurs across all documents.
frequency = defaultdict(int)
for token_list in texts:
    for tok in token_list:
        frequency[tok] = frequency[tok] + 1

# Second pass: keep only the tokens that were counted more than once.
filtered_texts = []
for token_list in texts:
    filtered_texts.append([tok for tok in token_list if frequency[tok] > 1])
texts = filtered_texts

In [5]:
from pprint import pprint # pretty-printer

pprint(texts) # show the filtered token lists, one document per row

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


### Converting documents to vectors using the bag-of-words representation

In [6]:
# Build the word <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
dictionary.save('Tutorial/deerwester.dict') # store the dictionary on disk for future reference
print(dictionary)

Dictionary(12 unique tokens: ['eps', 'survey', 'user', 'system', 'human']...)


In [7]:
print(dictionary.token2id) # print the mapping from each word to its integer id

{'eps': 8, 'survey': 5, 'user': 6, 'system': 7, 'human': 0, 'trees': 9, 'minors': 11, 'graph': 10, 'interface': 2, 'response': 3, 'computer': 1, 'time': 4}


In [8]:
new_doc = "Human computer interaction" # example input
# Tokenize the same way as the corpus (lowercase + whitespace split) before converting.
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec) # "interaction" is not in the dictionary, so it is left out of the vector

[(0, 1), (1, 1)]


In [9]:
# Convert every tokenized document into its bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('Tutorial/deerwester.mm', corpus) # store on disk for later use
print(corpus)

[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (7, 2), (8, 1)], [(3, 1), (4, 1), (6, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]


## Corpus Streaming - One Document at a Time

### Making the corpus memory friendly, so only one vector is stored in RAM at a time

In [10]:
class MyCorpus(object):
    """Stream a text corpus from disk as bag-of-words vectors, one document at a time.

    The file is expected to hold one document per line, with tokens separated
    by whitespace. Each iteration re-opens the file, so the corpus can be
    iterated more than once, and only the current line is held in memory.

    Parameters
    ----------
    path : str, optional
        Location of the corpus text file. Defaults to 'Tutorial/mycorpus.txt'.
    dictionary : object with a ``doc2bow(tokens)`` method, optional
        Mapping used to convert token lists to vectors. When omitted, the
        notebook-level ``dictionary`` variable is looked up at iteration time.
    """

    def __init__(self, path='Tutorial/mycorpus.txt', dictionary=None):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        """Yield the bag-of-words vector of each line in the corpus file."""
        # Resolve the fallback lazily so a `dictionary` rebuilt in a later cell is picked up.
        id_mapping = self.dictionary if self.dictionary is not None else dictionary
        # The context manager closes the file handle when iteration finishes or
        # the generator is discarded; a bare open() would leave it dangling.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield id_mapping.doc2bow(line.lower().split())

In [11]:
corpus_memory_friendly = MyCorpus() # creating the object does not load the corpus into memory
print(corpus_memory_friendly) # not useful: only shows the default object repr, not the documents

<__main__.MyCorpus object at 0x108fa2da0>


In [12]:
for vector in corpus_memory_friendly: # each iteration yields one document vector at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (6, 1), (7, 1), (8, 1)]
[(0, 1), (7, 2), (8, 1)]
[(3, 1), (4, 1), (6, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(5, 1), (10, 1), (11, 1)]


### Constructing the dictionary without loading all texts into memory

In [13]:
# Collect statistics about all tokens, streaming the file one line at a time.
# The `with` block closes the file handle once the dictionary has been built;
# a bare open() inside the generator expression would never be closed explicitly.
with open('Tutorial/mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Ids of the stopwords that actually occur in the corpus.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]

# Ids of the tokens whose entry in `dictionary.dfs` is 1.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

dictionary.filter_tokens(stop_ids + once_ids) # remove stopwords and words occurring once
dictionary.compactify() # remove gaps in the id sequence left by the removed words

print(dictionary)

Dictionary(12 unique tokens: ['eps', 'survey', 'user', 'system', 'human']...)


# Topics and Transformations

## Transformation interface

In [14]:
import logging
# Configure root logging: timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities
# Reload the dictionary and corpus files written to the Tutorial/ directory earlier in the notebook.
dictionary = corpora.Dictionary.load('Tutorial/deerwester.dict')
corpus = corpora.MmCorpus('Tutorial/deerwester.mm')

print(corpus) # the previously saved corpus: documents represented as a stream of vectors

MmCorpus(9 documents, 12 features, 28 non-zero entries)


### Transforming vectors

In [15]:
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
# tfidf is treated as a read-only object that can be used to convert any vector from the old
# representation (bag-of-words integer counts) to the new representation (TfIdf real-valued weights)

In [16]:
doc_bow = [(0, 1), (1, 1)] # test input: a bag-of-words vector with ids 0 and 1, each counted once
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [17]:
# Apply the same transformation to the whole corpus and print each transformed document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(0, 0.49182558987264147), (7, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (6, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [18]:
# Transformations can also be chained, one on top of another

# initialize an LSI transformation with two topics, built from the TfIdf-weighted corpus
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) 
# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi[corpus_tfidf] 

In [19]:
lsi.print_topics(2) # show both topics (num_topics=2 was set when the model was created above)

['0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"',
 '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"']

In [20]:
#It appears that according to LSI, “trees”, “graph” and “minors” are all related words (and contribute the most to the direction of the first topic), while the second topic practically concerns itself with all the other words

In [21]:
for doc in corpus_lsi:
    print(doc) # both the bow->tfidf and tfidf->lsi transformations are executed here, on the fly

[(0, 0.066007833960907758), (1, -0.52007033063618469)]
[(0, 0.19667592859143146), (1, -0.76095631677000275)]
[(0, 0.089926399724470377), (1, -0.72418606267525032)]
[(0, 0.075858476521786899), (1, -0.63205515860034256)]
[(0, 0.1015029918498063), (1, -0.57373084830029419)]
[(0, 0.70321089393782965), (1, 0.1611518021402637)]
[(0, 0.87747876731198149), (1, 0.1675890686466012)]
[(0, 0.90986246868185638), (1, 0.14086553628719753)]
[(0, 0.61658253505692828), (1, -0.053929075663888673)]


In [22]:
# Reference LSI vectors for the nine documents, rounded to three decimals, each
# annotated with the document title it belongs to.
# NOTE(review): every sign here is the opposite of what the previous cell printed;
# presumably these values were copied from another run or from the original
# tutorial text -- confirm before relying on them.
# These are bare expressions, so only the last list is echoed as the cell output.
[(0, -0.066), (1, 0.520)] # "Human machine interface for lab abc computer applications"
[(0, -0.197), (1, 0.761)] # "A survey of user opinion of computer system response time"
[(0, -0.090), (1, 0.724)] # "The EPS user interface management system"
[(0, -0.076), (1, 0.632)] # "System and human system engineering testing of EPS"
[(0, -0.102), (1, 0.574)] # "Relation of user perceived response time to error measurement"
[(0, -0.703), (1, -0.161)] # "The generation of random binary unordered trees"
[(0, -0.877), (1, -0.168)] # "The intersection graph of paths in trees"
[(0, -0.910), (1, -0.141)] # "Graph minors IV Widths of trees and well quasi ordering"
[(0, -0.617), (1, 0.054)] # "Graph minors A survey"

[(0, -0.617), (1, 0.054)]

In [23]:
# Persist the LSI model to disk; the same save/load pattern applies to tfidf, lda, ...
lsi.save('Tutorial/model.lsi')
# Load the saved model back into `lsi` so the following cells use the reloaded model.
lsi = models.LsiModel.load('Tutorial/model.lsi')

### See Evernote for a list of available transformations!
### https://www.evernote.com/l/AELRgoeKjwBILpZxqkIPoQdpCg_wS8BIGuM

# Similarity Queries

## Similarity interface

In [24]:
import logging
# Configure root logging: timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities
# Reload the dictionary and corpus files written to the Tutorial/ directory earlier in the notebook.
dictionary = corpora.Dictionary.load('Tutorial/deerwester.dict')
corpus = corpora.MmCorpus('Tutorial/deerwester.mm')

print(corpus) # the previously saved corpus: documents represented as a stream of vectors

MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [25]:
# Build a 2-topic LSI model directly from the reloaded corpus (no TfIdf step in this section).
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

In [26]:
# Goal: rank the documents by relevance to a query, using a semantic extension
# of plain boolean keyword matching.

doc = "Human computer interaction" # the query we want to rank the documents against
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space

print(vec_lsi)

[(0, 0.46182100453271524), (1, 0.070027665279001478)]


### Initializing query structures

In [27]:
# Transform the corpus to LSI space and build a similarity index over it.
index = similarities.MatrixSimilarity(lsi[corpus]) 



In [28]:
# Persist the index to disk, then load it back from the same file.
index.save('Tutorial/deerwester.index')
index = similarities.MatrixSimilarity.load('Tutorial/deerwester.index')

### Performing queries

In [29]:
sims = index[vec_lsi] # perform a similarity query of the LSI-space query vector against the corpus
print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples

[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.098794639), (8, 0.050041765)]


In [30]:
# Rank the (document number, similarity score) pairs from most to least similar.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
print(sims) # sorted (document number, similarity score) pairs

[(2, 0.99844527), (0, 0.99809301), (3, 0.9865886), (1, 0.93748635), (4, 0.90755945), (8, 0.050041765), (7, -0.098794639), (6, -0.10639259), (5, -0.12416792)]


In [31]:
# Reload the dictionary and corpus from their files in the Tutorial/ directory.
dictionary = corpora.Dictionary.load('Tutorial/deerwester.dict')
corpus = corpora.MmCorpus('Tutorial/deerwester.mm')

In [32]:
# Look up the word stored under id 1.
dictionary[1]

'computer'

In [33]:
# Look up the word stored under id 2.
dictionary[2]

'interface'