# GENSIM

In [1]:
# DOCUMENT: a piece of text. The toy corpus below is a list of nine
# short documents, one string per document.
text_corpus = [
    "Human machine interface for lab for abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "system and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The interaction graph of paths in trees",
    "Graph minors IV widths of trees and well quasi ordering",
    "Graph minors a survey",
]

In [2]:
# VECTOR: a mathematically convenient representation of a document

import pprint

In [3]:
# Stopwords: very frequent words that are dropped during tokenisation.
stoplist = set('for a of the and to in'.split(' '))

# Tokenise every document: lowercase it, split on whitespace, and keep
# only the tokens that are not stopwords.
texts = [
    [token for token in sentence.lower().split() if token not in stoplist]
    for sentence in text_corpus
]
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['interaction', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [13]:
# Count how many times each token occurs across the whole corpus.
from collections import defaultdict

frequency = defaultdict(int)

# Walk the tokens of all documents in order; missing keys start at 0.
for token in (word for text in texts for word in text):
    frequency[token] += 1

print(frequency)

defaultdict(<class 'int'>, {'human': 2, 'machine': 1, 'interface': 2, 'lab': 1, 'abc': 1, 'computer': 2, 'applications': 1, 'survey': 2, 'user': 3, 'opinion': 1, 'system': 4, 'response': 2, 'time': 2, 'eps': 2, 'management': 1, 'engineering': 1, 'testing': 1, 'relation': 1, 'perceived': 1, 'error': 1, 'measurement': 1, 'generation': 1, 'random': 1, 'binary': 1, 'unordered': 1, 'trees': 3, 'interaction': 1, 'graph': 3, 'paths': 1, 'minors': 2, 'iv': 1, 'widths': 1, 'well': 1, 'quasi': 1, 'ordering': 1})


In [14]:
# Drop tokens that occur only once in the corpus; keep the rest in
# their original order, document by document.
processed_corpus = [
    [word for word in doc if frequency[word] > 1]
    for doc in texts
]
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [15]:
# Build a gensim Dictionary from the processed corpus. It assigns an
# integer id to every unique token, which later steps (bag-of-words,
# topic modelling) rely on.
from gensim import corpora

dictionary = corpora.Dictionary(documents=processed_corpus)
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [16]:
# Show the token -> integer id mapping held by the dictionary.
print(pprint.pformat(dictionary.token2id))

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}



MODEL: an algorithm for transforming vectors from one representation to another

In [18]:
# Always make a practice of testing on small texts when trying something new.
# In each tuple printed below, the first element is the token id and the
# second element is the number of occurrences of that token.

new_doc = "Human computer interface computer"
new_doc2 = "Human computer Interaction computer"

# Lowercase + whitespace-split each document, then convert it to a
# bag-of-words vector using the existing dictionary.
new_vec = dictionary.doc2bow(new_doc.lower().split())
new_vec2 = dictionary.doc2bow(new_doc2.lower().split())

print(new_vec)
print(new_vec2)  # "interaction" is not in the dictionary, so it is not shown

[(0, 2), (1, 1), (2, 1)]
[(0, 2), (1, 1)]


In [19]:
# Convert every processed document into its bag-of-words vector:
# a list of (token id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [16]:
# TF-IDF: re-weight bag-of-words vectors.
from gensim import models

# Train the model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

# Transform the "system minors" string: tokenise it, convert it to a
# bag-of-words vector, and apply the trained model. The result is a list
# of (token id, weight) pairs.
words = 'system minors'.lower().split()

print(tfidf[dictionary.doc2bow(words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [25]:

# Build a Dictionary directly from a text file, one document per line.
import gensim
from gensim import corpora
# NOTE(review): this rebinds the name `pprint` from the module (brought in
# earlier with `import pprint`) to the function of the same name. Earlier
# cells that call `pprint.pprint(...)` will raise AttributeError if they
# are re-run after this cell in the same kernel.
from pprint import pprint
from gensim.utils import simple_preprocess

import os

# NOTE(review): sample.txt is written by the cell that defines `tex`, which
# appears below this one; on a fresh checkout run that cell first.
# The file is opened in a `with` block so the handle is closed as soon as
# the Dictionary has consumed every line.
with open(r'sample.txt') as sample_file:
    dict_text = corpora.Dictionary(simple_preprocess(line) for line in sample_file)
print(dict_text.token2id)

{'abc': 0, 'applications': 1, 'computer': 2, 'for': 3, 'human': 4, 'interface': 5, 'lab': 6, 'machine': 7, 'of': 8, 'opinion': 9, 'response': 10, 'survey': 11, 'system': 12, 'time': 13, 'user': 14}


In [22]:
# Sample text saved to sample.txt, the file that the Dictionary-from-file
# cell reads. Both sentences are written on a single line.
tex= "Human machine interface for lab for abc computer applications, A survey of user opinion of computer system response time"

# Use a context manager so the file is flushed and closed even if the
# write raises.
with open('sample.txt', 'w') as f:
    f.write(tex)