# Gensim Tutorial

In [13]:
# Corpora and Vector Spaces: https://radimrehurek.com/gensim/tut1.html
#----------------------------------------------------------------------
import logging
# Configure logging at INFO level with a "timestamp : level : message" line format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# corpora and models are used in later cells; similarities is not referenced in the cells shown here.
from gensim import corpora, models, similarities
# Let's create a small corpus of short documents, as in the gensim tutorial from the link above.
# The text comes from the Wikipedia article on deep learning: https://en.wikipedia.org/wiki/Deep_learning
# We have 10 documents.
# NOTE(review): not every entry is a single sentence. documents[2] holds the same text as
# documents[4], documents[5] and documents[6] combined, and documents[3] repeats documents[1]
# (only the whitespace differs), so the word counts computed later include these repeats.
# The backslash continuations sit inside the string literals, so the leading spaces of each
# continued line are part of the document text.
documents = [
    "Deep learning (deep structured learning, hierarchical learning or deep machine learning)\
    is a branch of machine learning based on a set of algorithms that attempt to model \
    high-level abstractions in data by using multiple processing layers with complex \
    structures, or otherwise composed of multiple non-linear transformations.[1][2][3][4][5][6]",
    "Deep learning is part of a broader family of machine learning methods based on learning representations of data.",
    "An observation (e.g., an image) can be represented in many ways such as a vector of intensity values per pixel,\
    or in a more abstract way as a set of edges, regions of particular shape, etc. Some representations make it \
    easier to learn tasks (e.g., face recognition or facial expression recognition[7]) \
    from examples. One of the promises of deep learning is replacing handcrafted features \
    with efficient algorithms for unsupervised or semi-supervised feature learning and hierarchical \
    feature extraction.[8]",
    "Deep learning is part of a broader family of machine learning methods based on learning \
    representations of data.",
    "An observation (e.g., an image) can be represented in many ways such as a vector of intensity\
    values per pixel, or in a more abstract way as a set of edges, regions of particular shape, etc.",
    "Some representations make it easier to learn tasks (e.g., face recognition or facial expression recognition[7]) from examples.",
    "One of the promises of deep learning is replacing handcrafted features with efficient algorithms for unsupervised or semi-supervised \
    feature learning and hierarchical feature extraction.[8]",
    "Research in this area attempts to make better representations and create models to learn these representations\
    from large-scale unlabeled data.",
    "Some of the representations are inspired by advances in neuroscience and are loosely based on interpretation of information processing\
    and communication patterns in a nervous system, such as neural coding which attempts to define a relationship between various stimuli \
    and associated neuronal responses in the brain.[9]",
    "Various deep learning architectures such as deep neural networks, convolutional deep neural networks, \
    deep belief networks and recurrent neural networks have been applied to fields like computer vision, automatic\
    speech recognition, natural language processing, audio recognition and bioinformatics where they have been shown to produce state-of-the-art\
    results on various tasks."]

# Parenthesised form prints the same value under Python 2 and also parses under Python 3.
print(len(documents))

10


In [14]:
# Let's remove common words like "a", "the", etc. in English.
# These are called stop words, and we can use NLTK for a list of them in English.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally
# (e.g. fetched once with nltk.download('stopwords')) -- confirm for your environment.
import nltk
# Import the corpus reader under an alias so that binding the word list to `stopwords`
# below does not overwrite the imported object.
from nltk.corpus import stopwords as nltk_stopwords
# `stopwords` is the name the tokenization cell filters against; it holds the English list.
stopwords = nltk_stopwords.words('english')
print(stopwords)

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [23]:
# We lowercase the text before tokenizing (sometimes we shouldn't do that naively, e.g. if we
# care about things like named entities, which start with an uppercase letter).
# NLTK has a number of tokenization options here: http://www.nltk.org/api/nltk.tokenize.html
# Especially note that NLTK also supports Twitter tokenization, which will be useful for us.
# Look at this line from the link above:
#  from nltk.tokenize import TweetTokenizer
#-----------------------------------------
from nltk.tokenize import RegexpTokenizer
# r'\w+' keeps runs of word characters only: punctuation is dropped, and "e.g." comes out as
# the two tokens 'e' and 'g' (both appear in the word counts computed later).
tokenizer = RegexpTokenizer(r'\w+')
# Build a set once so each membership test is a hash lookup instead of a scan of the list;
# the filtered result is the same.
stopword_set = set(stopwords)
# One token list per document: lowercased, tokenized, stop words removed.
texts = [[w for w in tokenizer.tokenize(document.lower()) if w not in stopword_set]
         for document in documents]
print(texts[:1])


[['deep', 'learning', 'deep', 'structured', 'learning', 'hierarchical', 'learning', 'deep', 'machine', 'learning', 'branch', 'machine', 'learning', 'based', 'set', 'algorithms', 'attempt', 'model', 'high', 'level', 'abstractions', 'data', 'using', 'multiple', 'processing', 'layers', 'complex', 'structures', 'otherwise', 'composed', 'multiple', 'non', 'linear', 'transformations', '1', '2', '3', '4', '5', '6']]


In [46]:
# Count how many times each token occurs across all tokenized documents.
from collections import defaultdict
from itertools import groupby  # NOTE(review): not used in the cells visible here
# defaultdict(int) starts every unseen word at 0, so the increment below needs no key check.
word_freq = defaultdict(int)
for text in texts:
    for w in text:
        word_freq[w] += 1
print(word_freq)

defaultdict(<type 'int'>, {'interpretation': 1, 'results': 1, 'brain': 1, 'attempts': 2, 'broader': 2, 'networks': 4, 'layers': 1, 'machine': 4, 'based': 4, 'nervous': 1, 'state': 1, 'better': 1, '4': 1, '8': 2, 'pixel': 2, 'non': 1, 'advances': 1, 'facial': 2, 'using': 1, 'like': 1, 'semi': 2, 'level': 1, 'fields': 1, 'loosely': 1, 'shape': 2, 'large': 1, 'vector': 2, 'neuronal': 1, 'automatic': 1, 'vision': 1, 'set': 3, 'art': 1, 'methods': 2, 'intensity': 2, 'computer': 1, 'examples': 2, 'recognition': 6, 'responses': 1, 'shown': 1, 'scale': 1, 'ways': 2, 'per': 2, 'research': 1, 'replacing': 2, '3': 1, 'various': 3, '7': 2, 'linear': 1, 'processing': 3, 'represented': 2, 'g': 4, 'many': 2, 'inspired': 1, 'abstractions': 1, 'etc': 2, 'produce': 1, 'supervised': 2, 'expression': 2, 'otherwise': 1, 'composed': 1, 'tasks': 3, 'features': 2, 'family': 2, 'communication': 1, 'image': 2, 'coding': 1, 'natural': 1, 'one': 2, 'learning': 16, 'neuroscience': 1, 'transformations': 1, 'area': 

In [47]:
# Side note: OrderedDict in Python
# Take a look at the documentation of the Python collections module:
# https://docs.python.org/2/library/collections.html
from collections import OrderedDict
# Dictionary sorted by key: the (word, count) pairs are sorted on the word (t[0]) and
# OrderedDict keeps them in that insertion order.
print(OrderedDict(sorted(word_freq.items(), key=lambda t: t[0])))

OrderedDict([('1', 1), ('2', 1), ('3', 1), ('4', 1), ('5', 1), ('6', 1), ('7', 2), ('8', 2), ('9', 1), ('abstract', 2), ('abstractions', 1), ('advances', 1), ('algorithms', 3), ('applied', 1), ('architectures', 1), ('area', 1), ('art', 1), ('associated', 1), ('attempt', 1), ('attempts', 2), ('audio', 1), ('automatic', 1), ('based', 4), ('belief', 1), ('better', 1), ('bioinformatics', 1), ('brain', 1), ('branch', 1), ('broader', 2), ('coding', 1), ('communication', 1), ('complex', 1), ('composed', 1), ('computer', 1), ('convolutional', 1), ('create', 1), ('data', 4), ('deep', 11), ('define', 1), ('e', 4), ('easier', 2), ('edges', 2), ('efficient', 2), ('etc', 2), ('examples', 2), ('expression', 2), ('extraction', 2), ('face', 2), ('facial', 2), ('family', 2), ('feature', 4), ('features', 2), ('fields', 1), ('g', 4), ('handcrafted', 2), ('hierarchical', 3), ('high', 1), ('image', 2), ('information', 1), ('inspired', 1), ('intensity', 2), ('interpretation', 1), ('language', 1), ('large', 

In [48]:
# Dictionary sorted by value (the count, t[1]) in reverse order, so the most frequent words come first.
print(OrderedDict(sorted(word_freq.items(), key=lambda t: t[1], reverse=True)))

OrderedDict([('learning', 16), ('deep', 11), ('representations', 7), ('recognition', 6), ('networks', 4), ('machine', 4), ('based', 4), ('g', 4), ('e', 4), ('neural', 4), ('feature', 4), ('data', 4), ('set', 3), ('various', 3), ('processing', 3), ('tasks', 3), ('hierarchical', 3), ('learn', 3), ('make', 3), ('algorithms', 3), ('attempts', 2), ('broader', 2), ('8', 2), ('pixel', 2), ('facial', 2), ('semi', 2), ('shape', 2), ('vector', 2), ('methods', 2), ('intensity', 2), ('examples', 2), ('ways', 2), ('per', 2), ('replacing', 2), ('7', 2), ('represented', 2), ('many', 2), ('etc', 2), ('supervised', 2), ('expression', 2), ('features', 2), ('family', 2), ('image', 2), ('one', 2), ('extraction', 2), ('way', 2), ('particular', 2), ('easier', 2), ('regions', 2), ('values', 2), ('promises', 2), ('abstract', 2), ('efficient', 2), ('handcrafted', 2), ('multiple', 2), ('edges', 2), ('observation', 2), ('unsupervised', 2), ('face', 2), ('part', 2), ('interpretation', 1), ('results', 1), ('brain'

In [49]:
# Dictionary sorted by the length of the key string, in reverse order (so you get longer keys first).
print(OrderedDict(sorted(word_freq.items(), key=lambda t: len(t[0]), reverse=True)))


OrderedDict([('transformations', 1), ('representations', 7), ('interpretation', 1), ('bioinformatics', 1), ('communication', 1), ('convolutional', 1), ('architectures', 1), ('abstractions', 1), ('neuroscience', 1), ('relationship', 1), ('hierarchical', 3), ('unsupervised', 2), ('recognition', 6), ('represented', 2), ('information', 1), ('handcrafted', 2), ('observation', 2), ('processing', 3), ('supervised', 2), ('expression', 2), ('structured', 1), ('extraction', 2), ('structures', 1), ('particular', 2), ('associated', 1), ('algorithms', 3), ('automatic', 1), ('intensity', 2), ('responses', 1), ('replacing', 2), ('otherwise', 1), ('efficient', 2), ('recurrent', 1), ('unlabeled', 1), ('attempts', 2), ('networks', 4), ('advances', 1), ('neuronal', 1), ('computer', 1), ('examples', 2), ('research', 1), ('inspired', 1), ('composed', 1), ('features', 2), ('learning', 16), ('language', 1), ('promises', 2), ('abstract', 2), ('multiple', 2), ('patterns', 1), ('results', 1), ('broader', 2), ('

In [59]:
from pprint import pprint
# Drop every word whose corpus-wide count is below 2, then collapse each document to the
# set of its remaining distinct words (per-document repeat counts are not kept).
texts = [{word for word in text if word_freq[word] > 1} for text in texts]
pprint(texts[:1])

[set(['algorithms',
      'based',
      'data',
      'deep',
      'hierarchical',
      'learning',
      'machine',
      'multiple',
      'processing',
      'set'])]


In [60]:
# Let's represent each document as a bag-of-words, where each word is assigned a unique integer id.
dictionary = corpora.Dictionary(texts)
print(dictionary)
# You can save this dictionary to disk for future reference, using gensim:
# dictionary.save('/tmp/word_freq.dict')

Dictionary(60 unique tokens: [u'promises', u'set', u'features', u'family', u'image']...)


In [62]:
# The dictionary exposes its word -> integer id mapping as `token2id`.
token_ids = dictionary.token2id
print(token_ids)

{u'promises': 15, u'set': 0, u'features': 17, u'family': 12, u'image': 18, u'deep': 3, u'one': 19, u'shape': 20, u'tasks': 16, u'examples': 22, u'broader': 13, u'networks': 59, u'recognition': 23, u'methods': 14, u'regions': 28, u'based': 1, u'etc': 49, u'efficient': 21, u'make': 25, u'feature': 26, u'per': 27, u'machine': 4, u'extraction': 29, u'vector': 51, u'various': 57, u'supervised': 52, u'7': 30, u'8': 32, u'abstract': 34, u'handcrafted': 35, u'attempts': 56, u'multiple': 8, u'way': 36, u'replacing': 37, u'processing': 2, u'g': 44, u'hierarchical': 5, u'facial': 39, u'particular': 40, u'represented': 41, u'representations': 10, u'data': 9, u'values': 46, u'e': 42, u'observation': 43, u'semi': 31, u'unsupervised': 45, u'many': 33, u'edges': 38, u'neural': 58, u'intensity': 47, u'face': 48, u'ways': 24, u'easier': 50, u'part': 11, u'algorithms': 6, u'learning': 7, u'learn': 53, u'expression': 54, u'pixel': 55}


In [69]:
# Let's add a new document and get a sparse vector for it using the dictionary's "doc2bow" method:
new_doc = "Deep learning? I like deep learning a lot."
# Same preprocessing as for the corpus: lowercase, then tokenize with the regexp tokenizer.
tokenized_and_split_doc = tokenizer.tokenize(new_doc.lower())
new_vec = dictionary.doc2bow(tokenized_and_split_doc)
# In the recorded run, only the words deep (id 3) and learning (id 7) occur in our
# dictionary, and each of them occurs twice in this new document.
print(new_vec)

[(3, 2), (7, 2)]


In [None]:
# What does the sparse vector [(3, 2), (7, 2)] mean?
# Well, all it means is that it has two words, "deep" and "learning", ids 3 and 7, respectively
# and that each of them occurs twice in this new_vec vector. This should be clear to you by now.

In [63]:
# Then you get a sparse vector representation for each document.
# Remember, each word is represented by its integer id, paired with its count in that document.
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(1, 1), (3, 1), (4, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(0, 1), (3, 1), (5, 1), (6, 1), (7, 1), (10, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)], [(1, 1), (3, 1), (4, 1), (7, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(0, 1), (18, 1), (20, 1), (24, 1), (27, 1), (28, 1), (33, 1), (34, 1), (36, 1), (38, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (46, 1), (47, 1), (49, 1), (51, 1), (55, 1)], [(10, 1), (16, 1), (22, 1), (23, 1), (25, 1), (30, 1), (39, 1), (42, 1), (44, 1), (48, 1), (50, 1), (53, 1), (54, 1)], [(3, 1), (5, 1), (6, 1)

In [None]:
# This is how you save the corpus to disk for later use, using gensim:
# corpora.MmCorpus.serialize('/tmp/dictionary.mm', corpus) 

In [None]:
# Hand-written example corpus, kept for reference only. Every line of the literal is
# commented out (previously only the first one was, which left the rest as stray indented
# code), so the model below is built from the `corpus` computed from our documents.
#corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#        [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
#        [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
#        [(0, 1.0), (4, 2.0), (7, 1.0)],
#        [(3, 1.0), (5, 1.0), (6, 1.0)],
#        [(9, 1.0)],
#        [(9, 1.0), (10, 1.0)],
#        [(9, 1.0), (10, 1.0), (11, 1.0)],
#        [(8, 1.0), (10, 1.0), (11, 1.0)]]
tfidf = models.TfidfModel(corpus)
print(tfidf)

# Apply the model to a bag-of-words vector holding token ids 0 and 4, each with count 1.
vec = [(0, 1), (4, 1)]
print(tfidf[vec])

In [None]:
# To be continued