# Topic modeling with Gensim

Gensim is an open-source library for topic modeling in Python. For more information, see [https://radimrehurek.com/gensim/](https://radimrehurek.com/gensim/).

## Download the data

Download the corpus of NIPS articles. The original data comes from [https://cs.nyu.edu/~roweis/data.html](https://cs.nyu.edu/~roweis/data.html).

In [None]:
# IPython shell escape: run the gdown command-line tool on a Google Drive link.
# NOTE(review): presumably this saves the "nips.txt" file that the next cell
# opens -- confirm the downloaded file name.
!gdown https://drive.google.com/uc?id=1ZOI9KoC14VT2zLfd4gJvR5S4fBFpVE79

In [None]:
# Load the corpus: every line of nips.txt is treated as one document.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's locale default.
with open("nips.txt", "rt", encoding="utf-8") as f:
  # Iterate the file object directly instead of materializing readlines().
  corpus = [line.strip() for line in f]

# Preview the first 100 characters of the first document.
print(corpus[0][:100])

1  CONNECTIVITY VERSUS ENTROPY  Yaser S. Abu-Mostafa  California Institute of Technology  Pasadena, 


## Data preparation

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the "punkt" tokenizer data before tokenizing.
nltk.download('punkt')

# Tokenize each document and filter its tokens in a single pass:
#   * purely numeric tokens are dropped (words that merely contain digits stay);
#   * one-character tokens are dropped.
tokenized_corpus = [
  [
    token
    for token in word_tokenize(doc)
    if len(token) > 1 and not token.isnumeric()
  ]
  for doc in corpus
]

# Show the first five tokenized documents as a sanity check.
print(tokenized_corpus[:5])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[['CONNECTIVITY', 'VERSUS', 'ENTROPY', 'Yaser', 'S.', 'Abu-Mostafa', 'California', 'Institute', 'of', 'Technology', 'Pasadena', 'CA', 'ABSTRACT', 'How', 'does', 'the', 'connectivity', 'of', 'neural', 'network', 'number', 'of', 'synapses', 'per', 'neuron', 'relate', 'to', 'the', 'complexity', 'of', 'the', 'problems', 'it', 'can', 'handle', 'measured', 'by', 'the', 'entropy', 'Switching', 'theory', 'would', 'suggest', 'no', 'relation', 'at', 'all', 'since', 'all', 'Boolean', 'functions', 'can', 'be', 'implemented', 'using', 'circuit', 'with', 'very', 'low', 'connectivity', 'e.g.', 'using', 'two-input', 'NAND', 'gates', 'However', 'for', 'network', 'that', 'learns', 'problem', 'from', 'examples', 'using', 'local', 'learning', 'rule', 'we', 'prove', 'that', 'the', 'entropy', 'of', 'the', 'problem', 'becomes', 'lower', 'bound', 'for', 'the', 'connectivity', 'of', 'the', 'network', 'INTR

Remove rare words and common words based on their document frequency.

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Document-frequency thresholds used to prune the vocabulary.
min_doc_count = 20      # drop tokens that occur in fewer than 20 documents
max_doc_fraction = 0.5  # drop tokens that occur in more than 50% of the documents

# Build a dictionary representation of the tokenized documents, then remove
# the rare and the overly common tokens from it.
dictionary = Dictionary(tokenized_corpus)
dictionary.filter_extremes(no_below=min_doc_count, no_above=max_doc_fraction)

Transform the documents to a vectorized form.

In [None]:
# Convert every tokenized document to its bag-of-words form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, tokenized_corpus))

In [None]:
# Report the vocabulary size and the number of vectorized documents.
vocab_size = len(dictionary)
doc_count = len(bow_corpus)
print('Number of unique tokens: %d' % vocab_size)
print('Number of documents: %d' % doc_count)

Number of unique tokens: 8985
Number of documents: 1740


In [None]:
# Build an index-to-word dictionary. The dictionary is indexed once first,
# which is only done to "load" it before its id2token mapping is read.
temp = dictionary[0]
id2word = dictionary.id2token

# Training parameters shared by the models below.
num_topics = 10  # number of topics to extract
passes = 20      # value handed to the LDA model's `passes` argument

## Latent Semantic Analysis

In [None]:
from gensim.models import LsiModel

# Fit the LSA (LSI) model on the bag-of-words corpus. The topic count comes
# from the shared `num_topics` setting instead of a hard-coded 10, so the
# model stays in sync with the training parameters and with the loop that
# prints the topics.
lsa_model = LsiModel(corpus=bow_corpus, id2word=id2word, num_topics=num_topics)

In [None]:
from pprint import pprint

# Pretty-print the (term, weight) pairs of the first LSA topics: at most
# five, or fewer when the model has fewer topics. A blank line separates them.
shown_topics = min(5, num_topics)
for topic_id in range(shown_topics):
  pprint(lsa_model.show_topic(topic_id))
  print()

[('units', 0.19235418233662324),
 ('state', 0.16240358933934307),
 ('hidden', 0.13617919187763552),
 ('unit', 0.11555365232103111),
 ('layer', 0.10561770434628621),
 ('distribution', 0.101650647055897),
 ('neurons', 0.10122317240883169),
 ('noise', 0.10048758983314339),
 ('weight', 0.09824955556684571),
 ('image', 0.0901365647825311)]

[('cells', -0.28270743702750983),
 ('neurons', -0.2690591358149691),
 ('cell', -0.22781366987802967),
 ('neuron', -0.19780541363512882),
 ('visual', -0.17176315974595535),
 ('state', 0.171418476531871),
 ('response', -0.15308579989035637),
 ('firing', -0.15261680601488223),
 ('activity', -0.13863401109291904),
 ('synaptic', -0.13784285973663535)]

[('units', -0.46266971732915585),
 ('state', 0.4099468801715468),
 ('hidden', -0.2752394132134866),
 ('unit', -0.24996287506232467),
 ('layer', -0.15324839268501494),
 ('policy', 0.15022987344111338),
 ('states', 0.1416856332571333),
 ('control', 0.1379535539291444),
 ('image', -0.13227574221038713),
 ('optimal

## Latent Dirichlet Allocation

In [None]:
from gensim.models import LdaMulticore

# Fit the LDA model on the bag-of-words corpus using the shared training
# parameters (`num_topics` and `passes`).
lda_model = LdaMulticore(
  corpus=bow_corpus,
  id2word=id2word,
  num_topics=num_topics,
  passes=passes,
)

In [None]:
from pprint import pprint

# Pretty-print the (term, weight) pairs of the first LDA topics: at most
# five, or fewer when the model has fewer topics. A blank line separates them.
shown_topics = min(5, num_topics)
for topic_id in range(shown_topics):
  pprint(lda_model.show_topic(topic_id))
  print()

[('neurons', 0.009475051),
 ('cells', 0.008994424),
 ('cell', 0.0069663217),
 ('neuron', 0.0063389298),
 ('visual', 0.0060701636),
 ('activity', 0.005888351),
 ('response', 0.0056764707),
 ('synaptic', 0.0049531115),
 ('firing', 0.004792633),
 ('stimulus', 0.004767299)]

[('image', 0.012632106),
 ('images', 0.0077324416),
 ('recognition', 0.0066129714),
 ('feature', 0.0059718424),
 ('features', 0.0059635327),
 ('units', 0.005580828),
 ('speech', 0.005057873),
 ('layer', 0.0047320854),
 ('hidden', 0.004005269),
 ('trained', 0.0039953375)]

[('matrix', 0.009063283),
 ('gradient', 0.005177249),
 ('dynamics', 0.0048708245),
 ('equation', 0.00475012),
 ('rule', 0.004002417),
 ('equations', 0.0039212047),
 ('convergence', 0.0037287374),
 ('solution', 0.0037086932),
 ('weight', 0.003633182),
 ('noise', 0.003567578)]

[('units', 0.02166299),
 ('unit', 0.013329621),
 ('hidden', 0.011166208),
 ('memory', 0.0078068413),
 ('patterns', 0.007020283),
 ('layer', 0.0069447043),
 ('pattern', 0.00637696