An example from https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html

In [9]:
import io
import os.path
import re
import tarfile
import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if member.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', member.name):
                    member_bytes = tar.extractfile(member).read()
                    yield member_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())


In [11]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]


ModuleNotFoundError: No module named 'nltk'

In [12]:
import nltk
nltk.download('wordnet')

ModuleNotFoundError: No module named 'nltk'

In [4]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [5]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)


In [6]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [7]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]


In [8]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8644
Number of documents: 1740


In [9]:
%%time
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

CPU times: user 1min 58s, sys: 24 ms, total: 1min 58s
Wall time: 1min 58s


In [10]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)


Average topic coherence: -1.1401.
[([(0.024176192, 'neuron'),
   (0.009569491, 'circuit'),
   (0.0076048793, 'cell'),
   (0.0075144167, 'spike'),
   (0.0067892037, 'synaptic'),
   (0.0065952875, 'chip'),
   (0.0063055987, 'analog'),
   (0.0062299604, 'signal'),
   (0.005651782, 'voltage'),
   (0.0054583726, 'firing'),
   (0.004494699, 'response'),
   (0.0044115093, 'channel'),
   (0.004370286, 'potential'),
   (0.004278465, 'frequency'),
   (0.004138902, 'fig'),
   (0.004025923, 'connection'),
   (0.0038297768, 'threshold'),
   (0.0038272964, 'synapsis'),
   (0.0035814962, 'noise'),
   (0.0033613855, 'memory')],
  -0.9426353504997164),
 ([(0.009407897, 'matrix'),
   (0.008047599, 'gaussian'),
   (0.0059282053, 'density'),
   (0.0051058144, 'likelihood'),
   (0.005042755, 'mixture'),
   (0.00498137, 'prior'),
   (0.0048193233, 'solution'),
   (0.004728297, 'approximation'),
   (0.004545836, 'bayesian'),
   (0.004179032, 'component'),
   (0.004094893, 'em'),
   (0.0038646834, 'posterior'

In [25]:
d=Dictionary(documents=[['fdsalj dlsk fj s ab','ab aldsjfld dsjflsdjlkdsf ddsf']])
list(d.items())

[(0, 'ab aldsjfld dsjflsdjlkdsf ddsf'), (1, 'fdsalj dlsk fj s ab')]

In [18]:
?Dictionary

[0;31mInit signature:[0m [0mDictionary[0m[0;34m([0m[0mdocuments[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mprune_at[0m[0;34m=[0m[0;36m2000000[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Dictionary encapsulates the mapping between normalized words and their integer ids.

Notable instance attributes:

Attributes
----------
token2id : dict of (str, int)
    token -> tokenId.
id2token : dict of (int, str)
    Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed).
cfs : dict of (int, int)
    Collection frequencies: token_id -> how many instances of this token are contained in the documents.
dfs : dict of (int, int)
    Document frequencies: token_id -> how many documents contain this token.
num_docs : int
    Number of documents processed.
num_pos : int
    Total number of corpus positions (number of processed words).
num_nnz : int
    Total number of non-zeroes in the BOW matrix (sum of the number of un

In [30]:
c=chr('C')

TypeError: an integer is required (got type str)

In [28]:
c-'0'

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [33]:
ord('E')

69

In [15]:
from functools import reduce
[i*(i+1) for i in range(18)]

324