In [2]:
pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 252 kB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.2.1.tar.gz (122 kB)
[K     |████████████████████████████████| 122 kB 7.1 MB/s eta 0:00:01
Collecting boto3
  Downloading boto3-1.15.11.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 1.3 MB/s eta 0:00:011
Collecting botocore<1.19.0,>=1.18.11
  Downloading botocore-1.18.11-py2.py3-none-any.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 5.7 MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 1.4 MB/s eta 0:00:01
Building wheels for collected packages: smart-open, boto3
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Crea

In [1]:
#importing libraries
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

## Loading Data

In [2]:
def load_data(path,file_name):
    """Read a text file, treating every line as one document.

    Parameters
    ----------
    path : str
        Directory that contains the file ("" resolves relative to the
        current working directory).
    file_name : str
        Name of the file to read.

    Returns
    -------
    documents_list : list of str
        One entry per line of the file, with leading/trailing whitespace
        stripped. Blank lines are kept as empty strings.
    titles : list of str
        The first 100 characters of each document, in the same order as
        ``documents_list``.

    Side effects
    ------------
    Prints the number of documents that were read.
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "r") as fin:
        # Iterate over the handle directly instead of materialising the whole
        # file with readlines() first.
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # The title must be recorded per document, inside the loop;
            # slicing already copes with documents shorter than 100 chars.
            titles.append(text[:100])

    print("Total Number of Documents: ",len(documents_list))
    return documents_list, titles

# Preprocessing Data

In [3]:
def preprocess_data(doc_set):
    """Turn raw documents into lists of normalised tokens.

    Each document is lower-cased, split into tokens matching ``\\w+``,
    filtered against NLTK's English stop-word list, and stemmed with the
    Porter stemmer.

    Parameters
    ----------
    doc_set : iterable of str
        Raw documents.

    Returns
    -------
    list of list of str
        One list of stemmed, stop-word-free tokens per input document, in
        input order.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _normalise(document):
        # lower-case -> tokenize -> drop stop words -> stem
        words = word_tokenizer.tokenize(document.lower())
        kept = [word for word in words if word not in stop_words]
        return [stemmer.stem(word) for word in kept]

    return [_normalise(document) for document in doc_set]

## Prepare Corpus

In [4]:
def prepare_corpus(doc_clean):
    """Build the gensim dictionary and bag-of-words corpus for the documents.

    Parameters
    ----------
    doc_clean : list of list of str
        Tokenised documents. The collection is traversed twice (once to build
        the dictionary, once to convert each document).

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Dictionary built from ``doc_clean``.
    doc_term_matrix : list
        ``dictionary.doc2bow(doc)`` for every document, in input order.
    """
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))
    return dictionary, doc_term_matrix

## Create an LSA model using Gensim

In [5]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """Fit an LSA (gensim ``LsiModel``) model and print its topics.

    Parameters
    ----------
    doc_clean : list of list of str
        Tokenised documents; passed to ``prepare_corpus``.
    number_of_topics : int
        Number of topics requested from the model and from ``print_topics``.
    words : int
        Number of words to show per topic.

    Returns
    -------
    LsiModel
        The trained model. Its topics are printed as a side effect.
    """
    id2word, bow_corpus = prepare_corpus(doc_clean)
    lsamodel = LsiModel(
        bow_corpus,
        num_topics=number_of_topics,
        id2word=id2word,
    )
    topic_summary = lsamodel.print_topics(num_topics=number_of_topics, num_words=words)
    print(topic_summary)
    return lsamodel

# Determine the number of topics

In [9]:
def compute_coherence_values(dictionary,doc_term_matrix,doc_clean,stop,start=2,step=3):
    """Train one LSA model per candidate topic count and score its coherence.

    Parameters
    ----------
    dictionary : gensim.corpora.Dictionary
        Dictionary passed to both the model and the coherence scorer.
    doc_term_matrix : list
        Bag-of-words corpus used to train each model.
    doc_clean : list of list of str
        Tokenised documents used by the coherence scorer.
    stop : int
        Exclusive upper bound on the number of topics to try.
    start : int, optional
        First number of topics to try (default 2).
    step : int, optional
        Increment between candidate topic counts (default 3).

    Returns
    -------
    model_list : list of LsiModel
        One trained model per value in ``range(start, stop, step)``.
    coherence_values : list of float
        The ``c_v`` coherence of each model, in the same order.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, stop, step):
        # Train with the loop's candidate size. Using a fixed outside value
        # here would make every model identical and the comparison useless.
        model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

## Plotting Coherence Score Values

In [16]:
def plot_graph(doc_clean,start,stop,step):
    """Plot coherence score against the number of LSA topics.

    Parameters
    ----------
    doc_clean : list of list of str
        Tokenised documents.
    start, stop, step : int
        Candidate topic counts are ``range(start, stop, step)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # Only the scores are plotted; the trained models are not needed here.
    _, coherence_values = compute_coherence_values(dictionary,doc_term_matrix,doc_clean,stop,start,step)

    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence_Score")
    # Labels must be a sequence of strings. A bare string is iterated
    # character by character, which would label the line "c".
    plt.legend(["coherence_values"], loc="best")
    plt.show()

In [17]:
# Run settings: how many topics to request from the LSA model and how many
# words to print for each topic.
number_of_topics = 7
words = 10

# Load the documents (one per line of pep.txt, resolved relative to the
# working directory), normalise them, then fit the model and print its topics.
document_list, titles = load_data(path="", file_name="pep.txt")
clean_text = preprocess_data(doc_set=document_list)
model = create_gensim_lsa_model(
    doc_clean=clean_text,
    number_of_topics=number_of_topics,
    words=words,
)

Total Number of Documents:  2
[(0, '-0.313*"mother" + -0.250*"mathura" + -0.188*"meat" + -0.188*"told" + -0.188*"went" + -0.188*"eat" + -0.188*"go" + -0.188*"place" + -0.188*"hotel" + -0.188*"cri"'), (1, '0.408*"rashmi" + 0.408*"kant" + 0.408*"portug" + 0.408*"receiv" + 0.408*"kanji" + 0.408*"letter" + -0.000*"deiti" + 0.000*"forbidden" + 0.000*"file" + -0.000*"encroach"')]
