In [2]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import os

import gensim
from gensim import corpora
from gensim.models import LsiModel

In [3]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

summary_dir = './Datasets/Summaries'
summaries = os.listdir(summary_dir)

In [4]:
def clean(doc):
    f = open(os.path.join(summary_dir, doc),"r")
    contents = f.read()

    stop_free = " ".join([i for i in contents.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    f.close()
    return normalized

In [5]:
doc_clean = [clean(doc).split() for doc in summaries] 

In [6]:
dictionary = corpora.Dictionary(doc_clean) 

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [7]:
lsamodel = LsiModel(doc_term_matrix, id2word = dictionary)  # train model
print(lsamodel.print_topics(-1))

[(0, '0.243*"word" + 0.215*"evaluation" + 0.214*"rule" + 0.205*"language" + 0.197*"sentence" + 0.181*"grammar" + 0.178*"interpretation" + 0.165*"task" + 0.159*"model" + 0.156*"set"'), (1, '0.426*"evaluation" + 0.323*"assessment" + 0.308*"task" + 0.256*"user" + 0.232*"technology" + 0.218*"project" + 0.189*"data" + -0.185*"interpretation" + 0.170*"system" + -0.164*"sentence"'), (2, '-0.425*"word" + 0.404*"interpretation" + -0.261*"rule" + 0.225*"sentence" + 0.223*"clause" + 0.221*"reading" + 0.169*"ellipsis" + 0.165*"source" + -0.160*"model" + 0.159*"ambiguity"'), (3, '-0.549*"rule" + 0.479*"word" + -0.335*"grammar" + 0.259*"model" + -0.215*"feature" + -0.138*"structure" + -0.131*"tree" + 0.100*"corpus" + 0.083*"speech" + 0.081*"probability"'), (4, '0.390*"discourse" + -0.320*"clause" + -0.288*"rule" + -0.260*"reading" + -0.253*"source" + -0.232*"ellipsis" + -0.194*"word" + 0.156*"model" + 0.130*"theory" + 0.124*"ambiguity"'), (5, '0.514*"rule" + -0.355*"feature" + -0.303*"term" + -0.264

In [8]:
conceptMap = dict()
for tup in lsamodel.print_topics(-1):
    s = tup[1]
    s = s.split('+')
    l = []
    for ele in s:
        l.append(ele[:-1])
    l1 = []
    for ele in l:
        t = ele.split('"')
        l1.append(t[1])
    conceptMap[tup[0]] = l1

In [9]:
conceptMap

{0: ['word',
  'evaluation',
  'rule',
  'language',
  'sentence',
  'grammar',
  'interpretation',
  'task',
  'model',
  'set'],
 1: ['evaluation',
  'assessment',
  'task',
  'user',
  'technology',
  'project',
  'data',
  'interpretation',
  'system',
  'sentence'],
 2: ['word',
  'interpretation',
  'rule',
  'sentence',
  'clause',
  'reading',
  'ellipsis',
  'source',
  'model',
  'ambiguity'],
 3: ['rule',
  'word',
  'grammar',
  'model',
  'feature',
  'structure',
  'tree',
  'corpus',
  'speech',
  'probability'],
 4: ['discourse',
  'clause',
  'rule',
  'reading',
  'source',
  'ellipsis',
  'word',
  'model',
  'theory',
  'ambiguity'],
 5: ['rule',
  'feature',
  'term',
  'tree',
  'grammar',
  'structure',
  'discourse',
  'interpretation',
  'word',
  'node'],
 6: ['noun',
  'phrase',
  'term',
  'pronoun',
  'language',
  'possessive',
  'ambiguity',
  'interpretation',
  'grammar',
  'theory'],
 7: ['term',
  'tree',
  'dictionary',
  'model',
  'feature',
  'wor