In [1]:
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import os
import gensim
from gensim import corpora

In [2]:
stopWords = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

In [3]:
summaryDir = "./Datasets/Summaries"
summaryFiles = os.listdir(summaryDir)

In [4]:
import pandas as pd

In [5]:
documents = []
for sFile in summaryFiles:
    f = open(os.path.join(summaryDir,sFile))
    documents.append(f.read())

In [6]:
data = pd.DataFrame({'Documents':documents})   # Creating data frame for documents

In [7]:
def clean(doc):
    stopFree = " ".join([word for word in doc.lower().split() if word not in stopWords])
    puncFree = "".join([word for word in stopFree if word not in exclude])
    normalized = " ".join([lemma.lemmatize(word) for word in puncFree.split()])
    return normalized

In [8]:
data['Clean Documents'] = data['Documents'].map(clean)

``` Now the Dataframe will contain the documents and their cleaned (normalized versions) with it ```

``` Now creating Bag Of Words on the Dataset ```

In [9]:
# Creating Bag of Words
docsForDict = []
for doc in data['Clean Documents']:
    tokens = doc.split()
    docsForDict.append(tokens)

In [10]:
data['Token Lists'] = docsForDict

In [11]:
dictionary = gensim.corpora.Dictionary(data['Token Lists'])

In [12]:
bagOfWords = [dictionary.doc2bow(doc) for doc in data['Token Lists']]

In [14]:
bagOfWords10 = bagOfWords[10]
for value in bagOfWords10:
    print("Word:",value[0],":",dictionary[value[0]],"appears",value[1],"times.")

Word: 19 : also appears 2 times.
Word: 20 : analysis appears 2 times.
Word: 24 : analyzing appears 1 times.
Word: 26 : argument appears 1 times.
Word: 31 : assumed appears 1 times.
Word: 37 : case appears 1 times.
Word: 51 : consists appears 1 times.
Word: 55 : could appears 1 times.
Word: 67 : every appears 1 times.
Word: 74 : first appears 1 times.
Word: 82 : identifying appears 2 times.
Word: 85 : instance appears 1 times.
Word: 97 : must appears 1 times.
Word: 102 : one appears 1 times.
Word: 103 : order appears 1 times.
Word: 110 : relation appears 1 times.
Word: 115 : respectively appears 1 times.
Word: 117 : result appears 1 times.
Word: 120 : section appears 6 times.
Word: 127 : set appears 6 times.
Word: 131 : since appears 1 times.
Word: 133 : source appears 1 times.
Word: 135 : structure appears 18 times.
Word: 140 : therefore appears 1 times.
Word: 142 : three appears 1 times.
Word: 143 : tree appears 33 times.
Word: 144 : two appears 7 times.
Word: 146 : use appears 2 time

``` Creating the TF-IDF Model on the BagOfWords ```

In [15]:
from gensim import models
tfIdf = models.TfidfModel(bagOfWords)

In [16]:
corpusTfIdf = tfIdf[bagOfWords]

In [17]:
corpusTfIdf[1]

[(6, 0.0063309794583893296),
 (16, 0.011099543936877992),
 (19, 0.002421228796177869),
 (20, 0.01868742373432438),
 (24, 0.015375772155766326),
 (32, 0.01239226965830445),
 (33, 0.026943624664441133),
 (34, 0.016353161128169823),
 (49, 0.03270632225633965),
 (71, 0.027612461987154727),
 (74, 0.0026723718043296864),
 (78, 0.010105509906997061),
 (86, 0.014942186141414869),
 (88, 0.0162216784318173),
 (103, 0.0036171919957734494),
 (104, 0.0045291480354023765),
 (106, 0.002470602369836127),
 (110, 0.09680144478468378),
 (114, 0.022837810655909094),
 (120, 0.004389838269311124),
 (126, 0.006831346925506218),
 (131, 0.003990769958366832),
 (132, 0.03382159122909624),
 (135, 0.0986272557919883),
 (136, 0.006727791412284012),
 (139, 0.0417156560205006),
 (143, 0.019878110868313623),
 (144, 0.0014703738091943028),
 (147, 0.001597109459641685),
 (153, 0.005044640199672259),
 (154, 0.2194466274664879),
 (155, 0.020966215222502447),
 (156, 0.022271768730860037),
 (157, 0.021773533066921084),
 (1

``` Making LDA on Bag Of Words ```

In [18]:
Lda = gensim.models.ldamodel.LdaModel
ldaBow = Lda(bagOfWords,id2word=dictionary,passes=50)

  diff = np.log(self.expElogbeta)


In [27]:
ldaBow.print_topics(-1)

[(0,
  '0.057*"grammar" + 0.040*"rule" + 0.027*"magic" + 0.016*"pruning" + 0.012*"constituent" + 0.010*"result" + 0.009*"specialization" + 0.009*"edge" + 0.009*"figure" + 0.008*"coverage"'),
 (1,
  '0.046*"transition" + 0.041*"grammar" + 0.021*"model" + 0.020*"language" + 0.020*"formalism" + 0.020*"word" + 0.016*"category" + 0.014*"dog" + 0.014*"state" + 0.012*"corpus"'),
 (2,
  '0.053*"word" + 0.018*"speech" + 0.017*"class" + 0.014*"corpus" + 0.014*"lexicon" + 0.013*"used" + 0.012*"ambiguity" + 0.011*"code" + 0.010*"suffix" + 0.010*"training"'),
 (3,
  '0.029*"language" + 0.021*"word" + 0.020*"processing" + 0.019*"segment" + 0.018*"speech" + 0.016*"lexical" + 0.016*"recognition" + 0.016*"phoneme" + 0.015*"text" + 0.012*"spoken"'),
 (4,
  '0.028*"colour" + 0.025*"equation" + 0.025*"solution" + 0.024*"rule" + 0.024*"variable" + 0.022*"occurrence" + 0.018*"coloured" + 0.016*"tgl" + 0.014*"primary" + 0.014*"set"'),
 (5,
  '0.029*"lexical" + 0.028*"datr" + 0.021*"word" + 0.017*"theory" + 0

``` Making LDA on Tf-Idf Values ```

In [29]:
ldaTfIdf = Lda(corpusTfIdf,id2word=dictionary,passes=50)

In [30]:
ldaTfIdf.print_topics(-1)

[(0,
  '0.000*"unprompted" + 0.000*"webber" + 0.000*"subgoals" + 0.000*"subordination" + 0.000*"tannen" + 0.000*"tod" + 0.000*"transferred" + 0.000*"uh" + 0.000*"started" + 0.000*"contextdependence"'),
 (1,
  '0.007*"structural" + 0.006*"operation" + 0.006*"current" + 0.006*"disambiguation" + 0.005*"lhip" + 0.005*"pause" + 0.004*"island" + 0.004*"seems" + 0.003*"constraintbased" + 0.003*"focusing"'),
 (2,
  '0.000*"unprompted" + 0.000*"webber" + 0.000*"subgoals" + 0.000*"subordination" + 0.000*"tannen" + 0.000*"tod" + 0.000*"transferred" + 0.000*"uh" + 0.000*"started" + 0.000*"contextdependence"'),
 (3,
  '0.005*"auxiliary" + 0.005*"foot" + 0.005*"domination" + 0.004*"sfs" + 0.004*"hpsg" + 0.002*"lexicalized" + 0.002*"link" + 0.001*"schema" + 0.001*"raised" + 0.001*"anchored"'),
 (4,
  '0.007*"tone" + 0.005*"pragmatic" + 0.005*"inheritance" + 0.005*"defeasible" + 0.004*"transcription" + 0.004*"dimension" + 0.004*"inference" + 0.003*"typed" + 0.003*"presupposition" + 0.003*"multidimensi

``` Making Concept Maps ```

In [31]:
conceptMapBow = dict()
for tup in ldaBow.print_topics(-1):
    s = tup[1]
    s = s.split('+')
    l = []
    for ele in s:
        l.append(ele[:-1])
    l1 = []
    for ele in l:
        t = ele.split('"')
        l1.append(t[1])
    conceptMapBow[tup[0]] = l1

In [32]:
conceptMapTfIdf = dict()
for tup in ldaTfIdf.print_topics(-1):
    s = tup[1]
    s = s.split('+')
    l = []
    for ele in s:
        l.append(ele[:-1])
    l1 = []
    for ele in l:
        t = ele.split('"')
        l1.append(t[1])
    conceptMapTfIdf[tup[0]] = l1

In [33]:
print(conceptMapTfIdf)

{0: ['unprompted', 'webber', 'subgoals', 'subordination', 'tannen', 'tod', 'transferred', 'uh', 'started', 'contextdependence'], 1: ['structural', 'operation', 'current', 'disambiguation', 'lhip', 'pause', 'island', 'seems', 'constraintbased', 'focusing'], 2: ['unprompted', 'webber', 'subgoals', 'subordination', 'tannen', 'tod', 'transferred', 'uh', 'started', 'contextdependence'], 3: ['auxiliary', 'foot', 'domination', 'sfs', 'hpsg', 'lexicalized', 'link', 'schema', 'raised', 'anchored'], 4: ['tone', 'pragmatic', 'inheritance', 'defeasible', 'transcription', 'dimension', 'inference', 'typed', 'presupposition', 'multidimensional'], 5: ['bin', 'optimal', 'uniform', 'learner', 'modebased', 'falling', 'arrive', 'logarithmic', 'itai', 'severe'], 6: ['attachment', 'backedoff', 'underspecified', 'htype', 'sparse', 'prepositional', 'ibm', '3097', 'board', 'director'], 7: ['obligation', 'actor', 'conversation', 'request', 'game', 'do', 'govern', 'suggestion', 'executable', 'repairing'], 8: ['u

In [34]:
print(conceptMapBow)

{0: ['grammar', 'rule', 'magic', 'pruning', 'constituent', 'result', 'specialization', 'edge', 'figure', 'coverage'], 1: ['transition', 'grammar', 'model', 'language', 'formalism', 'word', 'category', 'dog', 'state', 'corpus'], 2: ['word', 'speech', 'class', 'corpus', 'lexicon', 'used', 'ambiguity', 'code', 'suffix', 'training'], 3: ['language', 'word', 'processing', 'segment', 'speech', 'lexical', 'recognition', 'phoneme', 'text', 'spoken'], 4: ['colour', 'equation', 'solution', 'rule', 'variable', 'occurrence', 'coloured', 'tgl', 'primary', 'set'], 5: ['lexical', 'datr', 'word', 'theory', 'function', 'path', 'category', 'value', 'information', 'node'], 6: ['clause', 'subordinate', 'role', 'main', 'sentence', 'agent', 'semantic', 'task', 'complex', 'constraint'], 7: ['text', 'class', 'system', 'one', 'different', 'technique', 'nlg', 'use', 'used', 'number'], 8: ['rule', 'spelling', 'parsing', 'lexical', 'surface', 'algorithm', 'root', 'tabular', 'form', 'pattern'], 9: ['feature', 'str