In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
    
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



In [3]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['expect', 'from', 'subject', 're', 'use', 'iss', 'issue', 'bug', 'error', 'test', 'remove', 'event', 'result', 'enable', 'support'])

# NLTK Stop words - Noun
from nltk.corpus import stopwords
stop_words_noun = stopwords.words('english')
stop_words_noun.extend(['iss', 'value', 'case', 'from', 'subject', 're', 'show', 'crash', 'issue', 'error', 'support', 'bug', 'area','default'])

# NLTK Stop words - Verb
from nltk.corpus import stopwords
stop_words_verb = stopwords.words('english')
stop_words_verb.extend(['from', 'subject', 're', 'edu', 'use','iss','issue', 'iss-metro', 'ldp', 'iss_customiz'])

In [4]:
# Import Dataset
df = pd.read_csv('CSR_title_text.csv', error_bad_lines=False);
df.head()

Unnamed: 0,CSR_no,SR_title
0,143979,When will earfcns greater than 65535 be suppor...
1,144595,SR periodicity
2,144653,LTE Stack Issue with CLI
3,146023,L2 crash (Segmentation fault) when originating...
4,146100,L2 crash after pumping UL data from a UE.


In [5]:
# Convert to list
data = df.SR_title.values.tolist()
df.dtypes

CSR_no       int64
SR_title    object
dtype: object

In [6]:
# Remove special characters
data = [re.sub(r'[@#\$%&\*\(\)\<\>\?\'\";:\]\[-]', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

pprint(data[:1])

['When will earfcns greater than 65535 be supported ']


In [7]:
#Tokenize words and Clean-up text
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['when', 'will', 'earfcns', 'greater', 'than', 'be', 'supported']]


In [8]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def remove_stopwords_noun(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words_noun] for doc in texts]

def remove_stopwords_verb(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words_verb] for doc in texts]

In [9]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [39]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
len(data_words)
len(data_words)
print(data_words_nostops[13:14])

data_words_nostops_noun = remove_stopwords_noun(data_words)
len(data_words)
len(data_words_nostops_noun)
print(data_words_nostops[13:14])

data_words_nostops_verb = remove_stopwords_verb(data_words)
len(data_words)
len(data_words_nostops_verb)
print(data_words_nostops[1:2])

[['rohc', 'working', 'crash', 'happened', 'rohc']]
[['rohc', 'working', 'crash', 'happened', 'rohc']]
[['sr', 'periodicity']]


In [11]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_lemmatized_noun = lemmatization(data_words_nostops_noun, allowed_postags=['NOUN'])
data_lemmatized_noun = [x for x in data_lemmatized_noun if x != []]

data_lemmatized_verb = lemmatization(data_words_nostops_verb, allowed_postags=['VERB'])
data_lemmatized_verb = [x for x in data_lemmatized_verb if x != []]

In [12]:
# Remove Stop Words from lemmatized words
data_lemmatized = remove_stopwords(data_lemmatized)
len(data_words)
len(data_lemmatized)
print(data_lemmatized[13:14])

data_lemmatized_noun = remove_stopwords_noun(data_lemmatized_noun)
len(data_words)
len(data_lemmatized_noun)
print(data_lemmatized_noun[13:14])

data_lemmatized_verb = remove_stopwords_verb(data_lemmatized_verb)
len(data_words)
len(data_lemmatized_verb)
print(data_lemmatized_verb[13:14])

[['rohc', 'work', 'crash', 'happen', 'rohc']]
[['build', 'package']]
[['feature', 'require']]


In [13]:
#Dictionary and Corpus needed for Topic Modeling
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

id2word_noun = corpora.Dictionary(data_lemmatized_noun)

id2word_verb = corpora.Dictionary(data_lemmatized_verb)

In [14]:
# Create Corpus
texts = data_lemmatized

texts_noun = data_lemmatized_noun

texts_verb = data_lemmatized_verb

In [15]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

corpus_noun = [id2word_noun.doc2bow(text) for text in texts_noun]

corpus_verb = [id2word_verb.doc2bow(text) for text in texts_verb]

In [16]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[13:14]]

[[(id2word[id], freq) for id, freq in cp] for cp in corpus_noun[13:14]]

[[(id2word[id], freq) for id, freq in cp] for cp in corpus_verb[13:14]]


[[('enodeb', 1), ('exchange', 1)]]

In [17]:
#Building the Topic Model
# Build LDA model for whole text
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
#Building the Topic Model
# Build LDA model for whole noun
lda_model_noun = gensim.models.ldamodel.LdaModel(corpus=corpus_noun,
                                           id2word=id2word_noun,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [19]:
#Building the Topic Model
# Build LDA model for whole verb
lda_model_verb = gensim.models.ldamodel.LdaModel(corpus=corpus_verb,
                                           id2word=id2word_verb,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)


In [31]:
#View the topics in LDA model
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics(num_words=30))
doc_lda = lda_model[corpus]

[(0,
  '0.101*"eoam" + 0.069*"dev" + 0.038*"remote" + 0.038*"behavior" + '
  '0.029*"symbolperiod" + 0.017*"work" + 0.014*"node" + 0.013*"tunnel" + '
  '0.013*"bfd" + 0.012*"mac" + 0.011*"reload" + 0.011*"go" + 0.010*"update" + '
  '0.010*"path" + 0.009*"shut" + 0.009*"fault" + 0.008*"type" + '
  '0.008*"failure" + 0.008*"primary" + 0.008*"switch" + 0.008*"segmentation" + '
  '0.007*"remain" + 0.007*"end" + 0.007*"ring" + 0.007*"code" + '
  '0.006*"present" + 0.006*"even" + 0.006*"user" + 0.006*"protocol" + '
  '0.006*"status"'),
 (1,
  '0.073*"redmine" + 0.046*"route" + 0.032*"ospf" + 0.027*"interface" + '
  '0.023*"ospfv" + 0.022*"configure" + 0.020*"tlvs" + 0.019*"ipv" + '
  '0.018*"isis" + 0.016*"configuration" + 0.016*"fail" + 0.015*"fsu" + '
  '0.013*"router" + 0.013*"entry" + 0.012*"vpn" + 0.012*"crash" + '
  '0.011*"default" + 0.010*"ldp" + 0.010*"mpls" + 0.009*"area" + 0.008*"set" + '
  '0.008*"traffic" + 0.008*"static" + 0.008*"learn" + 0.007*"peer" + '
  '0.007*"new" + 0.007

In [30]:
#View the topics in LDA noun model 
# Print the Keyword in the 10 topics
pprint(lda_model_noun.print_topics(num_words=30))
doc_lda_noun = lda_model_noun[corpus_noun]

[(0,
  '0.057*"interface" + 0.055*"ospf" + 0.048*"command" + 0.041*"ip" + '
  '0.038*"drop" + 0.026*"router" + 0.019*"packet" + 0.018*"blocker" + '
  '0.016*"time" + 0.015*"traffic" + 0.014*"question" + 0.013*"output" + '
  '0.013*"peer" + 0.011*"tlv" + 0.011*"module" + 0.011*"shutdown" + '
  '0.010*"igs" + 0.010*"information" + 0.009*"link" + 0.009*"lsa" + '
  '0.009*"user" + 0.009*"fault" + 0.009*"ig" + 0.009*"behavior" + '
  '0.009*"config" + 0.008*"segmentation" + 0.008*"ecmp" + 0.008*"sync" + '
  '0.007*"code" + 0.007*"memory"'),
 (1,
  '0.119*"redmine" + 0.077*"route" + 0.052*"port" + 0.030*"vlan" + '
  '0.029*"isis" + 0.024*"session" + 0.023*"entry" + 0.022*"ipv" + 0.021*"fsu" '
  '+ 0.020*"vpn" + 0.019*"address" + 0.018*"scenario" + 0.015*"ospfv" + '
  '0.014*"igmp" + 0.013*"table" + 0.013*"qos" + 0.012*"number" + 0.011*"acl" + '
  '0.011*"svt" + 0.010*"mrouter" + 0.010*"csr" + 0.010*"mac" + 0.009*"request" '
  '+ 0.008*"portchannel" + 0.008*"routemap" + 0.007*"bgp" + 0.006*"co

In [29]:
#View the topics in LDA verb model 
# Print the Keyword in the 10 topics
pprint(lda_model_verb.print_topics(num_words=30))
doc_lda_verb = lda_model_verb[corpus_verb]

[(0,
  '0.121*"fail" + 0.103*"send" + 0.081*"display" + 0.051*"go" + 0.038*"instal" '
  '+ 0.034*"remain" + 0.033*"create" + 0.032*"would" + 0.026*"give" + '
  '0.015*"log" + 0.015*"stp" + 0.014*"snmpv" + 0.014*"follow" + 0.011*"stop" + '
  '0.010*"rtm" + 0.009*"return" + 0.008*"schedule" + 0.008*"recover" + '
  '0.007*"leave" + 0.007*"help" + 0.007*"call" + 0.006*"seem" + '
  '0.006*"include" + 0.006*"max" + 0.005*"contain" + 0.005*"present" + '
  '0.005*"link" + 0.005*"respond" + 0.005*"flush" + 0.005*"flap"'),
 (1,
  '0.089*"enable" + 0.085*"change" + 0.057*"work" + 0.048*"observe" + '
  '0.032*"reload" + 0.025*"receive" + 0.024*"miss" + 0.022*"rout" + '
  '0.021*"occur" + 0.021*"perform" + 0.020*"relate" + 0.019*"forward" + '
  '0.017*"take" + 0.017*"rip" + 0.016*"disable" + 0.016*"regard" + 0.013*"ig" '
  '+ 0.013*"transmit" + 0.012*"elect" + 0.012*"establish" + 0.012*"expire" + '
  '0.011*"stay" + 0.011*"happen" + 0.009*"accept" + 0.009*"sync" + '
  '0.008*"stick" + 0.008*"join" 

In [23]:
#Compute Model Perplexity and Coherence Score
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Perplexity
print('\nPerplexity: ', lda_model_verb.log_perplexity(corpus_verb))  # a measure of how good the model is. lower the better.

# Compute Perplexity
print('\nPerplexity: ', lda_model_noun.log_perplexity(corpus_noun))  # a measure of how good the model is. lower the better. 


Perplexity:  -7.706076333241437

Perplexity:  -6.354237855077625

Perplexity:  -7.3751259710863595


In [24]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

coherence_model_lda_noun = CoherenceModel(model=lda_model_noun, texts=data_lemmatized_noun, dictionary=id2word_noun, coherence='c_v')
coherence_lda_noun = coherence_model_lda_noun.get_coherence()
print('\nCoherence Score noun: ', coherence_lda_noun)

coherence_model_lda_verb = CoherenceModel(model=lda_model_verb, texts=data_lemmatized_verb, dictionary=id2word_verb, coherence='c_v')
coherence_lda_verb = coherence_model_lda_verb.get_coherence()
print('\nCoherence Score verb: ', coherence_lda_verb)


Coherence Score:  0.3876856847583213

Coherence Score noun:  0.46586660259100254

Coherence Score verb:  0.6207764547543433


In [25]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis_noun = pyLDAvis.gensim.prepare(lda_model_noun, corpus_noun, id2word_noun)
vis_noun

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis_verb = pyLDAvis.gensim.prepare(lda_model_verb, corpus_verb, id2word_verb)
vis_verb

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [32]:
lda_model_noun.print_topics(num_words=30)

[(0,
  '0.057*"interface" + 0.055*"ospf" + 0.048*"command" + 0.041*"ip" + 0.038*"drop" + 0.026*"router" + 0.019*"packet" + 0.018*"blocker" + 0.016*"time" + 0.015*"traffic" + 0.014*"question" + 0.013*"output" + 0.013*"peer" + 0.011*"tlv" + 0.011*"module" + 0.011*"shutdown" + 0.010*"igs" + 0.010*"information" + 0.009*"link" + 0.009*"lsa" + 0.009*"user" + 0.009*"fault" + 0.009*"ig" + 0.009*"behavior" + 0.009*"config" + 0.008*"segmentation" + 0.008*"ecmp" + 0.008*"sync" + 0.007*"code" + 0.007*"memory"'),
 (1,
  '0.119*"redmine" + 0.077*"route" + 0.052*"port" + 0.030*"vlan" + 0.029*"isis" + 0.024*"session" + 0.023*"entry" + 0.022*"ipv" + 0.021*"fsu" + 0.020*"vpn" + 0.019*"address" + 0.018*"scenario" + 0.015*"ospfv" + 0.014*"igmp" + 0.013*"table" + 0.013*"qos" + 0.012*"number" + 0.011*"acl" + 0.011*"svt" + 0.010*"mrouter" + 0.010*"csr" + 0.010*"mac" + 0.009*"request" + 0.008*"portchannel" + 0.008*"routemap" + 0.007*"bgp" + 0.006*"configure" + 0.006*"redistribute" + 0.006*"deadlock" + 0.006*"

In [33]:
X = lda_model_noun.print_topics(num_words=30)

In [34]:
X

[(0,
  '0.057*"interface" + 0.055*"ospf" + 0.048*"command" + 0.041*"ip" + 0.038*"drop" + 0.026*"router" + 0.019*"packet" + 0.018*"blocker" + 0.016*"time" + 0.015*"traffic" + 0.014*"question" + 0.013*"output" + 0.013*"peer" + 0.011*"tlv" + 0.011*"module" + 0.011*"shutdown" + 0.010*"igs" + 0.010*"information" + 0.009*"link" + 0.009*"lsa" + 0.009*"user" + 0.009*"fault" + 0.009*"ig" + 0.009*"behavior" + 0.009*"config" + 0.008*"segmentation" + 0.008*"ecmp" + 0.008*"sync" + 0.007*"code" + 0.007*"memory"'),
 (1,
  '0.119*"redmine" + 0.077*"route" + 0.052*"port" + 0.030*"vlan" + 0.029*"isis" + 0.024*"session" + 0.023*"entry" + 0.022*"ipv" + 0.021*"fsu" + 0.020*"vpn" + 0.019*"address" + 0.018*"scenario" + 0.015*"ospfv" + 0.014*"igmp" + 0.013*"table" + 0.013*"qos" + 0.012*"number" + 0.011*"acl" + 0.011*"svt" + 0.010*"mrouter" + 0.010*"csr" + 0.010*"mac" + 0.009*"request" + 0.008*"portchannel" + 0.008*"routemap" + 0.007*"bgp" + 0.006*"configure" + 0.006*"redistribute" + 0.006*"deadlock" + 0.006*"

In [38]:
X = re.sub(r'\w', ' ', X)

TypeError: expected string or bytes-like object