In [25]:
import pandas as pd

In [26]:
npr = pd.read_csv('../writeups_overview.csv')

In [27]:
# Check the first 5 rows of the csv
npr.head()
npr.dropna()
nRow, nCol = npr.shape
print(f'There are {nRow} rows and {nCol} columns in the training set after removing empty rows')


There are 44 rows and 4 columns in the training set after removing empty rows


In [28]:
# Let's see the first text
npr['Text'][0]

'Visiting the website, we are presented with a login form (and a Robert Frost poem).\nThe hint says "XPATH", and using some common XPATH injection techniques we can leak some information about the underlying DB.\n'

In [29]:
# Number of rows of the dataset
len(npr)

44

In [30]:
# Preprocessing

from sklearn.feature_extraction.text import CountVectorizer

# max_df = ignore high frequency terms (0-1)
# min_df = ignore low frequency terms (min # of documents containing it)
cv = CountVectorizer(max_df=0.9, min_df=4, stop_words='english')

dtm = cv.fit_transform(npr['Text'])

dtm

<44x18 sparse matrix of type '<class 'numpy.int64'>'
	with 126 stored elements in Compressed Sparse Row format>

In [31]:
# LDA

from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of topics we are looking for
LDA = LatentDirichletAllocation(n_components=1, random_state=42)

LDA.fit(dtm)

LatentDirichletAllocation(n_components=1, random_state=42)

In [32]:
# Grab the vocabulary of words

import random

#len(cv.get_feature_names()) #18

random_word_id = random.randint(0,18)

cv.get_feature_names()[random_word_id]



'visiting'

In [33]:
# Grab the topics

#len(LDA.components_) #3

#type(LDA.components_) #numpy.ndarray - Contains the words and their probability

# LDA.components_.shape # (3,589)

single_topic = LDA.components_[0]

single_topic.argsort() # returns an array containing the index position of each word instead of the word itself

# ARGSORT ----> INDEX POSITION SORTED FROM LEAST TO GREATEST
# TOP 10 VALUES (10 GREATEST VALUES)
# LAST 10 VALUES of argsort()
single_topic.argsort()[-10:] # grab the last 10 values of .argsort()

top_ten_words = single_topic.argsort()[-10:]

for index in top_ten_words:
    print(cv.get_feature_names()[index])

injection
link
form
following
flag
login
code
page
website
challenge


In [39]:
# Grab the highest probability words per topic

for i,topic in enumerate(LDA.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-20:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['appears', 'user', 'just', 'visiting', 'visit', 'different', 'says', 'admin', 'injection', 'link', 'form', 'following', 'flag', 'login', 'code', 'page', 'website', 'challenge']




In [35]:

# Put in column the top
topic_results = LDA.transform(dtm)

npr['Topic'] = topic_results.argmax(axis=1)

topic_dictionary = {0:'Topic 0',1:'Topic 1',2:'Topic 2'}

npr['Topic Label'] = npr['Topic'].map(topic_dictionary)

In [36]:
type(npr)

npr.to_csv(r'LDA_export_dataframe.csv', index=False, header=True)