In [9]:
import pandas as pd

In [10]:
npr = pd.read_csv('./writeups_analysis.csv')

In [11]:
# Check the first 5 rows of the csv
npr.head()

Unnamed: 0,Writeup_name,Category,Text,Label
0,XMarkTheSpot,web,The idea is similar to SQL injection. \nWe hav...,analysis
1,Members,web,"If we leave the search query empty, we would s...",analysis
2,05_Cursed_Secret_Party,web,"Looking into the burpsuite proxy, it is postin...",analysis
3,High_Security,web,"Instead of the path /members, this time we hav...",analysis
4,27_Forbidden_Paths,web,"With file paths, a preceeding ./ means the cur...",analysis


In [12]:
# Let's see the first text
npr['Text'][0]

'The idea is similar to SQL injection. \nWe have a vulnerable query for authenticating the username and password which should be similar to.\nSo, if we inject some valid XPATH vocabulary into the query, we can manipulate it.\n'

In [13]:
# Number of rows of the dataset
len(npr)

51

In [14]:
# Preprocessing

from sklearn.feature_extraction.text import CountVectorizer

# max_df = ignore high frequency terms (0-1)
# min_df = ignore low frequency terms (min # of documents containing it)
cv = CountVectorizer(max_df=0.9, min_df=4, stop_words='english')

dtm = cv.fit_transform(npr['Text'])

dtm

<51x179 sparse matrix of type '<class 'numpy.int64'>'
	with 1174 stored elements in Compressed Sparse Row format>

In [15]:
# LDA

from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of topics we are looking for
LDA = LatentDirichletAllocation(n_components=1, random_state=42)

LDA.fit(dtm)

LatentDirichletAllocation(n_components=1, random_state=42)

In [16]:
# Grab the vocabulary of words

import random

#len(cv.get_feature_names()) #589

random_word_id = random.randint(0,179)

cv.get_feature_names()[random_word_id]



'tells'

In [17]:
# Grab the topics

#len(LDA.components_) #3

#type(LDA.components_) #numpy.ndarray - Contains the words and their probability

# LDA.components_.shape # (3,589)

single_topic = LDA.components_[0]

single_topic.argsort() # returns an array containing the index position of each word instead of the word itself

# ARGSORT ----> INDEX POSITION SORTED FROM LEAST TO GREATEST
# TOP 10 VALUES (10 GREATEST VALUES)
# LAST 10 VALUES of argsort()
single_topic.argsort()[-10:] # grab the last 10 values of .argsort()

top_ten_words = single_topic.argsort()[-10:]

for index in top_ten_words:
    print(cv.get_feature_names()[index])

let
page
use
source
flag
file
function
value
admin
code


In [23]:
# Grab the highest probability words per topic

for i,topic in enumerate(LDA.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-10000:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['able', 'inspecting', 'getting', 'io', 'looked', 'making', 'entry', 'elements', 'output', 'override', 'reading', 'clear', 'const', 'help', 'tells', 'argument', 'unfortunately', 'users', 'logic', 'long', 'valid', 'exploit', 'controlled', 'leak', 'element', 'note', 'vulnerable', 'access', 'decode', 'statement', 'creates', 'load', 'provided', 'burpsuite', 'idea', 'calls', 'case', 'second', 'submit', 'assume', 'checks', 'pretty', 'condition', 'console', 'called', 'instead', 'actually', 'll', 'problem', 'similar', 'magic', 'different', 'random', 'new', 'think', 'click', 'client', 'response', 'picoctf', 'site', 'provide', 'solve', 'result', 'shows', 'write', 'files', 'given', 'completely', 'rendered', 'requests', 'happens', 'parameter', 'stored', 'app', 'read', 'variable', 'contains', 'web', 'want', 'work', 'gave', 'pass', 'thing', 'process', 'field', 'passed', 'interesting', 'simply', 'browser', 'run', 'works', 'perform', 'right', 'html', 'text', 'directly', '

In [19]:

# Put in column the top
topic_results = LDA.transform(dtm)

npr['Topic'] = topic_results.argmax(axis=1)

topic_dictionary = {0:'Topic 0',1:'Topic 1',2:'Topic 2'}

npr['Topic Label'] = npr['Topic'].map(topic_dictionary)

In [20]:
type(npr)

npr.to_csv(r'LDA_export_dataframe.csv', index=False, header=True)