In [53]:
import pandas as pd

In [54]:
# Load the write-ups CSV (located one directory above the notebook) into a DataFrame
writeups_csv = '../writeups.csv'
npr = pd.read_csv(writeups_csv)

In [55]:
# Preview the first five rows to confirm the columns were parsed as expected
npr.head(n=5)

Unnamed: 0,Writeup_name,Category,Text,Overview,Analysis,Attack_execution
0,XMarkTheSpot,web,"Visiting the website, we are presented with a ...",1,0,0
1,XMarkTheSpot,web,The idea is similar to SQL injection. \nWe hav...,0,1,0
2,XMarkTheSpot,web,"For example, let's inject:\nThis should tell u...",0,0,1
3,Members,web,"In the challenge members, we need to get more ...",1,0,0
4,Members,web,"If we leave the search query empty, we would s...",0,1,0


In [56]:
# Show the full, untruncated 'Text' value of the row labelled 0
npr.loc[0, 'Text']

'Visiting the website, we are presented with a login form (and a Robert Frost poem).\nThe hint says "XPATH", and using some common XPATH injection techniques we can leak some information about the underlying DB.\n'

In [57]:
# How many rows (text segments) the dataset contains
npr.shape[0]

147

In [58]:
# Preprocessing: turn the raw texts into a document-term count matrix

from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary pruning:
#   max_df=0.9           -> drop terms that occur in more than 90% of the documents
#   min_df=4             -> drop terms that occur in fewer than 4 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
cv = CountVectorizer(
    max_df=0.9,
    min_df=4,
    stop_words='english',
)

# Learn the vocabulary and count term occurrences in one pass
# (rows = documents, columns = vocabulary terms)
dtm = cv.fit_transform(npr['Text'])

# Display the sparse-matrix summary (shape and number of stored counts)
dtm

<147x319 sparse matrix of type '<class 'numpy.int64'>'
	with 2686 stored elements in Compressed Sparse Row format>

In [59]:
# Topic model: Latent Dirichlet Allocation

from sklearn.decomposition import LatentDirichletAllocation

# n_components -> how many topics to extract
# random_state -> fixed seed so repeated runs give the same topics
# fit() returns the estimator itself, so construction and fitting can be chained
LDA = LatentDirichletAllocation(n_components=3, random_state=42).fit(dtm)

# Display the fitted estimator
LDA

LatentDirichletAllocation(n_components=3, random_state=42)

In [60]:
# Look up a random word from the vectorizer's vocabulary

import random

# Fetch the vocabulary once; position i holds the term for column i of dtm.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions call cv.get_feature_names_out() instead.
feature_names = cv.get_feature_names()

# random.randint() includes BOTH endpoints, so the upper bound must be the last
# valid index. Deriving it from the vocabulary keeps the lookup in range
# (a hard-coded bound larger than the vocabulary raises IndexError).
random_word_id = random.randint(0, len(feature_names) - 1)

feature_names[random_word_id]



'command'

In [61]:
# Inspect the ten highest-weighted words of a single topic

# LDA.components_ is a numpy.ndarray with one row per topic and one column per
# vocabulary term, i.e. len(LDA.components_) == n_components (3 here) and
# LDA.components_.shape[1] == number of columns of dtm.
# The entries are unnormalised word weights (pseudo-counts), not probabilities;
# dividing a row by its sum gives that topic's word distribution.
single_topic = LDA.components_[0]

# argsort() returns the index positions that would sort the weights from
# LEAST to GREATEST, so the LAST ten positions are the ten largest weights.
top_ten_words = single_topic.argsort()[-10:]

# Fetch the vocabulary once instead of rebuilding it on every iteration.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions call cv.get_feature_names_out() instead.
feature_names = cv.get_feature_names()

# Printed in ascending order of weight (the strongest word comes last)
for index in top_ten_words:
    print(feature_names[index])

error
string
challenge
cookie
function
bypass
code
php
flag
file


In [62]:
# Print the highest-weighted words of every topic

# Single source of truth for how many words are shown, so the printed header
# can never disagree with the number of words actually listed.
n_top_words = 10

# Fetch the vocabulary once instead of rebuilding it for every word.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions call cv.get_feature_names_out() instead.
feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f"THE TOP {n_top_words} WORDS FOR TOPIC #{i}")
    # argsort() is ascending, so the last n_top_words indices are the largest weights
    print([feature_names[index] for index in topic.argsort()[-n_top_words:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['error', 'string', 'challenge', 'cookie', 'function', 'bypass', 'code', 'php', 'flag', 'file']


THE TOP 15 WORDS FOR TOPIC #1
['login', 'javascript', 'code', 'using', 'script', 'password', 'use', 'page', 'website', 'flag']


THE TOP 15 WORDS FOR TOPIC #2
['challenge', 'user', 'url', 'body', 'payload', 'request', 'array', 'value', 'code', 'admin']




In [63]:

# Attach each document's dominant topic to the DataFrame

# Human-readable label for every topic id; edit the values to rename topics
topic_dictionary = {
    0: 'Topic 0',
    1: 'Topic 1',
    2: 'Topic 2',
}

# Per-document topic weights: one row per document, one column per topic
topic_results = LDA.transform(dtm)

# The column index of the largest weight in each row is that document's topic id
npr['Topic'] = topic_results.argmax(axis=1)

# Translate the numeric id into its label
npr['Topic Label'] = npr['Topic'].map(topic_dictionary)

In [64]:
# Write the DataFrame, including the new 'Topic' and 'Topic Label' columns, to a
# CSV in the current working directory. index=False leaves out the row index;
# header=True writes the column names as the first line.
npr.to_csv(r'LDA_export_dataframe.csv', index=False, header=True)