In [1]:
import pandas as pd

In [2]:
# Read the CSIC 2010 and ECML/PKDD 2010 dataset
df = pd.read_csv('../data/Web-Application-Attack-Datasets/CSVData/csic_ecml_final.csv')

# extract payloads
payloads = df[['Class', 'GET-Query', 'POST-Data']].copy()
payloads["Payload"] = payloads['GET-Query'].fillna(payloads['POST-Data'])

# Keep only the Class and Payload columns
payloads = payloads[['Payload', 'Class']]
payloads = payloads.sort_values(by='Class')
payloads = payloads.dropna()

# Keep only the benign payloads
csic_ecml_payloads = payloads[payloads['Class'] == 'Valid'].copy()
csic_ecml_payloads.rename(columns={'Class': 'Label'}, inplace=True)
csic_ecml_payloads['Label'] = 'benign'
csic_ecml_payloads['Category'] = 'benign'

print(csic_ecml_payloads.sample(5))
print(csic_ecml_payloads["Category"].value_counts())
print(csic_ecml_payloads["Label"].value_counts())

                                                 Payload   Label Category
6                                                   id=2  benign   benign
68646  kftootsaih=rtm&radn=oox8mtgasSfuhvyesu&ea7eutw...  benign   benign
67564  sTku2nnaas=eYoelear4is%40se&scsAeN7=133&sat=op...  benign   benign
7324                                   B2=Vaciar+carrito  benign   benign
7575   modo=registro&login=saikaley&password=commrade...  benign   benign
Category
benign    24487
Name: count, dtype: int64
Label
benign    24487
Name: count, dtype: int64


In [3]:
# Read mixed payloads from grananqvist repository
df = pd.read_csv('../data/Machine-Learning-Web-Application-Firewall-and-Dataset/data/payloads.csv')

# Keep only the payload, is_malicious and injection_type columns
payloads = df[['payload', 'is_malicious', 'injection_type']].copy()
payloads = payloads.sort_values(by='is_malicious')
payloads = payloads.dropna()

# Rename columns and convert to common format
granqvist_payloads = payloads.copy()
granqvist_payloads.rename(columns={'payload': 'Payload', 'is_malicious': 'Label', 'injection_type': 'Category'}, inplace=True)
granqvist_payloads['Label'] = granqvist_payloads['Label'].apply(lambda x: 'benign' if x == 0 else 'malicious')
granqvist_payloads['Category'] = granqvist_payloads['Category'].str.lower()
granqvist_payloads['Category'] = granqvist_payloads['Category'].replace('legal', 'benign')

print(granqvist_payloads[granqvist_payloads['Label'] == 'benign'].sample(5))
print(granqvist_payloads[granqvist_payloads['Label'] == 'malicious'].sample(5))
print(granqvist_payloads["Category"].value_counts())
print(granqvist_payloads["Label"].value_counts())

                Payload   Label Category
29366  3317651896955813  benign   benign
79216         Deser%E9e  benign   benign
52553      MAldItAmENTe  benign   benign
82609       mc_8165_677  benign   benign
83988       IPI00204311  benign   benign
                                                  Payload      Label Category
26403                       <IMG SRC="livescript:[code]">  malicious      xss
99037                       '><body/onpageshow=alert(1);>  malicious      xss
109865  /%%32e%%32e%%32f%%32e%%32e%%32f%%32e%%32e%%32f...  malicious    shell
77848   ABC<div style="x:\\xE2\\x80\\x8Bexpression(jav...  malicious      xss
101144  <a href="data:text&sol;html;&Tab;base64&NewLin...  malicious      xss
Category
benign    100496
xss         7891
sql         1650
shell        319
Name: count, dtype: int64
Label
benign       100496
malicious      9860
Name: count, dtype: int64


In [4]:
# Read XSS payloads from xss-payload-list repository
with open('../data/xss-payload-list/Intruder/xss-payload-list.txt', 'r') as f:
    xss_payloads = f.readlines()

# Convert to DataFrame in common format
xss_payloads = [x.strip() for x in xss_payloads]
xss_payloads = pd.DataFrame(xss_payloads, columns=['Payload'])
xss_payloads['Label'] = 'malicious'
xss_payloads['Category'] = 'xss'

print(xss_payloads.sample(5))
print(xss_payloads["Category"].value_counts())
print(xss_payloads["Label"].value_counts())

                                                Payload      Label Category
426        <audio oncontextmenu="alert(1)">test</audio>  malicious      xss
1901  <figcaption id=x tabindex=1 onbeforedeactivate...  malicious      xss
337   <article draggable="true" ondragend="alert(1)"...  malicious      xss
5816  <time draggable="true" ondrag="alert(1)">test<...  malicious      xss
4515  <style>:target {color:red;}</style><nobr id=x ...  malicious      xss
Category
xss    6613
Name: count, dtype: int64
Label
malicious    6613
Name: count, dtype: int64


In [5]:
# Read payloads from syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning Kaggle dataset
df = pd.read_csv('../data/cross-site-scripting-xss-dataset-for-deep-learning/XSS_dataset.csv')

# Keep only the Sentence and Label columns
payloads = df[['Sentence', 'Label']].copy()
payloads = payloads.sort_values(by='Label')
payloads = payloads.dropna()

# Rename columns and convert to common format
xss_payloads_kaggle = payloads.copy()
xss_payloads_kaggle.rename(columns={'Sentence': 'Payload'}, inplace=True)
xss_payloads_kaggle['Category'] = 'xss'
xss_payloads_kaggle['Label'] = xss_payloads_kaggle['Label'].apply(lambda x: 'benign' if x == 0 else 'malicious')

print(xss_payloads_kaggle[xss_payloads_kaggle['Label'] == 'benign'].sample(5))
print(xss_payloads_kaggle[xss_payloads_kaggle['Label'] == 'malicious'].sample(5))
print(xss_payloads_kaggle["Category"].value_counts())
print(xss_payloads_kaggle["Label"].value_counts())

                                                 Payload   Label Category
11454                \t <div style="padding:0em 0.25em">  benign      xss
9009   <li id="cite_note-332"><span class="mw-cite-ba...  benign      xss
9328   \t </span><link rel="mw-deduplicated-inline-st...  benign      xss
5880                                                <dl>  benign      xss
11045  \t </span> <span class="reference-text"><cite ...  benign      xss
                                                 Payload      Label Category
4938   <style>@keyframes x{}</style><script style="an...  malicious      xss
11059  <element draggable="true" ondrag="alert(1)">te...  malicious      xss
13484  <strong id=x tabindex=1 ondeactivate=alert(1)>...  malicious      xss
1215   <code id=x tabindex=1 ondeactivate=alert(1)></...  malicious      xss
2160   <style>@keyframes x{}</style><bdi style="anima...  malicious      xss
Category
xss    13686
Name: count, dtype: int64
Label
malicious    7373
benign       6313
Name

In [6]:
# merge all payloads
all_payloads = pd.concat([csic_ecml_payloads, granqvist_payloads, xss_payloads, xss_payloads_kaggle])
all_payloads = all_payloads.drop_duplicates(subset='Payload')
all_payloads = all_payloads.reset_index(drop=True)
all_payloads = all_payloads.sort_values(by='Label')

# lower case all column names
all_payloads.columns = all_payloads.columns.str.lower()
all_payloads.to_csv('../data/dataset.csv', index=False)

print(all_payloads.sample(5))
print(all_payloads["category"].value_counts())
print(all_payloads["label"].value_counts())

            payload   label category
92050  developement  benign   benign
40198    f983Ga2783  benign   benign
41561      Somerset  benign   benign
87797       dwarfed  benign   benign
34170     somewhere  benign   benign
category
benign    113784
xss        18506
sql         1367
shell        319
Name: count, dtype: int64
label
benign       117377
malicious     16599
Name: count, dtype: int64
