In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
import glob
import time 
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


In [14]:
# Load the benign queries and the four obfuscated query sets; each CSV's
# first row is used as the header.
benign, inline_obf, percent_obf, url_obf, nest_obf = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        'benign.csv',
        'inline_obfuscation.csv',
        'invalid_percent_encoding_obfuscation.csv',
        'url_encoding_obfuscation.csv',
        'nested_command_obfuscation.csv',
    )
]

In [15]:
# Build one frame per obfuscation technique: the benign rows followed by
# that technique's obfuscated rows, with the index renumbered from 0.
train_inline, train_percent, train_url, train_nest = [
    pd.concat([benign, obfuscated], ignore_index=True)
    for obfuscated in (inline_obf, percent_obf, url_obf, nest_obf)
]

In [20]:
# Discard every row that has a missing value in any column. The surviving
# rows keep their original index labels (the index is not renumbered).
train_inline, train_percent, train_url, train_nest = [
    frame.dropna()
    for frame in (train_inline, train_percent, train_url, train_nest)
]

In [21]:
# Spot-check the cleaned inline-obfuscation queries. Index labels are the
# pre-dropna ones, so the last label can be larger than the row count.
print(train_inline.loc[:, "Query"])

0                                  99745017c
1                                   ejerci78
2                                      47209
3        calle valencia de don juan 161, 7?d
4                                     b3r3al
                        ...                 
35113                                    %26
35114                                    %21
35115                        ' o/**/r '' = '
35116                         ' o/**/r 3 = 3
35117                        o/**/r 3 = 3 --
Name: Query, Length: 35106, dtype: object


In [22]:
# Turn each dataset's query strings into a bag-of-words count matrix.
# A single CountVectorizer is refit for every dataset, so each matrix has its
# own vocabulary and `vectorizer` ends up fitted on train_nest only.
# The vocabulary is learned from all rows, including those later held out
# for testing.
# NOTE(review): stop_words='english' may remove words that matter for SQL
# payloads (e.g. "or", "and") -- confirm this is intended.
# NOTE(review): the default token pattern keeps only tokens of two or more
# word characters, so payloads like "' o/**/r 3 = 3" may vectorize to an
# empty row -- confirm this is intended.
vectorizer = CountVectorizer(stop_words='english')
X_inline, X_percent, X_url, X_nest = [
    vectorizer.fit_transform(frame['Query'])
    for frame in (train_inline, train_percent, train_url, train_nest)
]


In [23]:
# Target vectors: the 'Label' column of each cleaned dataset.
# (The reports below show two classes, 0 and 1; presumably 0 = benign and
# 1 = malicious -- TODO confirm against the CSVs.)
y_inline, y_percent, y_url, y_nest = [
    frame['Label']
    for frame in (train_inline, train_percent, train_url, train_nest)
]

In [24]:
# Hold out 20% of every dataset for evaluation. The fixed random_state makes
# the shuffle reproducible from run to run.
from sklearn.model_selection import train_test_split

_split_kwargs = dict(test_size=0.2, random_state=0)

(X_inline_train, X_inline_test,
 y_inline_train, y_inline_test) = train_test_split(X_inline, y_inline, **_split_kwargs)

(X_percent_train, X_percent_test,
 y_percent_train, y_percent_test) = train_test_split(X_percent, y_percent, **_split_kwargs)

(X_url_train, X_url_test,
 y_url_train, y_url_test) = train_test_split(X_url, y_url, **_split_kwargs)

(X_nest_train, X_nest_test,
 y_nest_train, y_nest_test) = train_test_split(X_nest, y_nest, **_split_kwargs)

In [25]:
# 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [26]:
# Benign + inline-obfuscated queries: fit a multinomial naive Bayes model on
# the training split, then print the confusion matrix, accuracy and
# per-class report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_inline_train, y_inline_train)
y_inline_pred = classifier.predict(X_inline_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_inline_test, y_inline_pred))


[[4439   62]
 [ 127 2394]]
0.9730845912845343
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4501
           1       0.97      0.95      0.96      2521

    accuracy                           0.97      7022
   macro avg       0.97      0.97      0.97      7022
weighted avg       0.97      0.97      0.97      7022



In [27]:
# Benign + invalid-percent-encoding queries: fit a multinomial naive Bayes
# model on the training split, then print the confusion matrix, accuracy and
# per-class report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_percent_train, y_percent_train)
y_percent_pred = classifier.predict(X_percent_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_percent_test, y_percent_pred))

[[4428   73]
 [ 158 2363]]
0.9671033893477642
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      4501
           1       0.97      0.94      0.95      2521

    accuracy                           0.97      7022
   macro avg       0.97      0.96      0.96      7022
weighted avg       0.97      0.97      0.97      7022



In [28]:
# Benign + URL-encoded queries: fit a multinomial naive Bayes model on the
# training split, then print the confusion matrix, accuracy and per-class
# report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_url_train, y_url_train)
y_url_pred = classifier.predict(X_url_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_url_test, y_url_pred))

[[4509   18]
 [   8 2487]]
0.9962973511819995
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4527
           1       0.99      1.00      0.99      2495

    accuracy                           1.00      7022
   macro avg       1.00      1.00      1.00      7022
weighted avg       1.00      1.00      1.00      7022



In [29]:
# Benign + nested-command queries: fit a multinomial naive Bayes model on the
# training split, then print the confusion matrix, accuracy and per-class
# report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_nest_train, y_nest_train)
y_nest_pred = classifier.predict(X_nest_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_nest_test, y_nest_pred))

[[4445   56]
 [  47 2474]]
0.9853318142979208
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4501
           1       0.98      0.98      0.98      2521

    accuracy                           0.99      7022
   macro avg       0.98      0.98      0.98      7022
weighted avg       0.99      0.99      0.99      7022



In [30]:
# Baseline: the same pipeline applied to queries.csv (presumably the
# un-obfuscated query set -- TODO confirm). Rows with any missing value are
# dropped, the previously created `vectorizer` is refit on the remaining
# queries, and 20% of the rows are held out with a fixed seed.
unobf = pd.read_csv('queries.csv', header=0).dropna()
X_unobf = vectorizer.fit_transform(unobf['Query'])
y_unobf = unobf['Label']
(X_unobf_train, X_unobf_test,
 y_unobf_train, y_unobf_test) = train_test_split(
    X_unobf, y_unobf, test_size=0.2, random_state=0
)

In [31]:
# Baseline model: fit a multinomial naive Bayes model on the queries.csv
# training split, then print the confusion matrix, accuracy and per-class
# report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_unobf_train, y_unobf_train)
y_unobf_pred = classifier.predict(X_unobf_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_unobf_test, y_unobf_pred))

[[4478   76]
 [ 209 2259]]
0.9594132725719169
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4554
           1       0.97      0.92      0.94      2468

    accuracy                           0.96      7022
   macro avg       0.96      0.95      0.95      7022
weighted avg       0.96      0.96      0.96      7022



In [32]:
## OBFUSCATING THE ENTIRE DATASET
# Load one CSV per obfuscation technique; each file's first row is used as
# the header. (Presumably every query in these files has been obfuscated,
# not only the malicious ones -- TODO confirm.)
all_inline, all_percent, all_url, all_nest = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (
        'all_inline.csv',
        'all_invalid_percent_encoding.csv',
        'all_url_encoding.csv',
        'all_nested_command.csv',
    )
]

In [34]:
# Discard every row that has a missing value in any column. The surviving
# rows keep their original index labels (the index is not renumbered).
all_inline, all_percent, all_url, all_nest = [
    frame.dropna()
    for frame in (all_inline, all_percent, all_url, all_nest)
]

In [35]:
# Bag-of-words count matrices for the four datasets. The existing
# `vectorizer` is refit for each one, so every matrix has its own vocabulary
# and `vectorizer` is left fitted on all_nest. The vocabulary is learned from
# all rows, including those later held out for testing.
X_all_inline, X_all_percent, X_all_url, X_all_nest = [
    vectorizer.fit_transform(frame['Query'])
    for frame in (all_inline, all_percent, all_url, all_nest)
]

In [36]:
# Target vectors: the 'Label' column of each cleaned dataset.
y_all_inline, y_all_percent, y_all_url, y_all_nest = [
    frame['Label']
    for frame in (all_inline, all_percent, all_url, all_nest)
]

In [37]:
# Hold out 20% of every dataset for evaluation. The fixed random_state makes
# the shuffle reproducible from run to run.
_split_kwargs = dict(test_size=0.2, random_state=0)

(X_all_inline_train, X_all_inline_test,
 y_all_inline_train, y_all_inline_test) = train_test_split(X_all_inline, y_all_inline, **_split_kwargs)

(X_all_percent_train, X_all_percent_test,
 y_all_percent_train, y_all_percent_test) = train_test_split(X_all_percent, y_all_percent, **_split_kwargs)

(X_all_url_train, X_all_url_test,
 y_all_url_train, y_all_url_test) = train_test_split(X_all_url, y_all_url, **_split_kwargs)

(X_all_nest_train, X_all_nest_test,
 y_all_nest_train, y_all_nest_test) = train_test_split(X_all_nest, y_all_nest, **_split_kwargs)

In [38]:
# all_inline.csv: fit a multinomial naive Bayes model on the training split,
# then print the confusion matrix, accuracy and per-class report for the
# held-out split (in that order).
classifier = MultinomialNB().fit(X_all_inline_train, y_all_inline_train)
y_all_inline_pred = classifier.predict(X_all_inline_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_all_inline_test, y_all_inline_pred))

[[4474   80]
 [ 216 2252]]
0.9578467673027627
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      4554
           1       0.97      0.91      0.94      2468

    accuracy                           0.96      7022
   macro avg       0.96      0.95      0.95      7022
weighted avg       0.96      0.96      0.96      7022



In [39]:
# all_invalid_percent_encoding.csv: fit a multinomial naive Bayes model on
# the training split, then print the confusion matrix, accuracy and
# per-class report for the held-out split (in that order).
classifier = MultinomialNB().fit(X_all_percent_train, y_all_percent_train)
y_all_percent_pred = classifier.predict(X_all_percent_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_all_percent_test, y_all_percent_pred))

[[4471   83]
 [ 238 2230]]
0.9542865280546853
              precision    recall  f1-score   support

           0       0.95      0.98      0.97      4554
           1       0.96      0.90      0.93      2468

    accuracy                           0.95      7022
   macro avg       0.96      0.94      0.95      7022
weighted avg       0.95      0.95      0.95      7022



In [40]:
# all_url_encoding.csv: fit a multinomial naive Bayes model on the training
# split, then print the confusion matrix, accuracy and per-class report for
# the held-out split (in that order).
classifier = MultinomialNB().fit(X_all_url_train, y_all_url_train)
y_all_url_pred = classifier.predict(X_all_url_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_all_url_test, y_all_url_pred))

[[3999  551]
 [ 134 2340]]
0.9024772209567198
              precision    recall  f1-score   support

           0       0.97      0.88      0.92      4550
           1       0.81      0.95      0.87      2474

    accuracy                           0.90      7024
   macro avg       0.89      0.91      0.90      7024
weighted avg       0.91      0.90      0.90      7024



In [41]:
# all_nested_command.csv: fit a multinomial naive Bayes model on the training
# split, then print the confusion matrix, accuracy and per-class report for
# the held-out split (in that order).
classifier = MultinomialNB().fit(X_all_nest_train, y_all_nest_train)
y_all_nest_pred = classifier.predict(X_all_nest_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_all_nest_test, y_all_nest_pred))

[[4480   74]
 [ 151 2317]]
0.9679578467673028
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      4554
           1       0.97      0.94      0.95      2468

    accuracy                           0.97      7022
   macro avg       0.97      0.96      0.96      7022
weighted avg       0.97      0.97      0.97      7022

