# **Importing Libraries**

In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 40.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.8 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloadin

In [None]:

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the punkt tokenizer
# models, the WordNet corpus and the stop-word lists.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words, kept as a set for fast membership checks.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **Loading target dataset**

In [None]:
# Read the exported access-log records (the target/test data) into memory.
import json

with open('accesslog.json') as log_file:
    data = json.loads(log_file.read())

In [None]:
# Collect the tag list stored under '_source' -> 'tags' for every record.
tags = [record['_source']['tags'] for record in data]

In [None]:
# Wrap the collected tag lists in a one-column DataFrame (one row per record).
df1 = pd.DataFrame()
df1['tags'] = tags

In [None]:
# Bare last expression: renders the tag frame as the cell output.
df1

Unnamed: 0,tags
0,[Possible SQL Injection]
1,[]
2,[Possible SQL Injection]
3,"[Possible SQL Injection, Possible Cross Site S..."
4,[]
...,...
82,[4XX Client Errors]
83,[]
84,"[Possible Cross Site Scripting, 4XX Client Err..."
85,"[Possible Cross Site Scripting, 4XX Client Err..."


# **Loading Training dataset**

In [None]:
# Load the alert messages and discard the index column that was saved
# into the CSV as 'Unnamed: 0'.
df = pd.read_csv('Alert_file2.csv').drop(columns='Unnamed: 0')

In [None]:
# Lower-case every message with the vectorized string accessor; unlike the
# per-row lambda it leaves missing values as NaN instead of raising.
df['message'] = df['message'].str.lower()

In [None]:
# Attach the JSON tags to the CSV rows; Series assignment aligns on the index.
# NOTE(review): assumes row i of Alert_file2.csv corresponds to record i of
# accesslog.json — confirm against how both files were exported.
df['tags'] = df1['tags']

In [None]:
# Show the combined frame (message, source_ip, tags) for a visual sanity check.
display(df)

Unnamed: 0,message,source_ip,tags
0,102281393 16aug2021114727 0530 get union http1...,51.222.253.11,[Possible SQL Injection]
1,102281393 16aug2021114727 0530 get faviconico ...,10.228.0.95,[]
2,102281393 16aug2021114732 0530 get 20select 20...,10.208.0.229,[Possible SQL Injection]
3,102281393 16aug2021112839 0530 get unionselect...,10.208.0.229,"[Possible SQL Injection, Possible Cross Site S..."
4,102281393 16aug2021112843 0530 get waitfor 20d...,10.228.0.95,[]
...,...,...,...
82,102281394 17aug2021162151 0530 get faviconico ...,114.55.9.245,[4XX Client Errors]
83,102281394 17aug2021162154 0530 get http11 200 ...,10.228.0.95,[]
84,102281394 17aug2021162205 0530 get alert http1...,10.228.0.95,"[Possible Cross Site Scripting, 4XX Client Err..."
85,102281394 17aug2021162217 0530 get 3cscript 3e...,114.55.9.245,"[Possible Cross Site Scripting, 4XX Client Err..."


In [None]:
# Raw message column that the preprocessing step consumes.
documents = df['message']

# **Data preprocessing**

In [None]:
def preprocessing(input_documents):
    """Normalize text documents into space-separated alphanumeric tokens.

    Each document is lower-cased, tokenized with ``nltk.word_tokenize``,
    lemmatized with the WordNet lemmatizer, stripped of every character that
    is not an ASCII letter or digit, and filtered against the module-level
    ``stop_words`` set.

    Args:
        input_documents: Iterable of strings (a list or a pandas Series).

    Returns:
        list: One cleaned string per input document, in input order. Tokens
        are joined by single spaces with no leading or trailing whitespace.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    docs = []
    for sentence in input_documents:
        kept_words = []
        for word in nltk.word_tokenize(sentence.lower()):
            lemmatized_word = wordnet_lemmatizer.lemmatize(word)

            # Keep only letters and digits; this also removes '-' and all
            # other punctuation, so no separate punctuation pass is needed.
            lemmatized_word = re.sub(r"[^0-9A-Za-z]+", "", lemmatized_word)

            # Tokens made up purely of punctuation are empty at this point.
            # The previous check (`... and punctuations`) tested a non-empty
            # set, which is always truthy, so those empty tokens were kept and
            # produced stray spaces in the output.
            if lemmatized_word and lemmatized_word not in stop_words:
                kept_words.append(lemmatized_word)

        docs.append(" ".join(kept_words))

    return docs

In [None]:
 # Clean every message; `documents` is rebound from the raw column to the
 # list returned by preprocessing(), then its length is reported.
 documents = preprocessing(documents)
 print(f'{len(documents)} documents')

87 documents


In [None]:
# Remove the raw message column; the cleaned text is added back afterwards.
df = df.drop(columns='message')

In [None]:
# Store the cleaned documents as the new 'message' column (it is appended
# after the existing columns).
df['message'] = documents

In [None]:
# Printing unique source-IPs

import numpy as np
# As the cell's last expression, the array of distinct source IPs is displayed.
np.unique(df['source_ip'])

array(['10.208.0.229', '10.228.0.93', '10.228.0.95', '10.228.10.59',
       '10.228.10.75', '10.228.11.121', '11.208.0.92', '114.55.9.245',
       '195.154.123.130', '51.222.253.11', '54.36.148.129'], dtype=object)

In [None]:
# Removal of datapoints with missing important features
# Drops every row that has a missing (NaN) value in any column. Rows whose tag
# list is merely empty are not affected here; they are handled separately.

df = df.dropna()

In [None]:
def remove_empty_list(tag):
  """Reduce a tag list to a single label.

  Returns the placeholder string 'NaN' when ``tag`` equals the empty list,
  otherwise the first element of ``tag`` (any further tags are discarded).
  """
  return 'NaN' if tag == [] else tag[0]

# Collapse each tag list to its first label ('NaN' marks an empty list).
df['tags'] = df['tags'].apply(remove_empty_list)

In [None]:
# Keep only the rows that carry a real tag (drop the 'NaN' placeholder rows).
df = df.loc[df['tags'].ne('NaN')]

In [None]:
# Remove exact duplicate rows. The explicit .copy() detaches the result from
# the filtered frames it was derived from, so the column assignments that
# follow no longer raise pandas' SettingWithCopyWarning.
df = df.drop_duplicates().copy()

In [None]:
# Splitting dataset into train and test.
# The previous version failed on a fresh kernel: train_test_split was never
# imported and the frame has no 'Label' column (labels live in 'tags').
# NOTE(review): the split results are not used by any later cell shown here.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['tags'], test_size=0.2, random_state=42
)

In [None]:
# Query phrases considered indicative of SQL injection attempts.

query_string = [
    'waitfor',
    'waitfordelay',
    'union select',
    'union all select',
    'order by',
    'you have an error',
    'unknown column',
    'unionselect',
    'orderby',
    'order by 1',
    '@@version',
    '@@user',
    '@@database',
]


# **Model Training**

In [None]:
import sentence_transformers
from sentence_transformers import SentenceTransformer
# Load the pretrained 'paraphrase-MiniLM-L6-v2' sentence-embedding model.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Embed every cleaned message (lower-cased first) with the sentence model,
# one row at a time.

df['embedded_message'] = df['message'].str.lower().apply(model.encode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Embed the (lower-cased) tag label of every row with the same model.
df['embedded_tags'] = df['tags'].str.lower().apply(model.encode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Run the query phrases through the same cleaning as the messages. All
# non-alphanumeric characters are stripped, so e.g. '@@version' becomes 'version'.
# NOTE(review): re-running this cell preprocesses the already-cleaned list again.
query_string = preprocessing(query_string)

In [None]:
# Encode all cleaned query phrases in one call.
query_string_embeddings = model.encode(query_string)

# **Applying Cosine Similarity**

In [None]:
def similarity_check(sent1,query):
  """Best cosine similarity between one embedding and a set of query embeddings.

  Args:
    sent1: A single embedding vector.
    query: A collection of embedding vectors to compare against.

  Returns:
    The largest cosine similarity between ``sent1`` and any row of
    ``query``, rounded to two decimal places.
  """
  # cosine_similarity yields one row of scores for the single input vector.
  pairwise_scores = cosine_similarity([sent1], query)
  return round(pairwise_scores.max(), 2)

In [None]:
# Score every message by its closest match among the query phrase embeddings.
df['Similarity_score'] = df['embedded_message'].apply(
    lambda message_embedding: similarity_check(message_embedding, query_string_embeddings)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Pull the scores out as a NumPy array for the summary statistics.
Similarity_list = df['Similarity_score'].values

In [None]:
# Bare last expression: displays the array of similarity scores.
Similarity_list

array([0.26, 0.25, 0.47, 0.24, 0.32, 0.24, 0.24, 0.29, 0.28, 0.33, 0.29,
       0.33, 0.25, 0.26, 0.26, 0.26, 0.26, 0.26, 0.26, 0.26, 0.26, 0.26,
       0.26, 0.25, 0.26, 0.17, 0.18, 0.28, 0.28, 0.32, 0.31, 0.31, 0.17,
       0.3 , 0.27, 0.28, 0.31, 0.25, 0.24, 0.24, 0.25, 0.29, 0.25, 0.18,
       0.17, 0.31, 0.33, 0.26, 0.3 , 0.29, 0.29, 0.39, 0.32, 0.19, 0.27,
       0.19, 0.28, 0.17, 0.27, 0.18, 0.22, 0.28, 0.27, 0.22, 0.22, 0.29,
       0.26, 0.26, 0.3 , 0.33, 0.2 , 0.43, 0.33, 0.33, 0.18, 0.24, 0.36,
       0.26], dtype=float32)

In [None]:
import statistics
import numpy as np

# Decision threshold = mean score plus a fraction (alpha) of the standard
# deviation of all scores.
# NOTE(review): alpha = 0.1875 has no recorded rationale; it looks hand-tuned.
alpha = 0.1875
Similarity_mean = statistics.mean(Similarity_list)
Similarity_standard_deviation = np.std(Similarity_list)
threshold = Similarity_mean + alpha * Similarity_standard_deviation

# One value per line: mean, standard deviation, threshold.
print(Similarity_mean, Similarity_standard_deviation, threshold, sep="\n")

0.26884615
0.05502868
0.2791640318464488


In [None]:
# Keep the rows whose score reaches the computed threshold. Using `threshold`
# instead of a hand-copied 0.28 keeps this filter in sync when the data, the
# query phrases or alpha change.
result = df[df['Similarity_score'] >= threshold]

In [None]:
# Highest-scoring alerts first.
result = result.sort_values(by='Similarity_score', ascending=False)

In [None]:
# Narrow the flagged rows down to the columns worth reporting.
report_columns = ['source_ip', 'message', 'Similarity_score']
final_res = result.loc[:, report_columns]

In [None]:
# Report the ten highest-scoring alerts (final_res is already sorted by score).
print("Most relevant SQL injection alert from the available dataset: ")
print(final_res.head(10))

Most relevant SQL injection alert from the available dataset: 
          source_ip                                            message  \
3      10.208.0.229  102281393 16aug2021112839 0530 get unionselect...   
79      10.228.0.95  102281394 17aug2021162156 0530 get admin http1...   
59      10.228.0.95  102281393 16aug2021161415 0530 get admin http1...   
85     114.55.9.245  102281394 17aug2021162217 0530 get 3cscript 3e...   
77      10.228.0.95  1022810118 17aug2021160122 0530 get union 20al...   
54    54.36.148.129  102281393 16aug2021163951 0530 get union http1...   
12     10.228.10.59  102281393 16aug2021114005 0530 get union http1...   
14     10.228.10.75  102281393 16aug2021113713 0530 get union http1...   
81  195.154.123.130  102281394 17aug2021162151 0530 get union http1...   
80     114.55.9.245  102281394 17aug2021162200 0530 get sbadmin htt...   

    Similarity_score  
3               0.47  
79              0.43  
59              0.39  
85              0.36  
77     

In [None]:
# Accuracy of the "flagged as SQL injection" decision against the tag labels.
# The previous expression, df['tags'].eq(result['tags']), aligned the two
# Series on their index, so it only measured the share of rows present in
# `result`, not whether the flagged rows were actually SQL injection alerts.
# NOTE(review): 'tags' holds only the first tag of each record, so a record
# whose SQL injection tag is not listed first counts as a negative here.
predicted_sqli = df.index.isin(result.index)
actual_sqli = df['tags'].eq('Possible SQL Injection').to_numpy()
test_data_accuracy = round(float((predicted_sqli == actual_sqli).mean()) * 100, 2)

In [None]:
# Report the accuracy figure (expressed as a percentage).
print("Accuracy of the SQL Injection alert detection: ", test_data_accuracy)

Accuracy of the SQL Injection alert detection:  74.0
