<a href="https://colab.research.google.com/github/mshreeharsha/MiniProject_DNS/blob/main/Mini_Project_DNS_Tunnelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DNS Tunneling Detection**

In [None]:
import math
from collections import Counter

import pandas as pd
from google.colab import drive
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used
# later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## **Data Preprocessing**

In [None]:
import pandas as pd
train_data = pd.read_csv('/content/drive/MyDrive/MiniProject/training.csv')
train_data.columns = ["Label","Query"]

# Keep every Label == 0 row and draw twice as many Label == 1 rows
# (fixed seed), giving a 2:1 class mix in the training frame.
label0 = train_data.loc[train_data['Label'] == 0]
n = label0.shape[0]
label1 = train_data.loc[train_data['Label'] == 1].sample(n=2 * n, random_state=42)

# Label == 1 rows first, then Label == 0 rows; original row indices are kept.
train_data = pd.concat([label1, label0])

In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/MiniProject/validating.csv')
test_data.columns = ["Label","Query"]

# Keep every Label == 0 row and draw the same number of Label == 1 rows
# (fixed seed), giving a balanced 1:1 evaluation frame.
label0 = test_data.loc[test_data['Label'] == 0]
n = label0.shape[0]
label1 = test_data.loc[test_data['Label'] == 1].sample(n=n, random_state=42)

# Label == 1 rows first, then Label == 0 rows; original row indices are kept.
test_data = pd.concat([label1, label0])

In [None]:
#Function to calculate Entropy
def calculate_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    Every distinct character contributes ``-p * log2(p)``, where ``p`` is
    its relative frequency in ``text``. All characters are counted, not
    only code points 0-255, so the frequencies always sum to 1.

    Parameters
    ----------
    text : str
        The string to measure (e.g. a DNS query name).

    Returns
    -------
    int or float
        ``0`` for an empty (falsy) input, otherwise the entropy in bits.
    """
    if not text:
        return 0
    total = len(text)
    counts = Counter(text)  # single pass instead of 256 full scans
    entropy = 0
    # Iterate in code-point order so the floating-point summation order
    # (and therefore the result) matches the previous range(256) loop for
    # inputs that only contain characters below U+0100.
    for ch in sorted(counts):
        p_x = float(counts[ch]) / total
        entropy += - p_x*math.log(p_x, 2)
    return entropy

In [None]:
#Function to count No of Characters
def calculate_no_of_characters(text):
  """Return the total length of ``text`` (every character, dots included)."""
  total = len(text)
  return total

In [None]:
#Function to Count no of Upper case characters
def calculate_no_of_upper_characters(text):
  """Return how many characters of ``text`` satisfy ``str.isupper()``."""
  uppercase = [ch for ch in text if ch.isupper()]
  return len(uppercase)

In [None]:
#Function to Count no of numeric characters

def calculate_no_of_numeric_characters(text):
  """Return how many characters of ``text`` satisfy ``str.isnumeric()``.

  ``isnumeric()`` is broader than the ASCII digits 0-9: it is also true
  for other Unicode numeric characters such as vulgar fractions.
  """
  # Summing booleans counts the True results and yields a plain int.
  return sum(ch.isnumeric() for ch in text)

In [None]:
#Function to count the number of labels (computed as the number of '.' separators)
def calculate_no_of_labels(text):
  """Return the number of '.' separators in ``text``.

  This is one less than the number of dot-separated pieces, so
  ``"www.example.com"`` gives 2 and a string without dots gives 0.
  """
  return text.count('.')

In [None]:
# Features of Train Data
# Compute each per-query feature as its own list, one entry per row of
# train_data['Query'], in row order.
train_queries = train_data['Query']
trainEntropy = [calculate_entropy(q) for q in train_queries]
noOfCharacters = [calculate_no_of_characters(q) for q in train_queries]
noOfUpperCharacters = [calculate_no_of_upper_characters(q) for q in train_queries]
noOfNumericCharacters = [calculate_no_of_numeric_characters(q) for q in train_queries]
noOfLabels = [calculate_no_of_labels(q) for q in train_queries]

# Attach the features as new columns (lists align with rows by position).
train_data['Entropy'] = trainEntropy
train_data['Numeric'] = noOfNumericCharacters
train_data['Upper'] = noOfUpperCharacters
train_data['Labels'] = noOfLabels
train_data['Characters'] = noOfCharacters

train_data.head(5)

In [None]:
# Features of Test Data
# Compute each per-query feature as its own list, one entry per row of
# test_data['Query'], in row order.
test_queries = test_data['Query']
testEntropy = [calculate_entropy(q) for q in test_queries]
noOfCharacters = [calculate_no_of_characters(q) for q in test_queries]
noOfUpperCharacters = [calculate_no_of_upper_characters(q) for q in test_queries]
noOfNumericCharacters = [calculate_no_of_numeric_characters(q) for q in test_queries]
noOfLabels = [calculate_no_of_labels(q) for q in test_queries]

# Attach the features as new columns (lists align with rows by position).
test_data['Entropy'] = testEntropy
test_data['Numeric'] = noOfNumericCharacters
test_data['Upper'] = noOfUpperCharacters
test_data['Labels'] = noOfLabels
test_data['Characters'] = noOfCharacters

test_data.head(5)

In [None]:
# Feature matrix and target vector for training.
X_train = train_data[['Entropy','Characters','Numeric','Upper','Labels']] #Train Data
Y_train = train_data['Label']

# train_data was built by concatenating the two classes, so the rows are
# grouped by label; shuffle them to mix the classes. X and Y are permuted
# together, and the seed is fixed so a rerun yields the same row order.
X_train, Y_train = shuffle(X_train, Y_train, random_state=42)

## **Training**

### *1. Decision Tree Classifier*

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so repeated runs build the same model: without a
# random_state, tie-breaking between equally good splits can differ from
# run to run. 21 matches the seed already used for the SVM in this notebook.
model_dtc = DecisionTreeClassifier(random_state=21)
model_dtc.fit(X_train, Y_train)

### *2. Naive Bayes*

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the five
# per-query features in X_train against the labels in Y_train.
model_nb = GaussianNB()
model_nb.fit(X=X_train, y=Y_train)

### *3. Support Vector Machine*

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel and a fixed seed,
# fitted on the same X_train / Y_train as the other models.
model_svm = SVC(random_state=21, kernel='linear')
model_svm.fit(X=X_train, y=Y_train)

## **Model Testing and Predictions**

In [None]:
# Feature matrix and target vector for evaluation (same columns, same
# order, as used for training).
X_test = test_data[['Entropy','Characters','Numeric','Upper','Labels']] #Test Data
Y_test = test_data['Label']
# Permute X and Y together with a fixed seed so the row order (and hence
# the order of the prediction array below) is identical on every run.
X_test, Y_test = shuffle(X_test, Y_test, random_state=42)


# Score the decision tree on the held-out data.
y_preds_dtc = model_dtc.predict(X_test)
print("Detection Accuracy: ",accuracy_score(Y_test, y_preds_dtc)*100,"%")
print("Classification Report:")
print(classification_report(Y_test, y_preds_dtc))



Detection Accuracy:  99.75 %
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1000
           1       1.00      1.00      1.00      1000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



## **Results**

In [None]:
# Accuracy figures for three train/test class mixes, kept as bare string
# literals. They are not assigned to anything; only the last literal is the
# value of this cell. Presumably these were noted by hand from separate
# runs - the notebook itself does not generate them.
# NOTE(review): "Classiifer" in the third note is a typo for "Classifier".
'''
1. Malicious 3000 Legit 3000 in X Train
Naive Bayes -> Accuracy: 99.85000000000001 %
Decision Tree Classifier -> Accuracy: 99.75 %
SVM -> 100.0 %
'''

'''
2. Malicious 6000 Legit 3000 in X Train, 1000 Legit 2000 Malicious X Test
SVM -> Accuracy: 99.73333333333333 %
Decision Tree Classifier -> Accuracy: 99.8 %
Naive Bayes -> Accuracy: 99.86666666666667 %
'''

'''
3. Malicious 6000 Legit 3000 in X Train, 1000 Legit 1000 Malicious X Test
Naive Bayes -> Accuracy: 99.85000000000001 %
SVM -> Accuracy: 99.6 %
Decision Tree Classiifer -> Accuracy: 99.75 %
'''
