# **Data Collection & Feature Extraction on DGA**

### **Objective : Collect data and extract necessary features from that data to train Machine Learning models**

# **1.0] Data Collection**

In [128]:
pip install publicsuffixlist




In [129]:
import pandas as pd
import re
import math
import collections
from publicsuffixlist import PublicSuffixList

In [130]:
f_path = "E:\\University\\Year 3\\Methods for detecting cyber attacks\\Project\\datasets\\dga_domains_full.csv"
df = pd.read_csv(f_path,names=["label","dga_type","url"], header=None)

df.head(10)

Unnamed: 0,label,dga_type,url
0,dga,gozi,mortiscontrastatim.com
1,dga,corebot,cvyh1po636avyrsxebwbkn7.ddns.net
2,legit,alexa,plasticbags.sa.com
3,legit,alexa,mzltrack.com
4,legit,alexa,miss-slim.ru
5,dga,ranbyus,txumyqrubwutbb.cc
6,legit,alexa,myhostingpack.com
7,dga,symmi,ixekrihagimau.ddns.net
8,dga,emotet,rjyuosmhfnaedlyg.eu
9,dga,dircrypt,djqrmauttlloabj.com


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674898 entries, 0 to 674897
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   label     674898 non-null  object
 1   dga_type  674898 non-null  object
 2   url       674898 non-null  object
dtypes: object(3)
memory usage: 15.4+ MB


In [132]:
df.shape

(674898, 3)

In [133]:
# Printing number of legit and fraud domain urls
df["label"].value_counts()

dga      337500
legit    337398
Name: label, dtype: int64

In [134]:
def labelTo_Binary(type):
  # Convert Type to Binary variable DGA = 1, Normal = 0
  if type == 'dga':
    return 1
  else:
    return 0

df['label'] = df['label'].apply(lambda i: labelTo_Binary(i))

In [135]:
df.head(10)

Unnamed: 0,label,dga_type,url
0,1,gozi,mortiscontrastatim.com
1,1,corebot,cvyh1po636avyrsxebwbkn7.ddns.net
2,0,alexa,plasticbags.sa.com
3,0,alexa,mzltrack.com
4,0,alexa,miss-slim.ru
5,1,ranbyus,txumyqrubwutbb.cc
6,0,alexa,myhostingpack.com
7,1,symmi,ixekrihagimau.ddns.net
8,1,emotet,rjyuosmhfnaedlyg.eu
9,1,dircrypt,djqrmauttlloabj.com


# **2.0] Feature Extraction**

In [136]:
psl = PublicSuffixList()

In [137]:
# Load Valid Top Level Domains data
def load_topLevelDomain():
  topLevelDomain = []
  with open('./tlds-alpha-by-domain.txt', 'r') as content:
      for line in content:
          topLevelDomain.append((line.strip('\n')))
  return topLevelDomain

In [138]:
topLevelDomain = load_topLevelDomain()

In [139]:
def ignoreVPS(domain):
    # Return the rest of domain after ignoring the Valid Public Suffixes:
    validPublicSuffix = '.' + psl.publicsuffix(domain)
    if len(validPublicSuffix) < len(domain):
         # If it has VPS
        subString = domain[0: domain.index(validPublicSuffix)]  
    elif len(validPublicSuffix) == len(domain):
        return 0
    else:
        # If not
        subString = domain
    
    return subString

In [140]:
def domain_length(domain):
  # Generate Domain Name Length (DNL)
  return len(domain)

In [141]:
def subdomains_number(domain):
  # Generate Number of Subdomains (NoS)
  subdomain = ignoreVPS(domain)
  return (subdomain.count('.') + 1)

In [142]:
def subdomain_length_mean(domain):
  # enerate Subdomain Length Mean (SLM) 
  subdomain = ignoreVPS(domain)
  result = (len(subdomain) - subdomain.count('.')) / (subdomain.count('.') + 1)
  return result


In [143]:
def has_www_prefix(domain):
  # Generate Has www Prefix (HwP)
  if domain.split('.')[0] == 'www':
    return 1
  else:
    return 0

In [144]:
def has_hvltd(domain):
  # Generate Has a Valid Top Level Domain (HVTLD)
  if domain.split('.')[len(domain.split('.')) - 1].upper() in topLevelDomain:
    return 1
  else:
    return 0

In [145]:
def contains_single_character_subdomain(domain):
  # Generate Contains Single-Character Subdomain (CSCS) 
  domain = ignoreVPS(domain)
  str_split = domain.split('.')
  minLength = len(str_split[0])
  for i in range(0, len(str_split) - 1):
      minLength = len(str_split[i]) if len(str_split[i]) < minLength else minLength
  if minLength == 1:
    return 1
  else:
    return 0

In [146]:
def contains_TLD_subdomain(domain):
  # Generate Contains TLD as Subdomain (CTS)
  subdomain = ignoreVPS(domain)
  str_split = subdomain.split('.')
  for i in range(0, len(str_split) - 1):
        if str_split[i].upper() in topLevelDomain:
            return 1
  return 0

In [147]:
def underscore_ratio(domain):
  # Generate Underscore Ratio (UR) on dataset
  subString = ignoreVPS(domain)
  result = subString.count('_') / (len(subString) - subString.count('.'))
  return result

In [148]:
def contains_IP_address(domain):
  # Generate Contains IP Address (CIPA) on datasetx
    splitSet = domain.split('.')
    for element in splitSet:
        if(re.match("\d+", element)) == None:
            return 0
    return 1  


In [149]:
def contains_digit(domain):
  """
   Contains Digits 
  """
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isdigit():
      return 1
  return 0

In [150]:
def vowel_ratio(domain):
  """
  calculate Vowel Ratio 
  """
  VOWELS = set('aeiou')
  v_counter = 0
  a_counter = 0
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isalpha():
      a_counter+=1
      if item in VOWELS:
        v_counter+=1
  if a_counter>1:
    ratio = v_counter/a_counter
    return ratio

In [151]:
def digit_ratio(domain):
  """
  calculate digit ratio
  """
  d_counter = 0
  counter = 0
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isalpha() or item.isdigit():
      counter+=1
      if item.isdigit():
        d_counter+=1
  if counter>1:
    ratio = d_counter/counter
    return ratio

In [152]:
def prc_rrc(domain):
  """
  calculate the Ratio of Repeated Characters in a subdomain
  """
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  subdomain = re.sub("[.]", "", subdomain)
  char_num=0
  repeated_char_num=0
  d = collections.defaultdict(int)
  for c in list(subdomain):
      d[c] += 1
  for item in d:
    char_num +=1
    if d[item]>1:
      repeated_char_num +=1
  ratio = repeated_char_num/char_num
  return ratio

In [153]:
def prc_rcd(domain):
  """
  calculate the ratio of consecutive digits
  """
  counter = 0
  digit_counter=0
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  for item in subdomain:
    i = 0
    if item.isdigit():
      counter+=1
    else:
      if counter>1:
        digit_counter+=counter
      counter=0
    i+=1
  if i==len(subdomain) and counter>1:
    digit_counter+=counter
  ratio = digit_counter/len(subdomain)
  return ratio

In [154]:
def prc_rcc(domain):
  """
  calculate the Ratio of Consecutive Consonants
  """
  VOWELS = set('aeiou')
  counter = 0
  cons_counter=0
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  for item in subdomain:
    i = 0
    if item.isalpha() and item not in VOWELS:
      counter+=1
    else:
      if counter>1:
        cons_counter+=counter
      counter=0
    i+=1
  if i==len(subdomain) and counter>1:
    cons_counter+=counter
  ratio = cons_counter/len(subdomain)
  return ratio   

In [155]:
def prc_entropy(domain):
    """
    calculate the entropy of subdomain
    :param domain_str: subdomain
    :return: the value of entropy
    """
    subdomain = ignoreVPS(domain)
    # get probability of chars in string
    prob = [float(subdomain.count(c)) / len(subdomain) for c in dict.fromkeys(list(subdomain))]

    # calculate the entropy
    entropy = - sum([p * math.log(p) / math.log(2.0) for p in prob])
    return entropy

In [156]:
def extract_features():
  df['DNL'] = df['url'].apply(lambda x: domain_length(x))
  df['NoS'] = df['url'].apply(lambda x: subdomains_number(x))
  df['SLM'] = df['url'].apply(lambda x: subdomain_length_mean(x))
# df['HwP'] = df['url'].apply(lambda x: has_www_prefix(x))
  df['HVTLD'] = df['url'].apply(lambda x: has_hvltd(x))
# df['CSCS'] = df['url'].apply(lambda x: contains_single_character_subdomain(x))
  df['CTS'] = df['url'].apply(lambda x: contains_TLD_subdomain(x))
  df['UR'] = df['url'].apply(lambda x: underscore_ratio(x))
# df['CIPA'] = df['url'].apply(lambda x: contains_IP_address(x))
  df['contains_digit']= df['url'].apply(lambda x:contains_digit(x))
  df['vowel_ratio']= df['url'].apply(lambda x:vowel_ratio(x))
  df['digit_ratio']= df['url'].apply(lambda x:digit_ratio(x))
  df['RRC']= df['url'].apply(lambda x:prc_rrc(x))
  df['RCC']= df['url'].apply(lambda x:prc_rcc(x))
  df['RCD']= df['url'].apply(lambda x:prc_rcd(x))
  df['Entropy']= df['url'].apply(lambda x:prc_entropy(x))

In [157]:
extract_features()
df.head()

Unnamed: 0,label,dga_type,url,DNL,NoS,SLM,HVTLD,CTS,UR,contains_digit,vowel_ratio,digit_ratio,RRC,RCC,RCD,Entropy
0,1,gozi,mortiscontrastatim.com,22,1,18.0,1,0,0.0,0,0.333333,0.0,0.777778,0.5,0.0,3.058814
1,1,corebot,cvyh1po636avyrsxebwbkn7.ddns.net,32,1,23.0,1,0,0.0,1,0.166667,0.217391,0.210526,0.608696,0.130435,4.175736
2,0,alexa,plasticbags.sa.com,18,1,11.0,1,0,0.0,0,0.272727,0.0,0.222222,0.545455,0.0,3.095795
3,0,alexa,mzltrack.com,12,1,8.0,1,0,0.0,0,0.125,0.0,0.0,0.625,0.0,3.0
4,0,alexa,miss-slim.ru,12,1,9.0,1,0,0.0,0,0.25,0.0,0.6,0.444444,0.0,2.19716


In [158]:
df.to_csv("DGA_Processed.csv")