In [2]:
import pandas as pd

In [3]:
# Load data: read every newline-delimited JSON log and stack them into one frame
filepaths = ['../data/dns.json']
frames = []
for path in filepaths:
    # lines=True -> each line of the file is parsed as its own JSON record
    frames.append(pd.read_json(path, lines=True))
df = pd.concat(frames, ignore_index=True)

In [4]:
# Split each query name on dots: the last two labels form the tier-2 domain,
# everything before them is the subdomain (empty string when there is none).
query_labels = df['query'].map(lambda name: str(name).split('.'))
df['t2_domain'] = query_labels.map(lambda labels: '.'.join(labels[-2:]))
df['subdomain'] = query_labels.map(lambda labels: '.'.join(labels[:-2]))

In [5]:
# Per-domain aggregates follow...

# Number of queries seen for each tier-2 domain
num_queries = (
    df.groupby('t2_domain')['subdomain']
    .count()
    .to_frame('num_queries')
)

In [6]:
# Number of distinct subdomains queried under each tier-2 domain
num_unique_subdomains = (
    df.groupby('t2_domain')['subdomain']
    .nunique()
    .to_frame('num_unique_subdomains')
)

In [7]:
# Number of distinct id_orig_h values that queried each tier-2 domain
num_unique_id_orig = (
    df.groupby('t2_domain')['id_orig_h']
    .nunique()
    .to_frame('num_unique_id_orig')
)

In [8]:
# Summarize the data: one row per tier-2 domain, one column per count
summary_df = pd.concat(
    [num_queries, num_unique_subdomains, num_unique_id_orig],
    axis=1,
)
# Queries per unique subdomain
summary_df['ratio'] = summary_df['num_queries'].div(summary_df['num_unique_subdomains'])
# num_unique_subdomains ** (1 / ratio): exaggerates entries that combine many
# unique subdomains with a low 'ratio'
inverse_ratio = 1 / summary_df['ratio']
summary_df['subdom^inv_ratio'] = summary_df['num_unique_subdomains'] ** inverse_ratio

In [9]:
# Sanity check: show the five rows with the largest 'subdom^inv_ratio'
summary_df.sort_values(by='subdom^inv_ratio', ascending=False).iloc[:5]

Unnamed: 0_level_0,num_queries,num_unique_subdomains,num_unique_id_orig,ratio,subdom^inv_ratio
t2_domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
calculus.ski,1130,1125,1,1.004444,1090.565733
sourceforge.net,4,4,1,1.0,4.0
msedge.net,20,8,1,2.5,2.297397
taboola.com,2,2,1,1.0,2.0
slashdotmedia.com,3,2,1,1.5,1.587401


In [10]:
# "Normal" is defined with very naive statistics: the mean and standard
# deviation of every summary column, one row per column.
means, stds = summary_df.mean(), summary_df.std()
stats = pd.DataFrame({'mean': means, 'std': stds})

def lower_bound(mean, std, n_std=1):
    """Return the lower edge of the "normal" band, clamped at zero.

    Parameters
    ----------
    mean : number
        Centre of the band.
    std : number
        Standard deviation used as the band's unit width.
    n_std : number, optional
        How many standard deviations below the mean the edge lies.
        Defaults to 1, which reproduces the original ``mean - std`` rule.

    Returns
    -------
    number
        ``mean - n_std * std``, or 0 when that value would be negative.
    """
    return max(mean - n_std * std, 0)  # negative values don't make sense
        
def upper_bound(mean, std, n_std=1):
    """Return the upper edge of the "normal" band.

    Parameters
    ----------
    mean : number
        Centre of the band.
    std : number
        Standard deviation used as the band's unit width.
    n_std : number, optional
        How many standard deviations above the mean the edge lies.
        Defaults to 1, which reproduces the original ``mean + std`` rule.

    Returns
    -------
    number
        ``mean + n_std * std``.
    """
    return mean + n_std * std

In [11]:
# Flags root domains that have an unusually large number of unique subdomains.
def detect_tunnel_subdomains(summary, stats):
    """Return the set of domains with many unique subdomains.

    Parameters
    ----------
    summary : DataFrame
        Indexed by domain; must contain a 'num_unique_subdomains' column.
    stats : DataFrame
        Indexed by metric name; must contain 'mean' and 'std' columns.

    Returns
    -------
    set
        Index labels of ``summary`` whose 'num_unique_subdomains' is greater
        than or equal to ``upper_bound(mean, std)`` for that metric.
    """
    metric_stats = stats.loc['num_unique_subdomains']
    threshold = upper_bound(metric_stats['mean'], metric_stats['std'])
    is_suspicious = summary['num_unique_subdomains'].ge(threshold)
    return set(summary.loc[is_suspicious].index)

# Domains flagged by the unique-subdomain-count heuristic
susp_subdomains = detect_tunnel_subdomains(summary=summary_df, stats=stats)

In [12]:
# Flags root domains whose queries-per-unique-subdomain ratio is not above
# the upper bound of "normal".
def detect_tunnel_ratio(summary, stats):
    """Return the set of domains whose 'ratio' is at or below mean + std.

    Parameters
    ----------
    summary : DataFrame
        Indexed by domain; must contain a 'ratio' column.
    stats : DataFrame
        Indexed by metric name; must contain 'mean' and 'std' columns.

    Returns
    -------
    set
        Index labels of ``summary`` whose 'ratio' is less than or equal to
        ``upper_bound(mean, std)`` for that metric.

    NOTE(review): the goal is to find domains with a *low* ratio, yet the
    cut-off is the upper bound, so only high-ratio outliers are excluded.
    Confirm this threshold is intended.
    """
    metric_stats = stats.loc['ratio']
    threshold = upper_bound(metric_stats['mean'], metric_stats['std'])
    is_suspicious = summary['ratio'].le(threshold)
    return set(summary.loc[is_suspicious].index)

# Domains flagged by the ratio heuristic
susp_ratio = detect_tunnel_ratio(summary=summary_df, stats=stats)

In [13]:
# Flags root domains with an unusually high (num_subdom)^(1/ratio) score.
def detect_tunnel_pow(summary, stats):
    """Return the set of domains with a high 'subdom^inv_ratio' value.

    Parameters
    ----------
    summary : DataFrame
        Indexed by domain; must contain a 'subdom^inv_ratio' column.
    stats : DataFrame
        Indexed by metric name; must contain 'mean' and 'std' columns.

    Returns
    -------
    set
        Index labels of ``summary`` whose 'subdom^inv_ratio' is greater than
        or equal to ``upper_bound(mean, std)`` for that metric.
    """
    metric_stats = stats.loc['subdom^inv_ratio']
    threshold = upper_bound(metric_stats['mean'], metric_stats['std'])
    is_suspicious = summary['subdom^inv_ratio'].ge(threshold)
    return set(summary.loc[is_suspicious].index)

# Domains flagged by the (num_subdom)^(1/ratio) heuristic
susp_pow = detect_tunnel_pow(summary=summary_df, stats=stats)

In [14]:
# Domains judged suspicious by every one of the heuristics (set intersection)
set.intersection(susp_subdomains, susp_ratio, susp_pow)

{'calculus.ski'}