In [19]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../src/")

from src import CSV_DELIMITER
from src.fingerprint import Fingerprint


# Relative path to the data directory; the dataset CSV is read from its
# "processed/" subfolder.
DATA_PATH_REL = "../../data/"

In [33]:
# Columns left out of the per-attribute statistics computed in this notebook.
ATTRIBUTES_TO_EXCLUDE = [
    Fingerprint.COUNTER,
    Fingerprint.CREATION_TIME,
    Fingerprint.END_TIME,
    Fingerprint.CONSISTENT,
]

In [7]:
# Load the processed fingerprint dataset (`sep` is the same option as its
# `delimiter` alias).
df = pd.read_csv(
    DATA_PATH_REL + "processed/fingerprint_dataset.csv",
    sep=CSV_DELIMITER,
    engine="python",
)

In [34]:
# Every column of the dataset except those listed in ATTRIBUTES_TO_EXCLUDE.
attributes_to_analyze = [
    attribute
    for attribute in df.columns
    if attribute not in ATTRIBUTES_TO_EXCLUDE
]

print("Statistics on the whole dataset: consistent + inconsistent browsers")
for attribute in attributes_to_analyze:
    # nunique() counts the distinct non-missing values of the column.
    nb_distinct_values = df[attribute].nunique()
    print("{} : {:d} distinct values".format(attribute, nb_distinct_values))

Statistics on the whole dataset: consistent + inconsistent browsers
id : 5808 distinct values
addressHttp : 23302 distinct values
userAgentHttp : 3550 distinct values
acceptHttp : 16 distinct values
connectionHttp : 2 distinct values
encodingHttp : 17 distinct values
languageHttp : 660 distinct values
orderHttp : 201 distinct values
pluginsJSHashed : 5843 distinct values
platformJS : 32 distinct values
cookiesJS : 2 distinct values
dntJS : 2 distinct values
timezoneJS : 35 distinct values
resolutionJS : 595 distinct values
localJS : 2 distinct values
sessionJS : 2 distinct values
canvasJSHashed : 4001 distinct values
fontsFlashHashed : 5360 distinct values
resolutionFlash : 311 distinct values
languageFlash : 27 distinct values
platformFlash : 290 distinct values


In [12]:
# Keep only the rows whose consistency flag compares equal to True.
df_consistent = df.loc[df[Fingerprint.CONSISTENT].eq(True)]

In [15]:
# Same distinct-value report as for the whole dataset, restricted to the rows
# kept in df_consistent.
print("Statistics on dataset filtered: only consistent browsers")
for attribute in attributes_to_analyze:
    # nunique() counts the distinct non-missing values of the column.
    nb_distinct_values = df_consistent[attribute].nunique()
    print("{} : {:d} distinct values".format(attribute, nb_distinct_values))

Statistics on dataset filtered: only consistent browsers
id : 2128 distinct values
addressHttp : 1893 distinct values
userAgentHttp : 629 distinct values
acceptHttp : 6 distinct values
connectionHttp : 1 distinct values
encodingHttp : 10 distinct values
languageHttp : 222 distinct values
orderHttp : 47 distinct values
pluginsJSHashed : 872 distinct values
platformJS : 16 distinct values
cookiesJS : 2 distinct values
dntJS : 2 distinct values
timezoneJS : 29 distinct values
resolutionJS : 135 distinct values
localJS : 2 distinct values
sessionJS : 2 distinct values
canvasJSHashed : 480 distinct values
fontsFlashHashed : 612 distinct values
resolutionFlash : 74 distinct values
languageFlash : 22 distinct values
platformFlash : 76 distinct values
consistent : 1 distinct values


In [43]:
# Caveat: this is not fully precise. When the same browser appears several times
# with the same value, the measured entropy decreases, which should not be the case.

def compute_entropy(df, attribute):
    """Return the Shannon entropy, in bits, of one column's value distribution.

    The distribution comes from ``value_counts()``, so with pandas' default
    settings missing values (NaN) are not counted.

    Args:
        df: pandas DataFrame holding the column.
        attribute: label of the column to measure.

    Returns:
        float: ``-sum(p_i * log2(p_i))`` over the distinct values, where
        ``p_i`` is the share of counted rows holding value ``i``. ``0.0`` when
        the column has at most one distinct value (including an empty column).
    """
    counts = df[attribute].value_counts()

    # Categorical columns can report unused categories with a count of 0;
    # they contribute nothing, and 0 * log2(0) would otherwise produce NaN.
    counts = counts[counts > 0]

    # Zero or one distinct value carries no information. Returning here also
    # avoids dividing by a zero total for an empty column.
    if len(counts) <= 1:
        return 0.0

    # Vectorised rather than looping over Series.iteritems(), which was
    # removed in pandas 2.0.
    probabilities = counts / float(counts.sum())
    return float(-(probabilities * np.log2(probabilities)).sum())

def compute_normalized_entropy(df, attribute):
    """Return the column's entropy divided by its worst-case entropy.

    The worst case is a column in which every counted row holds a different
    value. With ``n`` counted rows each value then has probability ``1/n`` and
    the entropy is ``-n * (1/n) * log2(1/n)``, which equals ``log2(n)``.

    Args:
        df: pandas DataFrame holding the column.
        attribute: label of the column to measure.

    Returns:
        float: ``compute_entropy(df, attribute) / log2(n)``. ``0.0`` when the
        column has zero or one counted row, where the worst case is undefined
        or zero.
    """
    # Sum of the per-value counts: the number of counted (non-missing) rows,
    # not the number of distinct values.
    total_rows = df[attribute].value_counts().sum()

    # n == 0 used to raise ZeroDivisionError and n == 1 makes the worst case 0,
    # so both are treated as "no entropy".
    if total_rows <= 1:
        return 0.0

    worst_case = np.log2(float(total_rows))
    return compute_entropy(df, attribute) / worst_case


# Report the normalised entropy of each analysed attribute for the rows kept in
# df_consistent. The printed values are ratios (entropy / worst case), although
# the message suffix says "bits".
print("Entropy on dataset filtered: only consistent browsers")
for attribute in attributes_to_analyze:
    entropy = compute_normalized_entropy(df_consistent, attribute)
    print("{} : {:f} bits".format(attribute, entropy))


Entropy on dataset filtered: only consistent browsers
id : 1.000000 bits
addressHttp : 0.975284 bits
userAgentHttp : 0.755265 bits
acceptHttp : 0.088435 bits
connectionHttp : 0.000000 bits
encodingHttp : 0.169798 bits
languageHttp : 0.383888 bits
orderHttp : 0.255276 bits
pluginsJSHashed : 0.709155 bits
platformJS : 0.153947 bits
cookiesJS : 0.012660 bits
dntJS : 0.088182 bits
timezoneJS : 0.327736 bits
resolutionJS : 0.344783 bits
localJS : 0.044435 bits
sessionJS : 0.044435 bits
canvasJSHashed : 0.650355 bits
fontsFlashHashed : 0.465055 bits
resolutionFlash : 0.274537 bits
languageFlash : 0.231278 bits
platformFlash : 0.263046 bits
