In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import ast
import re
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source

def extract_memory_value(memory_str):
    try:
        return float(memory_str.split()[0])
    except (ValueError, IndexError):
        return None

def load_df(file_path):
    df = pd.read_csv(file_path)
    df['Attributes'] = df['Attributes'].apply(ast.literal_eval)
    for key in set().union(*(df['Attributes'].dropna().tolist())):
        df[key] = df['Attributes'].apply(lambda x: x.get(key) if isinstance(x, dict) else None)
    df['Memory_GB'] = df['Attributes'].apply(lambda x: extract_memory_value(x.get('Memory', '0 GB')))
    cols_to_del = ['IP', 'ID', 'Log', 'Attributes', 'AttributesHash', 'Audio', 'Fonts', 'Geom Canvas', 'MediaHash', 'Name', 'Plugins', 'PluginsHash', 'TXT Canvas', 'Media Capabilities', 'Encryption methods', 'Brave', 'Navigator Vendor', 'Do not track', 'Shading Langueage Versions', 'Browser permissions', 'Browser core', 'Unmasked Renderer', 'Unmasked Vendor', 'Browser name', 'Renderer', 'Vendor', 'Memory']
    df = df.drop(columns=[col for col in cols_to_del if col in df.columns], errors='ignore')
    return df

# Only files with Safari, Firefox, or Chrome in their names
path = '../data/browser_data/'
pattern = f"{path}/*.csv"
csv_files = [f for f in glob.glob(pattern) if any(x in f for x in ["Safari", "Firefox", "Chrome"])]

df_list = []
for file in csv_files:
    df = load_df(file)
    # Add a column with the file name (or a label derived from it)
    df['SourceFile'] = file.split('/')[-1].replace('.csv', '')
    df_list.append(df)

combined_data = pd.concat(df_list)
combined_data.replace({'Disabled': 0, 'Enabled': 1}, inplace=True)
combined_data.replace({'False': 0, 'True': 1}, inplace=True)
if 'Color Depth' in combined_data.columns:
    combined_data['Color Depth'] = combined_data['Color Depth'].astype(str).str.extract('(\d+)').astype(float)

# Hash all string columns except the target
for col in combined_data.columns:
    if combined_data[col].dtype == 'object' and col != 'SourceFile':
        combined_data[col] = combined_data[col].apply(lambda x: hash(x) if pd.notnull(x) else 0)

# Target is now the file label
target = 'SourceFile'
features = combined_data.columns.to_list()
features.remove(target)

X = combined_data[features]
y = combined_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42, max_depth=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Visualize the tree
dot_data = export_graphviz(
    clf, out_file=None,
    feature_names=features,
    class_names=clf.classes_.astype(str),
    filled=True, rounded=True,
    special_characters=True
)
graph = Source(dot_data)
graph.render('decision_tree_sourcefile', format='png', view=True)

  combined_data['Color Depth'] = combined_data['Color Depth'].astype(str).str.extract('(\d+)').astype(float)
  combined_data = pd.concat(df_list)
  combined_data.replace({'Disabled': 0, 'Enabled': 1}, inplace=True)
  combined_data.replace({'False': 0, 'True': 1}, inplace=True)


Accuracy: 0.616


'decision_tree_sourcefile.png'

In [2]:
# After fitting your clf and making predictions

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification report (precision, recall, f1-score, support for each class)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=clf.classes_))

# Tree depth
print("\nTree Depth:", clf.get_depth())

# Number of leaves
print("Number of Leaves:", clf.get_n_leaves())

# Feature importances
importances = pd.Series(clf.feature_importances_, index=features)
print("\nFeature Importances:")
print(importances.sort_values(ascending=False))

# Optionally, show the top N most important features
N = 10
print(f"\nTop {N} Features:")
print(importances.sort_values(ascending=False).head(N))

# Number of nodes
print("\nNumber of Nodes:", clf.tree_.node_count)

# Minimum samples per leaf
print("Min samples per leaf:", np.min(clf.tree_.n_node_samples))

# Maximum samples in a leaf
print("Max samples in a leaf:", np.max(clf.tree_.n_node_samples))

Accuracy: 0.616

Classification Report:
                                precision    recall  f1-score   support

PC_A_Ubuntu_Firefox_Aggressive       0.00      0.00      0.00        12
   PC_A_Ubuntu_Firefox_Classic       0.40      1.00      0.57         8
 PC_A_WIN10_Firefox_Aggressive       0.50      1.00      0.67        21
    PC_A_WIN10_Firefox_Classic       0.00      0.00      0.00        21
 PC_B_WIN11_Firefox_Aggressive       0.00      0.00      0.00        15
    PC_B_WIN11_Firefox_Classic       0.57      1.00      0.73        20
 PC_C_MacOS_Firefox_Aggressive       1.00      1.00      1.00        19
             PC_C_MacOS_Safari       1.00      1.00      1.00         2
     PC_C_MacOS_Safari_Private       1.00      1.00      1.00         7

                      accuracy                           0.62       125
                     macro avg       0.50      0.67      0.55       125
                  weighted avg       0.43      0.62      0.49       125


Confusion Matrix:
[[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
