# CookieGuard - Feature Engineering

This notebook demonstrates the feature extraction process for cookie classification.

In [None]:
import sys
sys.path.append('../src')

from feature_extractor import FeatureExtractor
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Cookie Data

In [None]:
# Load the raw cookie dump that the rest of the notebook analyses.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text in the file.
with open('../data/raw/cookies.json', 'r', encoding='utf-8') as f:
    cookies = json.load(f)

print(f"Loaded {len(cookies)} cookies")

## 2. Extract Features

In [None]:
# Build one feature record per cookie, limited to the first 1000 entries,
# and carry the cookie's own name and domain along with the extracted features.
extractor = FeatureExtractor()
features_list = []

for raw_cookie in cookies[:1000]:
    record = extractor.extract_features(raw_cookie)
    # Copy the identifying fields onto the record returned by the extractor.
    for key in ('name', 'domain'):
        record[key] = raw_cookie[key]
    features_list.append(record)

# One row per processed cookie, one column per feature.
features_df = pd.DataFrame(features_list)
print(f"Extracted features for {len(features_df)} cookies")

# Last expression of the cell: the notebook renders the first rows.
features_df.head()

## 3. Feature Statistics

In [None]:
# Keep only the numeric columns; 'number' is the string selector pandas
# documents as equivalent to np.number.
numeric_features = features_df.select_dtypes(include='number')

# Last expression of the cell: the notebook renders the summary table.
numeric_features.describe()

## 4. Feature Distributions

In [None]:
# Draw one histogram per numeric feature on a 4 x 4 grid (at most 16 panels).
fig, axes = plt.subplots(4, 4, figsize=(16, 12))
axes = axes.flatten()

# Never plot more columns than there are panels in the grid.
plotted_columns = numeric_features.columns[:len(axes)]

for ax, col in zip(axes, plotted_columns):
    ax.hist(numeric_features[col], bins=30, edgecolor='black')
    ax.set_title(col)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# With fewer numeric columns than panels, the leftover axes would otherwise
# be drawn as empty frames; hide them so only real histograms are shown.
for unused_ax in axes[len(plotted_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. Feature Correlations

In [None]:
# Render the pairwise correlations of the numeric features as an annotated heatmap.
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_features.corr()

# Display options: print each coefficient with two decimals on square cells.
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'square': True,
}
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 6. Pattern Matching Analysis

In [None]:
# Report the column sum of each pattern-flag feature.
# Each entry pairs the printed label with the features_df column it summarises.
pattern_summaries = (
    ("Cookies matching tracking patterns", 'matchesTrackingPattern'),
    ("Cookies with potential PII", 'hasPII'),
    ("Cookies with UUID format", 'hasUUID'),
    ("Cookies with numeric-only values", 'hasNumericOnly'),
    ("Cookies with Base64 values", 'hasBase64'),
)

for label, column in pattern_summaries:
    print(f"{label}: {features_df[column].sum()}")