# CookieGuard - Data Collection Notebook

This notebook demonstrates the cookie data collection process for training the ML model.

In [None]:
import sys
sys.path.append('../src')

from data_collector import CookieDataCollector
import json
import pandas as pd

## 1. Initialize Data Collector

In [None]:
# Directory handed to the collector as its `output_dir`, relative to the
# notebook's working directory.
RAW_OUTPUT_DIR = '../data/raw'

# One collector instance is created here and reused by the later cells.
collector = CookieDataCollector(output_dir=RAW_OUTPUT_DIR)

## 2. Fetch EasyList Tracking Domains

In [None]:
# Ask the collector for the tracking-domain list, then print a short report:
# the total count followed by a preview of at most the first ten entries.
tracking_domains = collector.fetch_easylist_domains()

domain_report = [
    f"Fetched {len(tracking_domains)} tracking domains",
    "\nSample domains:",
]
domain_report.extend(f"  - {entry}" for entry in tracking_domains[:10])
print("\n".join(domain_report))

## 3. Generate URL List for Cookie Collection

In [None]:
# Build the list of URLs to visit and print it in full: the count first,
# then one bullet per URL.
urls = collector.generate_training_urls()

url_report = [
    f"Generated {len(urls)} URLs for cookie collection",
    "\nURLs:",
]
url_report.extend(f"  - {target}" for target in urls)
print("\n".join(url_report))

## 4. Collect Cookies from URLs

In [None]:
# File name passed to the collector as `output_file`.
COOKIES_OUTPUT_FILE = 'cookies.json'

# Run the collection over every generated URL; the returned records feed the
# analysis cell below.
cookies = collector.collect_from_url_list(urls, output_file=COOKIES_OUTPUT_FILE)

## 5. Analyze Collected Cookies

In [None]:
# Load the collected cookie records into a DataFrame and print summary
# statistics: total count, domain distribution, and flag counts.
df = pd.DataFrame(cookies)
print(f"Total cookies collected: {len(df)}")

if df.empty:
    # With no records the DataFrame has no columns, so the column lookups
    # below would raise KeyError; report the situation instead of crashing.
    print("\nNo cookies collected; skipping summary statistics.")
else:
    print(f"\nUnique domains: {df['domain'].nunique()}")
    print("\nTop 10 domains by cookie count:")
    print(df['domain'].value_counts().head(10))

    print(f"\nCookies with secure flag: {df['secure'].sum()}")
    print(f"Cookies with httpOnly flag: {df['httpOnly'].sum()}")

    # A cookie with no expirationDate value is counted as a session cookie.
    # The column only exists when at least one record carries the key, so
    # when it is absent every collected cookie falls into that category.
    if 'expirationDate' in df.columns:
        session_count = df['expirationDate'].isna().sum()
    else:
        session_count = len(df)
    print(f"Session cookies: {session_count}")

## 6. Sample Cookie Data

In [None]:
# Preview the first ten rows; this must stay the cell's final expression so
# the notebook renders the resulting table.
df.head(n=10)

In [None]:
# Shut the collector down once collection and analysis are finished.
# NOTE(review): presumably this releases whatever session/browser resources
# the collector holds — confirm against CookieDataCollector.close().
collector.close()