# CESNET-QUIC22 Day Sample Check
A quick sanity check on a sample of the first day's flows (the first 10,000 rows) to see which columns are available and how the labels are distributed.

In [4]:
from pathlib import Path
import pandas as pd

# Load a manageable slice of the first day's QUIC flows to inspect schema and labels.
data_path = Path('/Users/mehulbasu/Desktop/740-project/dataset/raw/cesnet-quic22/W-2022-44/1_Mon/flows-20221031.csv.gz')
if not data_path.exists():
    # Fail early with the full path so a wrong dataset location is obvious.
    raise FileNotFoundError(f'Missing data file: {data_path}')

# nrows caps the read at the first 10,000 rows rather than loading the whole file.
df = pd.read_csv(data_path, nrows=10000)
print(f'Loaded rows: {len(df):,}')
print('Columns:')
print(sorted(df.columns))

# Every label column is checked before use so a file without it does not
# abort the remaining summaries with a KeyError.
if 'APP' in df.columns:
    print('APP label distribution (top 10):')
    print(df['APP'].value_counts().head(10))
else:
    print('APP column not found; skipping APP label distribution.')

if 'CATEGORY' in df.columns:
    print('CATEGORY distribution (top 10):')
    print(df['CATEGORY'].value_counts().head(10))

# The pair counts need both label columns to be present.
if {'APP', 'CATEGORY'} <= set(df.columns):
    print('APP/CATEGORY pairs (top 10):')
    print(df.groupby(['CATEGORY', 'APP']).size().sort_values(ascending=False).head(10))

Loaded rows: 10,000
Columns:
['APP', 'BYTES', 'BYTES_REV', 'CATEGORY', 'DST_ASN', 'DST_IP', 'DST_PORT', 'DURATION', 'FLOW_ENDREASON_ACTIVE', 'FLOW_ENDREASON_IDLE', 'FLOW_ENDREASON_OTHER', 'ID', 'PACKETS', 'PACKETS_REV', 'PHIST_DST_IPT', 'PHIST_DST_SIZES', 'PHIST_SRC_IPT', 'PHIST_SRC_SIZES', 'PPI', 'PPI_DURATION', 'PPI_LEN', 'PPI_ROUNDTRIPS', 'PROTOCOL', 'QUIC_SNI', 'QUIC_USERAGENT', 'QUIC_VERSION', 'SRC_IP', 'SRC_PORT', 'TIME_FIRST', 'TIME_LAST']
APP label distribution (top 10):
APP
instagram             913
google-services       600
youtube               587
facebook-graph        570
default-background    530
google-www            512
google-play           510
discord               504
google-gstatic        433
facebook-web          401
Name: count, dtype: int64
CATEGORY distribution (top 10):
CATEGORY
Other services and APIs    2488
Streaming media            2130
Social                     1424
default                     852
Instant messaging           624
Advertising              