## Imports

In [None]:
import os
import gzip
from warcio.archiveiterator import ArchiveIterator
import tldextract # easier to import
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

## Get file names

In [None]:
# Directory that holds the input files, relative to the notebook's working dir.
data_path = '../data'

# List the directory once, then keep only regular files (sub-directories are
# dropped). Entries are stored as bare names, not full paths.
dir_entries = os.listdir(data_path)
files = [name for name in dir_entries if os.path.isfile(os.path.join(data_path, name))]

# Number of input files found.
len_files = len(files)

## Count domains

In [None]:
# Accumulators populated by the archive scan further down in the notebook.

# Scalar counter: number of records that were counted as pages.
total_pages_valid = 0

# Ordered collections (one entry per page).
all_urls = list()
tokens_per_page = list()

# Domain-level aggregates: distinct domains and the page count for each.
domains = set()
pages_per_domain = Counter()

In [None]:
# Scan every gzip-compressed WARC file and accumulate corpus statistics into
# the notebook-level collections: `all_urls`, `domains`, `pages_per_domain`,
# `total_pages_valid` and `tokens_per_page`.
#
# Error policy:
#   * a record without a target URI is skipped (and counted), instead of
#     aborting the remainder of the file;
#   * bytes that are not valid UTF-8 are replaced rather than raising, so one
#     badly-encoded page does not discard every record that follows it;
#   * any other failure (e.g. a corrupt archive) is reported and the scan
#     moves on to the next file.
for f in files:
    # Same path construction as the file-listing cell.
    filepath = os.path.join(data_path, f)
    skipped_records = 0

    try:
        with gzip.open(filepath, 'rb') as stream:
            for record in ArchiveIterator(stream):
                url = record.rec_headers.get_header('WARC-Target-URI')

                if not url:
                    # Records such as `warcinfo` carry no WARC-Target-URI and
                    # cannot be attributed to a domain; skip just this record.
                    skipped_records += 1
                    continue

                # Read and decode the body BEFORE touching any counter, so a
                # failure here cannot leave the page counted in
                # `pages_per_domain` but missing from `tokens_per_page`.
                payload = record.content_stream().read()
                # NOTE: each undecodable byte becomes one U+FFFD character, so
                # the length of non-UTF-8 pages is an approximation.
                html = payload.decode('utf-8', errors='replace')

                extracted = tldextract.extract(url)
                domain = f"{extracted.domain}.{extracted.suffix}"

                all_urls.append(url)
                domains.add(domain)
                pages_per_domain[domain] += 1
                total_pages_valid += 1

                # NOTE: despite the variable name, this is the number of
                # characters in the decoded payload, not a token count.
                tokens_per_page.append(len(html))

    except Exception as e:
        # Best-effort scan: report the problem and keep whatever was already
        # collected from this file, then carry on with the next one.
        print(f"Error in {filepath}: {e}")

    if skipped_records:
        print(f"Skipped {skipped_records} record(s) without WARC-Target-URI in {filepath}")

## Domínios encontrados x todas as URLs

In [None]:
# Bar chart comparing the number of pages counted during the scan with the
# number of distinct domains they belong to.
bar_labels = ["Total URLs", "Domínios únicos"]
bar_heights = [total_pages_valid, len(domains)]

plt.figure(figsize=(6, 4))
plt.bar(bar_labels, bar_heights, color=["steelblue", "seagreen"])
plt.ylabel("Contagem")
plt.grid(axis="y")
plt.tight_layout()
# Render explicitly, like the other plotting cells, so the figure also appears
# when this code runs outside an inline notebook backend.
plt.show()

## Páginas por domínio

In [None]:
# Placeholder list; nothing in the visible cells fills or reads it.
curve = []

# Snapshot of the per-domain page counts as a plain list (domain names dropped).
pages_per_domain_values = [*pages_per_domain.values()]

In [None]:
# Kernel density estimate of the number of pages per domain.
plt.figure(figsize=(7, 4))

sns.kdeplot(
    pages_per_domain_values,
    bw_adjust=2,
    # (x, y): only the x axis (pages per domain) is log-scaled; the density
    # axis stays linear.
    log_scale=(True, False),
)

# Axis labels match the scales set above: the log scale is on x, not on the
# density axis.
plt.xlabel("# webpages por domínio (log)")
plt.ylabel("Densidade")
plt.tight_layout()
plt.show()

## Média de caracteres por página

In [None]:
# Histogram of page sizes (values stored in `tokens_per_page`).
# The x range is capped so a few huge pages do not flatten the histogram.
max_size_shown = 2_000_000

# `default=0` keeps this cell from raising ValueError when no page was
# collected (e.g. every input file failed to parse).
hist_upper = min(max(tokens_per_page, default=0), max_size_shown)

plt.figure(figsize=(7, 4))
plt.hist(tokens_per_page, bins=50, range=(0, hist_upper))
plt.xlabel("# tokens")
plt.ylabel("Número de páginas")
plt.tight_layout()

plt.show()

In [None]:
# Keep only pages whose size lies strictly between 0 and 500000: empty pages
# and very large outliers are excluded from the density estimate.
tokens_clean = [size for size in tokens_per_page if 0 < size < 500000]

# Density of the filtered page sizes; both axes are kept on a linear scale.
plt.figure(figsize=(7, 4))
sns.kdeplot(tokens_clean, bw_adjust=0.3, log_scale=(False, False))

plt.xlabel("# tokens")
plt.ylabel("Densidade")
plt.tight_layout()
plt.show()