# Identifying domains with responsible editor

The objective of this notebook is to automatically identify web archive material that falls in access category 2, meaning domains that have declared responsible editorship on their front page.

Identification is rule-based, and based on two different forms of declarations:
- in text (variants of "Ansvarlig redaktør")
- in links (variants of "nored.no/redaktoeransvar" or "presse.no/Etisk-regelverk/Redaktoerplakaten")

Based on analysis with regular expressions (regex), the script calculates a simple confidence score:
- if regex is found on the front page for link, it appends a score of 0.5
- if the regex matches text on the front page, the domain is appended a score of 0.5
    - if regex does not match text on the front page, it tests a regex pattern with less authoritative terms that, combined with a link match, is still considered probable
    - if alternative text pattern is found, the domain is appended a score of 0.25
- if regex is found for neither text nor link, the confidence score is 0.0

From testing, we have not experienced any false positives. However, the script still produces some false negatives. The common reason for this is that these pages declare this in dynamic content (.js), which is not yet handled by this script.

Domains with a confidence score below 0.75 should therefore - until further notice - be checked manually.

## Import packages

In [None]:
import os
import gzip
import warcio
from warcio.archiveiterator import ArchiveIterator
import re
import glob
from urllib.parse import urlparse
import pandas as pd
import openpyxl

## Extract html front pages from WARC

In [None]:
# Front-page filter: keeps only URLs that point at the root of a host
def is_front_page(url):
    """
    Tell whether a URL addresses a front page.

    A URL counts as a front page when its path component is either empty
    or a single slash; query strings and fragments are not considered.

    Args:
        url: URL string to inspect.

    Returns:
        True if the path is '' or '/', otherwise False.
    """
    return urlparse(url).path in ('', '/')

# Function for excluding "front pages" of certain subdomains
def exclude_subdomain(url, excluded_subdomains):
    """
    Check whether any subdomain label of a URL is in an exclusion list.

    The last two labels of the host name are treated as the domain itself
    (e.g. "example.no") and every label before them as a subdomain. A
    leading "www." is ignored.

    Args:
        url: URL string to inspect.
        excluded_subdomains: Collection of subdomain labels to exclude.

    Returns:
        True if at least one subdomain label is in excluded_subdomains,
        otherwise False. URLs without a host name also return False.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        # Relative or malformed URL: there is no subdomain to test
        return False

    # Strip only a leading 'www.'; replacing it anywhere in the host would
    # glue neighbouring labels together (e.g. 'mywww.stilling.example.no')
    if hostname.startswith('www.'):
        hostname = hostname[len('www.'):]

    # Drop the last two labels (domain + top-level domain)
    subdomain_labels = hostname.split('.')[:-2]
    return any(label in excluded_subdomains for label in subdomain_labels)

# Subdomain labels whose "front pages" are skipped during extraction
excluded_subdomains = [
    'samtykke',
    'samtykker',
    'stilling',
    'stillinger',
    'personvern',
]

In [None]:
# Paths for WARC source and HTML extraction
warc_path = '../data/veidemann/filename.warc.gz'
html_folder = 'htmls/veidemann/'
os.makedirs(html_folder, exist_ok=True)

# Walk through every record in the WARC file and save the body of each
# successfully fetched HTML front page as a separate file in html_folder.
with gzip.open(warc_path, 'rb') as stream:
    for record in ArchiveIterator(stream):
        # Only response records with HTTP headers can hold a fetched page
        if record.rec_type != 'response' or not record.http_headers:
            continue

        content_type = record.http_headers.get_header('Content-Type')
        status_code = record.http_headers.get_statuscode()

        # Keep HTML documents only
        if not content_type or 'text/html' not in content_type:
            continue
        # Keep successful (2xx) responses only; guard against a missing status
        if not status_code or not status_code.startswith('2'):
            continue

        url = record.rec_headers.get_header('WARC-Target-URI')
        # Check if URL is front page or not
        if not is_front_page(url) or exclude_subdomain(url, excluded_subdomains):
            continue

        html_content = record.content_stream().read()

        # Generate unique filename for each html file with reference to WARC
        # record timestamp and (sub)domain. The host is taken from the parsed
        # URL so that a query string or fragment on a path-less URL
        # (e.g. 'http://example.no?x=1') does not end up in the file name.
        host = urlparse(url).netloc
        filename = f"{record.rec_headers.get_header('WARC-Date')}_{host}.html"
        with open(os.path.join(html_folder, filename), 'wb') as f:
            f.write(html_content)


## Identify regex in text and link

In [None]:
# Regex pattern for textual declaration of responsible editorship.
# Covers full and abbreviated forms, both with a literal "ø" and with the
# HTML entity "&oslash;". Intended to be used with re.IGNORECASE.
text_pattern = (
    r"Ansvarlig redaktør|Ansvarleg redaktør|Ansv\.?\s*redaktør"
    r"|Ansv\.?\s*red\.|Ansvarlig red\.|Ansvarleg red\."
    r"|Sjefsred\.?|Sjefsredaktør|Sjefred\.?|Sjefredaktør"
    r"|Ansvarlig redakt&oslash;r|Ansvarleg redakt&oslash;r|Ansv\.?\s*redakt&oslash;r"
    r"|Sjefsredakt&oslash;r|Sjefredakt&oslash;r"
    r"|Váldodoaimmaheaddji"
)

# Regex pattern for alternative textual declaration if no match for 'text_pattern'.
# The "&oslash;" entity form is included so that entity-encoded pages are
# treated the same way as by 'text_pattern'.
text_pattern_alternative = r"Redaktør|Konstituert redaktør|Redakt&oslash;r"

# Regex pattern for links declaring responsible editorship.
# Every literal dot in a host name is escaped so that it cannot match an
# arbitrary character.
url_pattern = (
    r"https?://(www\.)?nored\.no/Redaktoeransvar(/Redaktoerplakaten)?/?"
    r"|https?://(www\.)?nored\.no/Redaktoerplakaten(/Redaktoerplakaten)?/?"
    r"|https?://(www\.)?presse\.no/pfu/etiske-regler(/Redaktoerplakaten)?/?"
    r"|https?://(www\.)?presse\.no/Etisk-regelverk(/Redaktoerplakaten)?/?"
    r"|https?://presse\.no/etisk-regelverk/vaer-varsom-plakaten"
    r"|http://presse\.no/pfu/etiske-regler/vaer-varsom-plakaten/"
    r"|https?://(www\.)?presse\.no/pfu/etiske-regler/(redaktorplakaten)/?"
    r"|http://presse\.no/etisk-regelverk/vaer-varsom-plakaten"
    r"|https?://(no\.)?wikipedia\.org/wiki/V%C3%A6r_Varsom-plakaten"
)

# List for results: one (host, text_found, alt_text_found, link_found) tuple
# is appended per analysed HTML file
results = []

In [None]:
# Compile the patterns once; they are reused for every file
text_regex = re.compile(text_pattern, re.IGNORECASE)
alt_text_regex = re.compile(text_pattern_alternative, re.IGNORECASE)
link_regex = re.compile(url_pattern, re.IGNORECASE)

# Loop to iterate over html files
# NOTE(review): this folder differs from 'html_folder' ('htmls/veidemann/')
# used by the extraction cell - confirm that the files are expected here.
for html_file in glob.glob(os.path.join('htmls/veidemann/newWarc/', '*.html')):
    print("Processing file:", html_file)

    # Read as UTF-8 first and fall back to latin-1. latin-1 maps every byte
    # to a character, so the fallback itself cannot raise UnicodeDecodeError.
    try:
        with open(html_file, 'r', encoding='utf-8') as file:
            html_content = file.read()
    except UnicodeDecodeError:
        with open(html_file, 'r', encoding='latin-1') as file:
            html_content = file.read()

    # Extract host from file name ("<WARC-Date>_<host>.html").
    # Split on the first underscore only, so that a host containing '_'
    # is kept intact.
    file_name = os.path.basename(html_file)
    _, separator, host_part = file_name.partition('_')
    if not separator:
        print(f"Could not extract host from file name {file_name}, skipping.")
        continue  # Skip to next file
    url = 'http://' + host_part

    # Parse URL and extract host (still carries the '.html' suffix and any port)
    parsed_url = urlparse(url)
    host = parsed_url.netloc

    # Check text and link
    text_found = bool(text_regex.search(html_content))
    alt_text_found = False

    # If no match, check the alternative pattern
    if not text_found:
        alt_text_found = bool(alt_text_regex.search(html_content))

    link_found = bool(link_regex.search(html_content))

    # Append to results
    results.append((host, text_found, alt_text_found, link_found))

## Convert results to DataFrame

In [None]:
# Show every row when a DataFrame is printed
pd.set_option('display.max_rows', None)

# One row per analysed HTML file: host plus the three match flags
df = pd.DataFrame(
    data=results,
    columns=['Host', 'Text_Found', 'Alt_Text_Found', 'Link_Found'],
)
print(df)

## Report

##### Clean data

In [None]:
# Clean values in 'Host' column for port and file extension.
# The pattern is anchored to the end of the value so that only the trailing
# '.html' (with an optional ':<port>' in front of it) is removed; a host that
# itself contains '.html' (e.g. 'www.html.no.html') keeps its name.
df['Cleaned_Host'] = df['Host'].str.replace(r"(:\d+)?\.html$", "", regex=True)

##### Aggregate results and calculate confidence score

In [None]:
# Calculate percentage of files which matches either text, link or both.
total_files = len(df)
# With no analysed files every count is 0; dividing by 1 instead of 0 then
# yields 0 % rather than a division-by-zero error.
denominator = total_files or 1
text_percentage = (df['Text_Found'].sum() / denominator) * 100
alt_text_percentage = (df['Alt_Text_Found'].sum() / denominator) * 100
link_percentage = (df['Link_Found'].sum() / denominator) * 100
both_percentage = ((df['Text_Found'] & df['Link_Found']).sum() / denominator) * 100

# Confidence score for a single analysed file
def assign_confidence_score(row):
    """
    Sum the score contributions of the match flags in one result row.

    Contributions: 0.5 for 'Text_Found', 0.25 for 'Alt_Text_Found' (only if
    that key is present in the row) and 0.5 for 'Link_Found'.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the flags.

    Returns:
        The summed score; 0 when no flag is set.
    """
    text_points = 0.5 if row['Text_Found'] else 0
    has_alt_text = 'Alt_Text_Found' in row and row['Alt_Text_Found']
    alt_text_points = 0.25 if has_alt_text else 0
    link_points = 0.5 if row['Link_Found'] else 0
    return text_points + alt_text_points + link_points


# Score every analysed file individually
df['Confidence_Score'] = df.apply(assign_confidence_score, axis=1)

# Average the per-file scores for each cleaned host (rounded to 2 decimals)
grouped = (
    df.groupby('Cleaned_Host')['Confidence_Score']
    .mean()
    .round(2)
    .reset_index()
)

# Order hosts from highest to lowest confidence score
grouped_sorted = grouped.sort_values(by='Confidence_Score', ascending=False)

# Report the share of files matching each kind of declaration
print(f"Tekst funnet totalt: {text_percentage:.2f}%")
print(f"Alt. tekst funnet totalt: {alt_text_percentage:.2f}%")
print(f"Link funnet totalt: {link_percentage:.2f}%")
print(f"Tekst OG lenke funnet totalt: {both_percentage:.2f}%")

# Report hosts ordered by confidence score
print(grouped_sorted)

## Export results

In [None]:
# File path for export
# excel_path = './report/frontpages_grouped_sorted.xlsx'
# grouped_sorted.to_excel(excel_path, index=False, engine='openpyxl')

# print(f"Data exported to {excel_path}")