# Setup

## Import libraries

In [None]:
import os
import subprocess
import glob
import json

## Change directory to root of repository

In [None]:
# Work from the repository root: the later cells use paths relative to it
# (e.g. ./poligrapher/cache, ./notebooks/policy_list.json, ./output).
os.chdir("../../PoliGraph-Setup/")

## Install dependencies for repository

Make sure you create the conda environment:

```sh
conda env create -f ./environment.yml
```

## Create the cache directory

In [None]:
# Folder where downloaded artefacts (e.g. the model archive) are cached.
cache_dir = "./poligrapher/cache"

# exist_ok=True makes this safe to re-run and avoids the check-then-create
# race; it also raises if the path exists but is not a directory instead of
# silently continuing.
os.makedirs(cache_dir, exist_ok=True)

## Import installed packages

In [None]:
import gdown
import yaml
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

## Download the model file from researchers' Google Drive

In [None]:
# Google Drive location of the archive and the local path it is cached at.
url = "https://drive.google.com/uc?id=1qHifRx93EfTkg2x1e2W_lgQAgk7HcXhP"
output = "./poligrapher/cache/poligrapher-extra-data.tar.gz"

# Only download when the archive is not already present locally.
if not os.path.exists(output):
    print(f"Downloading file from {url}")
    gdown.download(url, output, quiet=False)
    print(f"File downloaded to: {output}")
else:
    print(f"Using cached file: {output}")

## Download spaCy model

In [1]:
import spacy

# Fetch the "en_core_web_md" pipeline through spaCy's own downloader.
print("Downloading spaCy model...")
spacy.cli.download("en_core_web_md")
print("SpaCy model downloaded.")

Downloading spaCy model...
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
SpaCy model downloaded.


## Unzip and move model file to correct folder

In [None]:
import tarfile

# Extract the tar.gz file
archive_path = "./poligrapher/cache/poligrapher-extra-data.tar.gz"
extract_dir = "./poligrapher/extra-data"

with tarfile.open(archive_path, "r:gz") as tar:
    # The archive comes from the network, so prefer the "data" extraction
    # filter, which refuses members that would be written outside
    # extract_dir. Interpreters without extraction-filter support keep the
    # previous unfiltered behaviour.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=extract_dir, filter="data")
    else:
        tar.extractall(path=extract_dir)

## Install tool as python package

In [None]:
# Install the current directory as an editable package via `conda run pip`;
# check=True raises CalledProcessError if the command exits non-zero.
install_cmd = ["conda", "run", "pip", "install", "--editable", "."]
subprocess.run(install_cmd, check=True)

## Install browsers so that playwright can scrape web pages

```sh
playwright install firefox
playwright install chromium
playwright install msedge
```

# Convert privacy policy to knowledge graph

## Create `output/` folder if not present

In [None]:
# Root folder that receives one sub-folder per processed policy.
folder_path = "./output"

# exist_ok=True keeps the cell re-runnable without a separate existence
# check, and raises if the path exists but is not a directory.
os.makedirs(folder_path, exist_ok=True)

## Import poligrapher scripts

In [None]:
from poligrapher.scripts import (
    build_graph,
    html_crawler,
    init_document,
    pdf_parser,
    run_annotators,
)

## Get policy documents from `policy_list.json` file and generate their knowledge graphs

In [None]:
from requests import RequestException


async def generate_graph_from_html(html_path, output_folder):
    """
    Run the whole graph pipeline for a single policy.

    The HTML crawler is awaited first with ``html_path`` and
    ``output_folder``; the remaining stages are then called synchronously
    with ``output_folder`` as their only work directory.
    """
    # Stage 1: crawl the page into the output folder.
    await html_crawler.main(html_path, output_folder)

    # Stages 2-4: initialise the document, annotate it, and build the
    # default graph (the YAML output). A fresh one-element list is passed to
    # every stage.
    for stage in (init_document, run_annotators, build_graph):
        stage.main(workdirs=[output_folder])

    # Stage 5: build the graph again in "pretty" mode (the .graphml output).
    build_graph.main(pretty=True, workdirs=[output_folder])


def needs_graph_generation(output_folder):
    """
    Return True when ``output_folder`` holds no ``*.graphml`` file yet.
    """
    graphml_pattern = os.path.join(output_folder, "*.graphml")
    # An empty match list means no graph has been generated for this folder.
    return not glob.glob(graphml_pattern)


# Open the policy urls file
with open("./notebooks/policy_list.json", "r") as file:
    policy_urls = json.load(file)["policy_urls"]

output_folder_prefix = "./output/"

for policy in policy_urls:
    policy_name = policy["name"]
    policy_url = policy["path"]
    policy_kind = policy["kind"]

    # get domain name from url for folder name
    output_folder = output_folder_prefix + policy_name.replace(" ", "_")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if not needs_graph_generation(output_folder):
        print(f"Graph already exists for {policy_name}, skipping generation.")
        continue
    else:
        print(f"Generating graph for {policy_name} from {policy_url}")

    if policy_kind == "pdf":
        try:
            # Run the pdf parser
            await pdf_parser.main(policy_url, output_folder)
            html_path = os.path.join(output_folder, "output.html")
            await generate_graph_from_html(html_path, output_folder)
            print(f"Graphs for {policy_url} have been generated using PDF parser")
        except Exception as e:
            print(f"Error generating graphs for {policy_url}")
            print(e)
    elif policy_kind == "webpage":
        try:
            await generate_graph_from_html(policy_url, output_folder)
            print(f"Graphs for {policy_url} have been generated using webpage parser")
        except Exception as e:
            print(f"Error generating graphs for {policy_url}")
            print(e)
    elif policy_kind == "auto":
        try:
            await generate_graph_from_html(policy_url, output_folder)
            print(f"Graphs for {policy_url} have been generated using webpage parser")
        except RequestException as ex:
            print(f"Error generating graphs for {policy_url}")
            print(ex)
        except BaseException as e:
            try:
                # Fallback to the pdf parser method
                print(f"Falling back to PDF parser for {policy_url}")
                await pdf_parser.main(policy_url, output_folder)
                html_path = os.path.join(output_folder, "output.html")
                await generate_graph_from_html(html_path, output_folder)
                print(f"Graphs for {policy_url} have been generated using PDF parser")
            except BaseException as e:
                print(f"Error generating graphs for {policy_url}")
                print(e)
    else:
        print(f"Unknown policy kind for {policy_name}: {policy_kind}")
        continue

# View output

If you just ran the basic command to generate a graph, then `graph-original.full.yml` and `graph-original.yml` are the final output.

For the pretty graph, the output is a `graph-original.graphml` file.

In [None]:
# Recursively list everything under ./output; check=True raises if `ls` fails.
listing_cmd = ["ls", "-R", "./output"]
subprocess.run(listing_cmd, check=True)

## Visualize the `graph-original.full.yml` file

### Knowledge Graph

In [None]:
# The directory walks below start at ".", so make ./output the working
# directory first.
os.chdir("./output")


def needs_graphml_visual(folder):
    """
    Return True when ``folder`` contains at least one ``*.yml`` file but no
    ``knowledge_graph.png`` yet; otherwise return False.
    """
    yml_matches = glob.glob(os.path.join(folder, "*.yml"))
    if not yml_matches:
        # No graph output in this folder, so there is nothing to draw.
        return False
    return not os.path.exists(f"{folder}/knowledge_graph.png")


# loop through the output folder and get the graph files
graph_files = []
for root, dirs, files in os.walk("."):
    # `dir_name` rather than `dir`, so the builtin is not shadowed.
    for dir_name in dirs:
        full_dir_path = os.path.join(root, dir_name)
        if needs_graphml_visual(full_dir_path):
            yml_file = os.path.join(full_dir_path, "graph-original.full.yml")
            if os.path.exists(yml_file) and yml_file not in graph_files:
                graph_files.append(yml_file)

for graph_file in graph_files:
    parent_folder = os.path.dirname(graph_file)
    output_png = os.path.join(parent_folder, "knowledge_graph.png")
    print(f"Converting {graph_file} to PNG")

    # Read as UTF-8 explicitly (as load_yml does) instead of relying on the
    # platform default encoding.
    with open(graph_file, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)

    # Rebuild the graph from the "nodes" and "links" lists of the YAML file.
    G = nx.DiGraph()
    # nodes
    for node in data.get("nodes", []):
        G.add_node(node["id"], type=node["type"])
    # edges, labelled with the link's "key"
    for link in data.get("links", []):
        G.add_edge(link["source"], link["target"], label=link["key"])

    plt.figure(figsize=(20, 15), facecolor="white")
    pos = nx.spring_layout(G, k=0.5)
    nx.draw(
        G,
        pos,
        with_labels=True,
        node_size=3000,
        node_color="lightblue",
        edge_color="gray",
    )
    edge_labels = nx.get_edge_attributes(G, "label")
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.title("Knowledge Graph - " + parent_folder)
    plt.savefig(output_png, facecolor="white")
    # Close the figure so figures do not accumulate across iterations.
    plt.close()

### Table of Relations

In [None]:
def needs_csv_extract(folder):
    """
    Return True when ``folder`` contains at least one ``*.yml`` file but no
    ``complete_extracted_data.csv`` yet; otherwise return False.
    """
    yml_matches = glob.glob(os.path.join(folder, "*.yml"))
    if not yml_matches:
        # No graph output in this folder, so there is nothing to extract.
        return False
    return not os.path.exists(f"{folder}/complete_extracted_data.csv")


# load yml file
def load_yml(file_path):
    """
    Parse the YAML file at ``file_path`` and return the loaded data.

    Returns None, after printing a message, when the file does not exist or
    when reading or parsing it raises.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle)
    except Exception as err:
        # Best effort: report the problem and let the caller handle None.
        print(f"Error loading YAML: {err}")
        return None


# get relationships from yml file
def extract_yml_relationships(yaml_data):
    """
    Flatten the ``links`` of a loaded graph into relationship tuples.

    Args:
        yaml_data: Mapping loaded from a graph YAML file, or None. Only its
            ``"links"`` entry is read; each link is a mapping that may have
            ``source``, ``target``, ``key``, ``text`` (list of strings) and
            ``purposes`` (mapping of name -> list of strings).

    Returns:
        A list of ``(source, relation, target, text, purposes)`` tuples, one
        per link. ``text`` entries are joined with " | "; ``purposes`` is
        rendered as "name: a, b | other: c", or "None" when absent or empty.
        Returns an empty list when there is no data or no ``"links"`` key.
    """
    # Read the argument itself. The body previously referenced the global
    # `yml_data`, so it only worked when a caller had set that global first.
    if not yaml_data or "links" not in yaml_data:
        return []

    relationships = []
    for link in yaml_data["links"]:
        source = link.get("source", "Unknown Source")
        target = link.get("target", "Unknown Target")
        relation = link.get("key", "Unknown Relationship")
        # combine policy excerpts (references); tolerate an explicit null
        text = " | ".join(link.get("text") or [])
        purposes_map = link.get("purposes")
        if purposes_map:
            purposes = " | ".join(
                f"{k}: {', '.join(v)}" for k, v in purposes_map.items()
            )
        else:
            purposes = "None"
        relationships.append((source, relation, target, text, purposes))
    return relationships


# name of the graph file expected inside each policy folder
yml_path = "graph-original.full.yml"

# walk the output folder and collect the graph files that still need a CSV
graph_files = []
for walk_root, subdirs, _ in os.walk("."):
    for subdir in subdirs:
        candidate_dir = os.path.join(walk_root, subdir)
        if not needs_csv_extract(candidate_dir):
            continue
        candidate = os.path.join(candidate_dir, yml_path)
        if candidate in graph_files or not os.path.exists(candidate):
            continue
        graph_files.append(candidate)

# column headers of the exported table, in tuple order
csv_columns = ["Entity", "Relation", "Target Entity", "Policy Text", "Purposes"]

for graph_file in graph_files:
    print(f"\nExtracting relationships from '{graph_file}'")
    output_csv_path = os.path.join(
        os.path.dirname(graph_file), "complete_extracted_data.csv"
    )

    # load the graph; load_yml returns None when the file cannot be read
    yml_data = load_yml(graph_file)

    # flatten the links into rows (no rows when nothing was loaded)
    if yml_data:
        yml_relationships = extract_yml_relationships(yml_data)
    else:
        yml_relationships = []

    # build the table and write it next to the graph file
    df_combined = pd.DataFrame(yml_relationships, columns=csv_columns)
    df_combined.to_csv(output_csv_path, index=False)
    print(f"\nSaved extracted data to '{output_csv_path}'")

In [None]:
# Leave ./output and move to the sibling "notebooks" folder, which is
# presumably where the notebook was started from (confirm against the first
# chdir cell).
os.chdir("../notebooks")