# Setup

## Import libraries

In [None]:
import os
import subprocess
import glob
import json

## Create folder and clone Git repo

In [None]:
# Where the PoliGraph setup repository lives and where the tool is installed
repo_url = "https://github.com/lukeblevins/PoliGraph-Setup.git"
folder_path = "./Poligraph_Tool"

# First run only: build the tool folder together with the clone target.
# makedirs creates missing parents, so one call covers both levels.
if not os.path.exists(folder_path):
    os.makedirs(f"{folder_path}/PoliGraph-Setup")


def clone_repository(repo_url, clone_path):
    """Clone ``repo_url`` into ``clone_path`` using ``git clone``.

    Failures are reported on stdout instead of being raised, so the caller
    can keep going (for example when the repository was cloned earlier).

    Args:
        repo_url: URL or local path of the repository to clone.
        clone_path: Destination directory passed to ``git clone``.

    Returns:
        True when ``git clone`` exited successfully, False otherwise.
    """
    try:
        result = subprocess.run(
            ["git", "clone", repo_url, clone_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # git ran but exited non-zero (bad URL, non-empty destination, ...)
        print("Error cloning repository:")
        print(e.stderr)
        return False
    except OSError as e:
        # git could not be started at all (not installed / not executable)
        print("Error cloning repository:")
        print(f"Could not run git: {e}")
        return False

    print("Repository cloned successfully:")
    print(result.stdout)
    return True


# Clone the setup repository into the PoliGraph-Setup sub-folder of the tool folder
clone_repository(repo_url, f"{folder_path}/PoliGraph-Setup")

## Install dependencies for repository

In [None]:
import sys

cache_dir = "./Poligraph_Tool/cache"
requirements_file = "./Poligraph_Tool/PoliGraph-Setup/requirements.txt"

# exist_ok makes the cell safely re-runnable without a separate existence check
os.makedirs(cache_dir, exist_ok=True)

# Run pip through the interpreter executing this notebook. A bare "pip" found
# on PATH may belong to a different Python, in which case the packages would
# not be importable in the cells below.
pip_install = [sys.executable, "-m", "pip", "install", "--cache-dir", cache_dir]

# Dependencies pinned by the cloned repository
subprocess.run(pip_install + ["-r", requirements_file], check=True)

# Extra packages used directly by this notebook
subprocess.run(
    pip_install
    + [
        "gdown",
        "pyyaml",
        "networkx",
        "pandas",
        "install-playwright",
        "matplotlib",
    ],
    check=True,
)

## Import installed libraries

In [None]:
import gdown
import yaml
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

## Download the model file from researchers' Google Drive

In [None]:
url = "https://drive.google.com/uc?id=1qHifRx93EfTkg2x1e2W_lgQAgk7HcXhP"
output = "./Poligraph_Tool/cache/poligrapher-extra-data.tar.gz"

# Only download when the archive is not already present in the cache folder
if not os.path.exists(output):
    print(f"Downloading file from {url}")
    gdown.download(url, output, quiet=False)
    print(f"File downloaded to: {output}")
else:
    print(f"Using cached file: {output}")

## Download spaCy model

In [None]:
import spacy

# Fetch the `en_core_web_trf` model through spaCy's own download command
print("Downloading spaCy model...")
spacy.cli.download("en_core_web_trf")
print("SpaCy model downloaded.")

## Unzip and move model file to correct folder

In [None]:
import tarfile

# Change the current working directory
os.chdir("./Poligraph_Tool/")

# Extract the tar.gz file
with tarfile.open("./cache/poligrapher-extra-data.tar.gz", "r:gz") as tar:
    # The archive handle is already open, so changing directory here only
    # affects the extraction target below. The working directory is left
    # inside the cloned repository on purpose: the following cells use paths
    # relative to it.
    os.chdir("./PoliGraph-Setup/")

    # The archive comes from a download, so use the "data" extraction filter
    # where the running Python provides it; it rejects members that would be
    # written outside the destination. Older interpreters lack the `filter`
    # argument, hence the feature check.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    tar.extractall(path="./poligrapher/extra-data", **extract_kwargs)

# The archive is kept in the cache folder so later runs can reuse it.
# To remove it instead:
# os.remove("../cache/poligrapher-extra-data.tar.gz")

## Install tool as python package

In [None]:
import sys

# Install the package in the current directory in editable mode. pip is run
# through the interpreter executing this notebook, so the package lands in
# the same environment the notebook imports from (a bare "pip" on PATH may
# belong to a different Python).
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--cache-dir",
        "../cache/",
        "--editable",
        ".",
    ],
    check=True,
)

## Install browsers so that playwright can scrape web pages

In [None]:
from install_playwright import install
from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        install(p.chromium)


# Run the async function
await main()

# Convert privacy policy to knowledge graph

## Get policy documents from `policy_list.json` file and generate their knowledge graphs

In [None]:
import sys


def _run_poligrapher_script(script, *args, check=True):
    """Run ``poligrapher.scripts.<script>`` as a module in a child process.

    The child is started with ``sys.executable`` so it uses the same
    interpreter (and installed packages) as this notebook; a bare "python"
    on PATH may be a different installation or may not exist.

    Args:
        script: Module name inside ``poligrapher.scripts``.
        *args: Command-line arguments passed to the script.
        check: When True, a non-zero exit status raises
            ``subprocess.CalledProcessError``; when False it is ignored.

    Returns:
        The ``subprocess.CompletedProcess`` of the run.
    """
    return subprocess.run(
        [sys.executable, "-m", f"poligrapher.scripts.{script}", *args],
        check=check,
    )


# Open the policy urls file
with open("../../policy_list.json", "r", encoding="utf-8") as file:
    policy_urls = json.load(file)["policy_urls"]

output_folder_prefix = "../../output/"

for policy in policy_urls:
    policy_name = policy["name"]
    policy_url = policy["path"]
    policy_kind = policy["kind"]

    # One output folder per policy, named after the policy (spaces -> "_")
    output_folder = output_folder_prefix + policy_name.replace(" ", "_")
    os.makedirs(output_folder, exist_ok=True)

    print(f"Generating graph for {policy_name} from {policy_url}")
    try:
        if policy_kind == "pdf":
            # Run the pdf parser
            _run_poligrapher_script("pdf_parser", policy_url, output_folder)
            # NOTE(review): assumes the pdf parser writes `output.html` into
            # the output folder -- confirm against the parser script.
            html_path = os.path.join(output_folder, "output.html")
            # Run the html crawler on the converted document (best effort)
            _run_poligrapher_script(
                "html_crawler", html_path, output_folder, check=False
            )
        else:
            # NOTE(review): the crawler is run twice -- a first best-effort
            # attempt whose failure is ignored, then a second run that must
            # succeed. Presumably a retry; confirm before removing either.
            _run_poligrapher_script(
                "html_crawler", policy_url, output_folder, check=False
            )
            _run_poligrapher_script("html_crawler", policy_url, output_folder)

        # Failure of this step is ignored (check=False)
        _run_poligrapher_script("init_document", output_folder, check=False)
        # Run the annotators
        _run_poligrapher_script("run_annotators", output_folder)
        # Command to create the graph generates a .yaml file with all the data
        _run_poligrapher_script("build_graph", output_folder)
        # Command to create the graph generates a .graphml file.
        _run_poligrapher_script("build_graph", "--pretty", output_folder)
        print(f"Graphs for {policy_url} have been generated")
    except subprocess.CalledProcessError as e:
        print(f"Error generating graphs for {policy_url}")
        # The child's stderr is not captured (it streams straight to the
        # notebook output), so `e.stderr` would always be None. Print the
        # exception itself, which names the failed command and exit status.
        print(e)

# View output

If you just ran the basic command to generate a graph, then `graph-original.full.yml` and `graph-original.yml` are the final output.

For the pretty graph, the output is a `graph-original.graphml` file.

In [None]:
# Change the current working directory to the output folder, using a path
# relative to the current working directory; the cells below walk it with
# paths relative to ".".
os.chdir("../../output/")
# Recursively list the output folder; check=True raises if `ls` exits non-zero.
subprocess.run(["ls", "-R"], check=True)

## Visualize the `graph-original.full.yml` file

### Knowledge Graph

In [None]:
def needs_graphml_visual(folder):
    """Return True when ``folder`` still needs a ``knowledge_graph.png``.

    That is the case when the folder contains at least one ``.yml`` file and
    no ``knowledge_graph.png`` yet. Folders without any ``.yml`` file never
    need one.
    """
    yml_matches = glob.glob(os.path.join(folder, "*.yml"))
    if not yml_matches:
        return False
    png_exists = os.path.exists(f"{folder}/knowledge_graph.png")
    return not png_exists


# Collect the full graph file of every output sub-folder that still needs a PNG
graph_files = []
for root, dirs, _files in os.walk("."):
    for dir_name in dirs:  # named `dir_name` so the builtin `dir` is not shadowed
        full_dir_path = os.path.join(root, dir_name)
        if needs_graphml_visual(full_dir_path):
            yml_file = os.path.join(full_dir_path, "graph-original.full.yml")
            if os.path.exists(yml_file):
                graph_files.append(yml_file)

for graph_file in graph_files:
    parent_folder = os.path.dirname(graph_file)
    output_png = os.path.join(parent_folder, "knowledge_graph.png")
    print(f"Converting {graph_file} to PNG")

    # Explicit UTF-8 so parsing does not depend on the platform's default encoding
    with open(graph_file, "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)

    # An empty YAML document loads as None; there is nothing to draw then
    if not data:
        print(f"Skipping {graph_file}: no graph data found")
        continue

    # NOTE(review): a DiGraph holds a single edge per (source, target) pair,
    # so when several links share both endpoints only the last label is kept.
    G = nx.DiGraph()
    # nodes
    for node in data.get("nodes", []):
        G.add_node(node["id"], type=node["type"])
    # edges
    for link in data.get("links", []):
        G.add_edge(link["source"], link["target"], label=link["key"])

    plt.figure(figsize=(20, 15), facecolor="white")
    pos = nx.spring_layout(G, k=0.5)
    nx.draw(
        G,
        pos,
        with_labels=True,
        node_size=3000,
        node_color="lightblue",
        edge_color="gray",
    )
    edge_labels = nx.get_edge_attributes(G, "label")
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.title("Knowledge Graph - " + parent_folder)
    plt.savefig(output_png, facecolor="white")

### Table of Relations

In [None]:
def needs_csv_extract(folder):
    """Return True when ``folder`` still needs a ``complete_extracted_data.csv``.

    That is the case when the folder contains at least one ``.yml`` file and
    no ``complete_extracted_data.csv`` yet. Folders without any ``.yml`` file
    never need one.
    """
    yml_matches = glob.glob(os.path.join(folder, "*.yml"))
    if not yml_matches:
        return False
    csv_exists = os.path.exists(f"{folder}/complete_extracted_data.csv")
    return not csv_exists


def load_yml(file_path):
    """Load a YAML file and return its parsed content.

    Problems are reported on stdout rather than raised.

    Args:
        file_path: Path of the YAML file to read (opened as UTF-8).

    Returns:
        The parsed document, or None when the file does not exist or could
        not be read or parsed.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            return yaml.safe_load(handle)
    except Exception as err:
        print(f"Error loading YAML: {err}")
    return None


def extract_yml_relationships(yaml_data):
    """Flatten the ``links`` of a parsed graph into relationship tuples.

    Args:
        yaml_data: Parsed YAML document. Each entry of its ``links`` list is
            read for the keys ``source``, ``target``, ``key``, ``text`` and
            ``purposes``. May be None or lack ``links``.

    Returns:
        A list of ``(source, relation, target, text, purposes)`` tuples.
        ``text`` joins the link's text entries with " | ". ``purposes`` is
        formatted as "name: a, b | other: c", or the string "None" when the
        link has no purposes. The list is empty when there are no links.
    """
    # Read the argument itself, not a same-looking module-level variable
    if not yaml_data or "links" not in yaml_data:
        return []

    relationships = []
    for link in yaml_data["links"] or []:
        source = link.get("source", "Unknown Source")
        target = link.get("target", "Unknown Target")
        relation = link.get("key", "Unknown Relationship")
        # combine policy excerpts (references); tolerate an explicit null
        text = " | ".join(link.get("text") or [])
        purpose_map = link.get("purposes")
        if purpose_map:
            purposes = " | ".join(
                f"{name}: {', '.join(excerpts)}"
                for name, excerpts in purpose_map.items()
            )
        else:
            purposes = "None"
        relationships.append((source, relation, target, text, purposes))
    return relationships


# Name of the graph file expected inside each policy output folder
yml_path = "graph-original.full.yml"

# Gather the graph file of every sub-folder that still lacks a CSV export
graph_files = []
for root, subdirs, _files in os.walk("."):
    for subdir in subdirs:
        candidate_dir = os.path.join(root, subdir)
        if not needs_csv_extract(candidate_dir):
            continue
        candidate_file = os.path.join(candidate_dir, yml_path)
        if os.path.exists(candidate_file):
            graph_files.append(candidate_file)

for graph_file in graph_files:
    print(f"\nExtracting relationships from '{graph_file}'")
    parent_folder = os.path.dirname(graph_file)
    output_csv_path = os.path.join(parent_folder, "complete_extracted_data.csv")

    # Parse the graph file; the result is None when loading failed
    yml_data = load_yml(graph_file)

    # A graph that could not be loaded (or is empty) yields an empty table
    if yml_data:
        yml_relationships = extract_yml_relationships(yml_data)
    else:
        yml_relationships = []

    # One row per relationship
    df_combined = pd.DataFrame(
        yml_relationships,
        columns=["Entity", "Relation", "Target Entity", "Policy Text", "Purposes"],
    )

    # Write the table next to the graph file, without the index column
    df_combined.to_csv(output_csv_path, index=False)
    print(f"\nSaved extracted data to '{output_csv_path}'")