# Graph Filtering Notebook

This notebook loads an SQLite database, builds a graph using **python‑igraph**, and filters the graph so that only nodes with a certain type and a minimum number of relationships are plotted. The notebook also shows how to save both the graph visualization and the graph structure.

## Setup and Configuration

Make sure to install the required libraries before running the cells:

```bash
pip install python-igraph pandas tqdm ipysigma google-genai
```

This cell sets up the configuration variables and helper functions.

In [7]:
import random
import sqlite3

import pandas as pd
from igraph import Graph, plot
from tqdm import tqdm

# -----------------------------
# Configuration Options
# -----------------------------



# -----------------------------
# Vertex initialization
# -----------------------------
vertex_dict = {}   # custom node ID (string) -> position of that node in `vertices`
vertices = []      # custom node IDs, in insertion order
vertex_attrs = []  # attribute dicts, parallel to `vertices`

def add_vertex(v_id, attr):
    """Register a vertex under ``v_id`` unless that ID has been seen before.

    Args:
        v_id: Custom node ID used as the key in ``vertex_dict``.
        attr: Dictionary of attributes stored alongside the vertex.

    A repeated ``v_id`` is ignored: the first registration (and its
    attributes) wins. Returns ``None``.
    """
    if v_id in vertex_dict:
        return
    # The next free index is simply the current number of registered vertices.
    vertex_dict[v_id] = len(vertices)
    vertices.append(v_id)
    vertex_attrs.append(attr)

# Location of the SQLite database file
db_path = 'output/social_network_anonymized.db'

## Load Vertices from the Database

This cell reads in the Profiles, Activity, and Media tables from the database and adds each row as a vertex.

In [8]:
# Read the three vertex tables first, holding the connection only for the
# duration of the queries. try/finally guarantees the connection is closed
# even when a query raises (sqlite3's own `with conn:` only manages
# transactions, it does not close the connection).
conn = sqlite3.connect(db_path)
try:
    profiles_df = pd.read_sql_query("SELECT id, name, profile_type, platform, profile_url, region FROM Profiles", conn)
    activity_df = pd.read_sql_query("SELECT id, type, timestamp, content, description, platform FROM Activity", conn)
    media_df = pd.read_sql_query("SELECT id, type, file_reference, original_url FROM Media", conn)
finally:
    conn.close()

# --- Process Profiles ---
# Vertex IDs are prefixed with the table kind so IDs from different tables cannot collide.
for _, row in tqdm(profiles_df.iterrows(), total=profiles_df.shape[0], desc="Processing Profiles"):
    v_name = f"profile_{row['id']}"
    add_vertex(v_name, {
        "label": row["name"],
        "type": "profile",
        "profile_type": row["profile_type"],
        "platform": row["platform"],
        "profile_url": row["profile_url"],
        "region": row["region"]
    })

# --- Process Activities ---
for _, row in tqdm(activity_df.iterrows(), total=activity_df.shape[0], desc="Processing Activities"):
    v_name = f"activity_{row['id']}"
    add_vertex(v_name, {
        "type": "activity",
        "activity_type": row["type"],
        "timestamp": row["timestamp"],
        "content": row["content"],
        "description": row["description"],
        "platform": row["platform"]
    })

# --- Process Media ---
for _, row in tqdm(media_df.iterrows(), total=media_df.shape[0], desc="Processing Media"):
    v_name = f"media_{row['id']}"
    add_vertex(v_name, {
        "type": "media",
        "media_type": row["type"],
        "file_reference": row["file_reference"],
        "original_url": row["original_url"]
    })

Processing Profiles: 100%|██████████| 25461/25461 [00:02<00:00, 11040.52it/s]
Processing Activities: 100%|██████████| 102738/102738 [00:09<00:00, 10764.18it/s]
Processing Media: 100%|██████████| 27037/27037 [00:02<00:00, 12644.38it/s]


## Load Edges from the Database

This cell reads the relationship tables from the database and builds the edge list with attributes.

In [9]:
edges = []      # List of tuples (source_index, target_index)
edge_attrs = [] # List of dictionaries for edge attributes, parallel to `edges`

# Read the three relationship tables up front. try/finally guarantees the
# connection is closed even when a query raises (sqlite3's own `with conn:`
# only manages transactions, it does not close the connection).
conn = sqlite3.connect(db_path)
try:
    profile_conn_df = pd.read_sql_query("SELECT source_id, target_id, connection_type FROM ProfileConnection", conn)
    profile_activity_df = pd.read_sql_query("SELECT profile_id, activity_id, relationship_type FROM ProfileActivity", conn)
    activity_media_df = pd.read_sql_query("SELECT activity_id, media_id, relationship_type FROM ActivityMedia", conn)
finally:
    conn.close()

# In every loop below, a row is skipped when either endpoint is not a known
# vertex, so `edges` only ever contains valid vertex indices.

# --- ProfileConnection edges (Profile-to-Profile) ---
for _, row in tqdm(profile_conn_df.iterrows(), total=profile_conn_df.shape[0], desc="Processing Profile Connections"):
    source = f"profile_{row['source_id']}"
    target = f"profile_{row['target_id']}"
    if source in vertex_dict and target in vertex_dict:
        edges.append((vertex_dict[source], vertex_dict[target]))
        edge_attrs.append({"type": row["connection_type"]})

# --- ProfileActivity edges (Profile-to-Activity) ---
for _, row in tqdm(profile_activity_df.iterrows(), total=profile_activity_df.shape[0], desc="Processing Profile Activities"):
    source = f"profile_{row['profile_id']}"
    target = f"activity_{row['activity_id']}"
    if source in vertex_dict and target in vertex_dict:
        edges.append((vertex_dict[source], vertex_dict[target]))
        edge_attrs.append({"type": row["relationship_type"]})

# --- ActivityMedia edges (Activity-to-Media) ---
for _, row in tqdm(activity_media_df.iterrows(), total=activity_media_df.shape[0], desc="Processing Activity Media"):
    source = f"activity_{row['activity_id']}"
    target = f"media_{row['media_id']}"
    if source in vertex_dict and target in vertex_dict:
        edges.append((vertex_dict[source], vertex_dict[target]))
        edge_attrs.append({"type": row["relationship_type"]})

Processing Profile Connections: 100%|██████████| 43380/43380 [00:03<00:00, 12385.85it/s]
Processing Profile Activities: 100%|██████████| 142865/142865 [00:10<00:00, 13318.80it/s]
Processing Activity Media: 100%|██████████| 48022/48022 [00:04<00:00, 11357.19it/s]


## Build the Graph with igraph

This cell creates the igraph graph by adding vertices, setting attributes, and then adding the edges.

In [10]:
# Start from an empty graph and allocate one vertex per collected node ID
g = Graph()
g.add_vertices(len(vertices))
g.vs["name"] = vertices  # vertex i carries the custom ID stored at vertices[i]

# Vertex attributes: gather every key used by any vertex, then assign one
# full-length column per key (vertices lacking that key get None).
all_vertex_keys = {key for attrs in vertex_attrs for key in attrs}
for key in all_vertex_keys:
    g.vs[key] = [attrs.get(key) for attrs in vertex_attrs]

g.add_edges(edges)

# Edge attributes: same column-wise approach as for the vertices
all_edge_keys = {key for attrs in edge_attrs for key in attrs}
for key in all_edge_keys:
    g.es[key] = [attrs.get(key) for attrs in edge_attrs]

In [19]:
# Interactive preview of the full, unfiltered graph `g` using ipysigma's Sigma widget.
from ipysigma import Sigma
# Bare last expression, so the notebook shows the widget as this cell's output.
Sigma(g)

Sigma(ig.Graph with 155,236 nodes and 213,769 edges)

## Filter the Graph

This cell filters the graph in two steps:

1. **Node-Type Filtering:** Keep only vertices whose `type` attribute is in `selected_node_types`.
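2. **Relationship Filtering:** Further keep only those nodes that have at least `min_relationships` edges (i.e. a minimum degree). The degree is counted inside the type-filtered subgraph, so edges to nodes of an excluded type do not count.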

An induced subgraph is created with the selected vertices.

In [14]:
# Node types to keep (choose from "profile", "activity", "media")
selected_node_types = {"profile"}  # add "activity" and/or "media" to widen the selection

# Minimum number of relationships (edges) a node needs in order to be kept
min_relationships = 5

# Step 1: keep only the vertices whose "type" attribute is one of the selected types
selected_indices = [
    idx for idx, node_type in enumerate(g.vs["type"])
    if node_type in selected_node_types
]
subgraph = g.induced_subgraph(selected_indices)

# Step 2: keep only vertices whose degree *within the type-filtered subgraph*
# reaches the threshold; degree() without arguments returns one value per vertex.
final_indices = [
    idx for idx, deg in enumerate(subgraph.degree())
    if deg >= min_relationships
]
final_subgraph = subgraph.induced_subgraph(final_indices)

## Layout, Plot, and Save the Graph

This cell computes a layout using the Fruchterman-Reingold algorithm, plots the graph with vertex colors based on type, and saves the visualization as an image. The same cell then writes the graph structure to a GraphML file.

In [15]:
# The Fruchterman-Reingold layout starts from random positions. python-igraph
# draws its random numbers from Python's `random` module by default, so seeding
# it here makes the layout (and the saved image) reproducible across runs.
LAYOUT_SEED = 42
random.seed(LAYOUT_SEED)

# Compute the layout
layout = final_subgraph.layout("fr")

# Color vertices by their "type" attribute; unknown types fall back to grey
color_map = {"profile": "skyblue", "activity": "lightgreen", "media": "lightcoral"}
vertex_colors = [color_map.get(node_type, "grey") for node_type in final_subgraph.vs["type"]]

# Plot and save the graph image
plot(
    final_subgraph,
    layout=layout,
    vertex_color=vertex_colors,
    vertex_label=final_subgraph.vs["name"],
    margin=40,
    bbox=(8000, 8000),
    target="filtered_social_network.png"  # Change or comment out this line to only display the plot
)

# Save the graph structure to a GraphML file (comment out if not needed)
final_subgraph.write_graphml("filtered_social_network.graphml")

In [18]:
from google import genai
from google.genai import types
import base64

def generate(
    prompt="""I am looking to buy a cider""",
    project="electricwin25lon-513",
    location="us-central1",
    model="gemini-2.0-pro-exp-02-05",
):
    """Ask the model whether ``prompt`` relates to animal trafficking and print the reply.

    The reply is streamed and each text chunk is printed as it arrives, without
    a trailing newline. Calling ``generate()`` with no arguments uses the
    original hard-coded prompt, project, location and model.

    Args:
        prompt: User text to classify.
        project: Google Cloud project ID passed to the Vertex AI client.
        location: Region passed to the Vertex AI client.
        model: Name of the model to query.

    Returns:
        None. The model output is written to stdout.
    """
    client = genai.Client(
        vertexai=True,
        project=project,
        location=location,
    )

    si_text1 = """Answer the users prompt with just True or False depending on if you believe the text related to animal trafficking"""

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        )
    ]

    # Every listed harm category gets its threshold set to "OFF".
    safety_settings = [
        types.SafetySetting(category=category, threshold="OFF")
        for category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ]

    # temperature=0 together with a fixed seed asks for repeatable answers.
    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        top_p=1,
        seed=0,
        max_output_tokens=512,
        response_modalities=["TEXT"],
        safety_settings=safety_settings,
        system_instruction=[types.Part.from_text(text=si_text1)],
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # A streamed chunk may carry no text; skip it rather than print "None".
        if chunk.text:
            print(chunk.text, end="")

generate()



False
