<div style="text-align: right;">© 2026 Moses Boudourides. All Rights Reserved.</div>

# LLMs for Qualitative and Mixed-Methods Social Network Analysis (SNA)
## Moses Boudourides

# Session 5: Computational Practice and Ethical Responsibility

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from pyvis.network import Network
import os
import json
import sys
import hashlib
import random
import re
from sklearn.datasets import fetch_20newsgroups
import IPython
from openai import OpenAI
# import google.generativeai as genai

In [2]:
# --- 1. & 2. KEY LOADING & INITIALIZATION ---

# Set a Google client option through the process environment. The original
# author's stated intent is to avoid ALTS/GCP credential errors.
# NOTE(review): the Gemini setup further down is commented out, so this
# setting may currently have no effect -- confirm before relying on it.
os.environ["GOOGLE_API_USE_MTLS"] = "never" 

def get_api_key(file_path):
    """Read an API key from a text file.

    Args:
        file_path: Path of the file that holds the key.

    Returns:
        The file content with surrounding whitespace removed and every
        double or single quote character deleted, or None when the file
        does not exist.
    """
    # Guard clause: a missing key file is reported as None, not as an error.
    if not os.path.exists(file_path):
        return None

    with open(file_path, 'r') as key_file:
        key = key_file.read().strip()

    # Drop quote characters that may have been pasted around the key.
    for quote_char in ('"', "'"):
        key = key.replace(quote_char, "")
    return key

# Load the OpenAI key from a local text file. get_api_key returns None when the
# file is absent, so oa_key may be None at this point.
oa_key = get_api_key("openai_key.txt")
# gem_key = get_api_key("gemini_key.txt")

# Initialize OpenAI
# NOTE(review): nothing checks that oa_key is set before the client is built --
# confirm how the OpenAI client behaves when api_key is None.
client_oa = OpenAI(api_key=oa_key)

# # Initialize Gemini using 'rest' transport to bypass gRPC/ALTS errors
# genai.configure(api_key=gem_key, transport='rest')

# # Dynamic Model Selection
# available_models = [m.name for m in genai.list_models() if 'generateContent' in m.supported_generation_methods]
# target_model = 'gemini-1.5-flash' if 'models/gemini-1.5-flash' in available_models else available_models[0].split('/')[-1]
# model_gemini = genai.GenerativeModel(target_model)

## Part 1: Computational Practice and Ethical Responsibility

**Goal:**  
Demonstrate an end-to-end LLM-augmented qualitative SNA workflow with explicit ethical checkpoints.

**Important Note:** This notebook is illustrative. Any real-world application requires ethics approval (where applicable), informed consent, and careful consideration of harm.

In [3]:
# --- 3. DATA & PERSISTENT QUERY STEP ---

# 2. Persistence Logic
CACHE_FILE = "llm_cache_s5.json"

# Load answers saved by earlier runs so that re-running the notebook does not
# repeat API calls that were already made.
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        cache = json.load(f)
else:
    cache = {}

def get_label(model_id, text, api_func, prompt_type="practice"):
    """Return the result of ``api_func(text)``, calling it only on a cache miss.

    Args:
        model_id: Label of the model/provider; part of the cache key.
        text: Input handed to ``api_func``; its full content is part of the key.
        api_func: Callable that takes ``text`` and returns a JSON-serialisable
            result.
        prompt_type: Label of the prompt variant; part of the cache key.

    Returns:
        The stored result when the key is already cached, otherwise the fresh
        result of ``api_func(text)`` (which is also written to ``CACHE_FILE``).
    """
    # Key on a digest of the FULL text. Keying on the first 50 characters made
    # two different texts with the same opening share one cache entry, so the
    # second text silently received the first text's answer.
    # "surrogatepass" keeps the encoding total, so unusual input cannot raise.
    text_digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest()
    cache_key = f"{model_id}_{prompt_type}_{text_digest}"

    if cache_key in cache:
        return cache[cache_key]

    # Cache Miss: Call API
    result = api_func(text)
    cache[cache_key] = result

    # Save updated cache to disk
    with open(CACHE_FILE, "w") as f:
        json.dump(cache, f)
    return result

# 3. API Execution Wrappers
def query_openai_relationships(text):
    """Ask the OpenAI chat model to list the social relationships in ``text``.

    Sends a single user message through ``client_oa`` to model "gpt-4o-mini"
    with temperature 0.7.

    Args:
        text: The passage to analyse; it is embedded at the end of the prompt.

    Returns:
        The content of the first returned choice, with surrounding whitespace
        stripped.
    """
    # Prompt assembled line by line; the user's text goes last.
    instruction = (
        "Identify social relationships described in the following text.\n"
        "Return actor pairs and a brief description of the relationship.\n"
        "\n"
        f"Text: {text}"
    )
    user_message = {"role": "user", "content": instruction}

    response = client_oa.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=0.7,
    )
    reply = response.choices[0].message.content
    return reply.strip()

# Synthetic statements about relationships; invented for the demonstration so
# that no real person's words are analysed.
texts_small = [
    "I depend on Leah for confidential advice.",
    "Marcus officially manages the team but avoids conflict.",
    "Leah and Marcus cooperate publicly but mistrust each other privately.",
    "Tina feels excluded from informal decision-making.",
]

# One row per statement, in a single "text" column.
df_small = pd.DataFrame(texts_small, columns=["text"])
df_small

Unnamed: 0,text
0,I depend on Leah for confidential advice.
1,Marcus officially manages the team but avoids ...
2,Leah and Marcus cooperate publicly but mistrus...
3,Tina feels excluded from informal decision-mak...


## Data Preprocessing and Cleaning

Real-world text data is messy. Preprocessing is a crucial first step before any analysis.

In [4]:
# Simple preprocessing
def preprocess_text(text):
    """Normalise the whitespace of ``text``.

    Leading and trailing whitespace is removed, and every run of whitespace
    inside the string is collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return re.sub(r"\s+", " ", text.strip())

# Store the cleaned version next to the original text.
df_small['cleaned_text'] = df_small['text'].apply(preprocess_text)

print("PREPROCESSING RESULTS:")
print()
# Show each statement before and after cleaning; !r makes whitespace visible.
before_after = zip(df_small['text'], df_small['cleaned_text'])
for number, (original, cleaned) in enumerate(before_after, start=1):
    print(f"Text {number}:")
    print(f"  Original: {original!r}")
    print(f"  Cleaned:  {cleaned!r}")
    print()

PREPROCESSING RESULTS:

Text 1:
  Original: 'I depend on Leah for confidential advice.'
  Cleaned:  'I depend on Leah for confidential advice.'

Text 2:
  Original: 'Marcus officially manages the team but avoids conflict.'
  Cleaned:  'Marcus officially manages the team but avoids conflict.'

Text 3:
  Original: 'Leah and Marcus cooperate publicly but mistrust each other privately.'
  Cleaned:  'Leah and Marcus cooperate publicly but mistrust each other privately.'

Text 4:
  Original: 'Tina feels excluded from informal decision-making.'
  Cleaned:  'Tina feels excluded from informal decision-making.'



## Ethical Checkpoint 1: Data Source

Before analysis, ask:
- Who produced these texts?
- Did they consent to analysis?
- Who is implicated indirectly?

In [5]:
# Checkpoint text kept as data: one entry per output line, printed in order.
checkpoint_1_lines = [
    "ETHICAL CHECKPOINT 1: DATA SOURCE",
    "=" * 60,
    "",
    "Critical Questions:",
    "1. Who produced these texts?",
    "   - In this example: Synthetic data (no real people)",
    "",
    "2. Did they consent to analysis?",
    "   - In this example: N/A (synthetic)",
    "   - In real research: Informed consent is essential",
    "",
    "3. Who is implicated indirectly?",
    "   - In this example: Fictional actors (Leah, Marcus, Tina)",
    "   - In real research: Consider all affected parties",
    "",
    "Proceeding with synthetic data for illustration purposes.",
]
for checkpoint_line in checkpoint_1_lines:
    print(checkpoint_line)

ETHICAL CHECKPOINT 1: DATA SOURCE

Critical Questions:
1. Who produced these texts?
   - In this example: Synthetic data (no real people)

2. Did they consent to analysis?
   - In this example: N/A (synthetic)
   - In real research: Informed consent is essential

3. Who is implicated indirectly?
   - In this example: Fictional actors (Leah, Marcus, Tina)
   - In real research: Consider all affected parties

Proceeding with synthetic data for illustration purposes.


## LLM-Assisted Interpretation (Provisional)

LLM outputs are suggestions, not facts. They require researcher interpretation and validation.

In [6]:
# Banner for this section of the output.
print("LLM-ASSISTED INTERPRETATION")
print("=" * 60)
print()

# Query the cached LLM wrapper once per synthetic statement and show the reply.
# NOTE(review): this reads texts_small, not df_small['cleaned_text']; for this
# data the two are identical, but confirm which one is intended.
separator = "-" * 60
for position, statement in enumerate(texts_small, start=1):
    interpretation = get_label("openai", statement, query_openai_relationships, "relationships")
    print(f"Text {position}: {statement}")
    print(f"LLM Interpretation:\n{interpretation}")
    print(separator)

LLM-ASSISTED INTERPRETATION

Text 1: I depend on Leah for confidential advice.
LLM Interpretation:
1. **Actor Pair**: "I" and "Leah"  
   **Relationship Description**: The speaker relies on Leah for confidential advice, indicating a relationship of trust and dependency.
------------------------------------------------------------
Text 2: Marcus officially manages the team but avoids conflict.
LLM Interpretation:
1. **Marcus - Team**: Marcus is the manager of the team, indicating a hierarchical relationship where he holds authority over the team members. However, he tends to avoid conflict, suggesting a possibly passive or non-confrontational approach to his leadership role.

2. **Marcus - Conflict**: Although not a direct relationship with another actor, Marcus has a relationship with conflict, which he avoids. This indicates a personal characteristic or behavior trait that impacts his management style and interactions with the team.
----------------------------------------------------

## Ethical Checkpoint 2: Hallucination Risk

Ask:
- Are relationships inferred or stated?
- Are motives attributed without evidence?

In [7]:
# Checkpoint text kept as data and emitted with a single print call.
checkpoint_2_lines = [
    "ETHICAL CHECKPOINT 2: HALLUCINATION RISK",
    "=" * 60,
    "",
    "Critical Questions:",
    "1. Are relationships inferred or stated?",
    "   - Check: Does the LLM output match the original text?",
    "   - Risk: LLM may infer relationships not explicitly stated",
    "",
    "2. Are motives attributed without evidence?",
    "   - Check: Are psychological states inferred?",
    "   - Risk: LLM may hallucinate internal states or intentions",
    "",
    "Mitigation: Retain only relationships explicitly stated in text.",
]
print("\n".join(checkpoint_2_lines))

ETHICAL CHECKPOINT 2: HALLUCINATION RISK

Critical Questions:
1. Are relationships inferred or stated?
   - Check: Does the LLM output match the original text?
   - Risk: LLM may infer relationships not explicitly stated

2. Are motives attributed without evidence?
   - Check: Are psychological states inferred?
   - Risk: LLM may hallucinate internal states or intentions

Mitigation: Retain only relationships explicitly stated in text.


## Researcher Filtering

We retain only relationships explicitly stated in text, filtering out inferred or hallucinated content.

In [8]:
# Researcher-curated edge list: (actor, actor, attributes). Every edge is
# flagged "explicit" by the researcher.
# NOTE(review): the source sentence says Marcus manages "the team", and "Group"
# does not occur in the texts -- confirm these edges count as explicitly stated.
edges = [
    ("Narrator", "Leah", {"meaning": "confidential trust", "explicit": True}),
    ("Narrator", "Marcus", {"meaning": "formal authority", "explicit": True}),
    ("Leah", "Marcus", {"meaning": "ambivalent cooperation", "explicit": True}),
    ("Tina", "Group", {"meaning": "exclusion", "explicit": True}),
]

# Undirected graph; the attribute dict of each entry becomes the edge data.
G_filtered = nx.Graph()
for actor_a, actor_b, attributes in edges:
    G_filtered.add_edge(actor_a, actor_b, **attributes)

node_names = list(G_filtered.nodes())
edge_pairs = list(G_filtered.edges())

print("FILTERED NETWORK (Explicit Relationships Only):")
print()
print(f"Nodes: {node_names}")
print(f"Edges: {edge_pairs}")
print()
# One line per edge with the meaning the researcher assigned to it.
for actor_a, actor_b, attributes in G_filtered.edges(data=True):
    print(f"  {actor_a} -- {actor_b}: {attributes['meaning']}")

FILTERED NETWORK (Explicit Relationships Only):

Nodes: ['Narrator', 'Leah', 'Marcus', 'Tina', 'Group']
Edges: [('Narrator', 'Leah'), ('Narrator', 'Marcus'), ('Leah', 'Marcus'), ('Tina', 'Group')]

  Narrator -- Leah: confidential trust
  Narrator -- Marcus: formal authority
  Leah -- Marcus: ambivalent cooperation
  Tina -- Group: exclusion


## Ethical Checkpoint 3: Representation

Ask:
- Does this visualization stigmatize?
- Could it harm participants if disclosed?

In [9]:
# Checkpoint text kept as data; each entry is printed on its own line.
checkpoint_3_lines = (
    "ETHICAL CHECKPOINT 3: REPRESENTATION",
    "=" * 60,
    "",
    "Critical Questions:",
    "1. Does this visualization stigmatize?",
    "   - Consider: How are actors positioned?",
    "   - Consider: Does the network reinforce stereotypes?",
    "",
    "2. Could it harm participants if disclosed?",
    "   - Consider: Could re-identification occur?",
    "   - Consider: Could relationships damage careers or relationships?",
    "",
    "Mitigation: Use anonymization and careful disclosure strategies.",
)
print(*checkpoint_3_lines, sep="\n")

ETHICAL CHECKPOINT 3: REPRESENTATION

Critical Questions:
1. Does this visualization stigmatize?
   - Consider: How are actors positioned?
   - Consider: Does the network reinforce stereotypes?

2. Could it harm participants if disclosed?
   - Consider: Could re-identification occur?
   - Consider: Could relationships damage careers or relationships?

Mitigation: Use anonymization and careful disclosure strategies.


## Data Anonymization and Ethical Implementation

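Protect participant identity while maintaining network structure by replacing names with hashed pseudonyms. Note that hashing alone is pseudonymization rather than full anonymization: a plain hash of a short or guessable name can be reversed by hashing candidate names, so work with real participants should combine the hash with a secret key (salt) that is stored separately from the data.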

In [10]:
# Anonymization function
def anonymize_user(username, salt="", length=8):
    """Return a short, deterministic pseudonym for ``username``.

    With the default empty ``salt`` the result is the first ``length`` hex
    characters of the SHA-256 digest of the name, exactly as before. That form
    can be reversed by hashing candidate names, so it is only suitable for
    synthetic data. Passing a secret ``salt`` switches to a keyed hash
    (HMAC-SHA256), which cannot be recomputed without the secret.

    Args:
        username: The identifier to pseudonymise (a string).
        salt: Optional secret key. Keep it out of the notebook and out of any
            shared output when working with real participants.
        length: Number of hex characters to keep. Shorter values make
            collisions between different names more likely.

    Returns:
        A lowercase hexadecimal string of ``length`` characters.
    """
    import hmac  # local import: hmac is not among the notebook's top-level imports

    name_bytes = username.encode("utf-8")
    if salt:
        digest = hmac.new(salt.encode("utf-8"), name_bytes, hashlib.sha256).hexdigest()
    else:
        # Unsalted path kept so existing pseudonyms stay unchanged.
        digest = hashlib.sha256(name_bytes).hexdigest()
    return digest[:length]

# Invented posts standing in for raw data: author, message and date.
raw_data = [
    {"user": "Leah", "text": "I provide confidential advice.", "timestamp": "2023-10-01"},
    {"user": "Marcus", "text": "I manage the team formally.", "timestamp": "2023-10-02"},
    {"user": "Tina", "text": "I feel excluded from decisions.", "timestamp": "2023-10-03"},
]

df_raw = pd.DataFrame(raw_data)

# Derived columns: lower-cased text and a hashed pseudonym for each author.
df_raw['clean_text'] = df_raw['text'].str.lower()
df_raw['anon_user'] = df_raw['user'].apply(anonymize_user)

# The name-to-pseudonym table is shown here for teaching purposes only.
user_mapping = df_raw[['user', 'anon_user']]

print("ANONYMIZATION:", "", "Original User Mapping:", sep="\n")
print(user_mapping)
print("", "Anonymization preserves network structure while protecting identity.", sep="\n")

ANONYMIZATION:

Original User Mapping:
     user anon_user
0    Leah  1373cf56
1  Marcus  245536ac
2    Tina  0fd1eb2e

Anonymization preserves network structure while protecting identity.


## Validation and Inter-Rater Reliability

It is essential to validate LLM output against human coding to ensure accuracy.

In [11]:
# Simulated LLM output vs. Human coder output
llm_output = [("Leah", "Narrator"), ("Marcus", "Narrator"), ("Leah", "Marcus"), ("Tina", "Group")]
human_coder_output = [("Leah", "Narrator"), ("Marcus", "Narrator"), ("Leah", "Marcus"), ("Tina", "Group")]


def pair_agreement(coded_a, coded_b):
    """Count the actor pairs two codings have in common.

    The codings are compared as sets, so the order in which pairs are listed
    and repeated pairs do not affect the result. A pair must match exactly as
    written: ("A", "B") and ("B", "A") are treated as different pairs.

    Args:
        coded_a: Iterable of (actor, actor) tuples from the first coder.
        coded_b: Iterable of (actor, actor) tuples from the second coder.

    Returns:
        A tuple ``(shared, total)``: the number of pairs present in both
        codings and the number of distinct pairs present in either.
    """
    pairs_a, pairs_b = set(coded_a), set(coded_b)
    return len(pairs_a & pairs_b), len(pairs_a | pairs_b)


# Calculate agreement. A position-by-position comparison would depend on list
# order and would ignore extra pairs when the lists differ in length.
agreement, total_pairs = pair_agreement(llm_output, human_coder_output)
# Share of all distinct pairs that both coders found; 0.0 when nothing was coded.
accuracy = agreement / total_pairs if total_pairs else 0.0

print("VALIDATION AND INTER-RATER RELIABILITY")
print("="*60)
print()
print(f"LLM Output:        {llm_output}")
print(f"Human Coder:       {human_coder_output}")
print()
print(f"Agreement:         {agreement}/{total_pairs} pairs")
print(f"Accuracy:          {accuracy:.0%}")
print()
print("Note: High agreement suggests reliable extraction, but always validate.")

VALIDATION AND INTER-RATER RELIABILITY

LLM Output:        [('Leah', 'Narrator'), ('Marcus', 'Narrator'), ('Leah', 'Marcus'), ('Tina', 'Group')]
Human Coder:       [('Leah', 'Narrator'), ('Marcus', 'Narrator'), ('Leah', 'Marcus'), ('Tina', 'Group')]

Agreement:         4/4 pairs
Accuracy:          100%

Note: High agreement suggests reliable extraction, but always validate.


## Documentation and Audit Trail

Document all LLM use for transparency and accountability.

In [12]:
# One audit record describing how the LLM was used in this session.
audit_entry = {
    "model": "gpt-4o-mini",
    "purpose": "Provisional relationship extraction",
    "filters": "Only explicit relations retained",
    "review": "Manual verification applied",
    "date": "2026-01-26",
}
documentation = pd.DataFrame([audit_entry])

print("DOCUMENTATION AND AUDIT TRAIL:", "", sep="\n")
print(documentation.to_string())
print("", "This documentation is part of ethical accountability.", sep="\n")

DOCUMENTATION AND AUDIT TRAIL:

         model                              purpose                           filters                       review        date
0  gpt-4o-mini  Provisional relationship extraction  Only explicit relations retained  Manual verification applied  2026-01-26

This documentation is part of ethical accountability.


## Session 5 Takeaway

Ethical responsibility is not an add-on. It is embedded in:
- Data choices
- Prompts
- Filters
- Representations

## Part 2: Applying Computational Practice to 20 Newsgroups Dataset

Now we apply the same ethical and computational practices to a larger dataset.

In [13]:
# --- CONFIGURATION ---
n = 20  # Number of Nodes (Researchers)
m = 100  # Number of Edges (Interactions/posts)

# Dataset Description
# The 20 Newsgroups dataset is a collection of approximately 18,000 newsgroup
# posts from the early days of the internet (Usenet), loaded here through
# sklearn.datasets.fetch_20newsgroups. In this notebook the posts supply the
# text only: who "sends" each post to whom is assigned at random among n
# synthetic researchers, so the resulting network is illustrative.

# Generate a unique filename based on m to avoid mixing samples
# NOTE(review): n is not part of the filename, so a snapshot created with a
# different n is reused as-is. The summary line below therefore reports the
# number of nodes actually present in the data.
config_hash = hashlib.md5(f"{m}_newsgroups_s5".encode()).hexdigest()[:8]
SNAPSHOT_FILE = f"news_snapshot_m{m}_{config_hash}.csv"

# CHECK IF WE ALREADY HAVE THE COMPLETE DATA
if os.path.exists(SNAPSHOT_FILE):
    print(f"✅ LOADING PERMANENT SNAPSHOT: {SNAPSHOT_FILE}")
    interactions = pd.read_csv(SNAPSHOT_FILE)
else:
    print("🚀 SNAPSHOT NOT FOUND. GENERATING NEW SAMPLE...")

    # A target must differ from its source, which needs at least two researchers.
    if n < 2:
        raise ValueError("n must be at least 2 so that every source has a distinct target")

    # 1. Fetch the big dataset (11,000+ posts)
    newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    full_df = pd.DataFrame({'text': newsgroups.data})

    # 2. Filter and Sample M posts
    df = full_df[full_df['text'].str.strip().str.len() > 20].copy()
    subset = df.sample(n=m, random_state=42).reset_index(drop=True)

    # 3. Assign the Social Structure (Source/Target)
    # A dedicated seeded generator makes a regenerated snapshot identical on
    # every run, matching the fixed random_state used for the post sample, and
    # leaves the global random state untouched.
    edge_rng = random.Random(42)
    user_pool = [f"Researcher_{i:02d}" for i in range(n)]
    sources = [edge_rng.choice(user_pool) for _ in range(m)]
    targets = [edge_rng.choice([u for u in user_pool if u != s]) for s in sources]

    interactions = pd.DataFrame({
        "source": sources,
        "target": targets,
        # Keep the first 300 characters and flatten newlines to spaces.
        "text": subset['text'].str[:300].replace('\n', ' ', regex=True)
    })

    # 4. IMMEDIATELY SAVE (before LLM processing)
    interactions.to_csv(SNAPSHOT_FILE, index=False)
    print(f"💾 PERMANENTLY SAVED: {SNAPSHOT_FILE}")

# Count the researchers that actually occur, whether loaded or freshly generated.
observed_nodes = len(set(interactions["source"]) | set(interactions["target"]))
print(f"\n--- READY: {len(interactions)} interactions between {observed_nodes} nodes ---")
interactions.head()

🚀 SNAPSHOT NOT FOUND. GENERATING NEW SAMPLE...
💾 PERMANENTLY SAVED: news_snapshot_m100_514f6311.csv

--- READY: 100 interactions between 20 nodes ---


Unnamed: 0,source,target,text
0,Researcher_09,Researcher_11,In case you missed it on the news....the first...
1,Researcher_17,Researcher_04,We have no way of knowing because we cann...
2,Researcher_15,Researcher_14,The lengthy article you quote doesn't imply ...
3,Researcher_09,Researcher_07,"The recent rise of nostalgia in this group, co..."
4,Researcher_17,Researcher_08,"# ## Absolutely nothing, seeing as there is no..."


## Preprocessing Newsgroups Data

Apply the same preprocessing and cleaning procedures to the newsgroups dataset.

In [14]:
# Clean every post with the same whitespace normalisation used earlier.
interactions['cleaned_text'] = interactions['text'].apply(preprocess_text)

# Average length before and after, to show how much whitespace was removed.
mean_len_original = interactions['text'].str.len().mean()
mean_len_cleaned = interactions['cleaned_text'].str.len().mean()

print("PREPROCESSING NEWSGROUPS DATA:")
print()
print(f"Original text length (mean): {mean_len_original:.1f} chars")
print(f"Cleaned text length (mean):  {mean_len_cleaned:.1f} chars")
print()
print("Sample (before and after):")
# First 100 characters of one row, raw and cleaned.
sample_idx = 0
sample_row = interactions.iloc[sample_idx]
print(f"Before: {sample_row['text'][:100]}...")
print(f"After:  {sample_row['cleaned_text'][:100]}...")

PREPROCESSING NEWSGROUPS DATA:

Original text length (mean): 263.0 chars
Cleaned text length (mean):  252.7 chars

Sample (before and after):
Before: In case you missed it on the news....the first 16 Haitians of many that tested positive for HIV and ...
After:  In case you missed it on the news....the first 16 Haitians of many that tested positive for HIV and ...


## LLM-Assisted Interpretation on Newsgroups Sample

Apply LLM-assisted interpretation with ethical checkpoints to newsgroups data.

In [15]:
# Draw up to three posts (fixed seed) for a closer look.
sample_size = min(3, len(interactions))
sample_interactions = interactions.sample(n=sample_size, random_state=42)

print("LLM-ASSISTED INTERPRETATION (Newsgroups Sample):")
print("=" * 60)
print()

# NOTE(review): the raw 'text' column is sent to the model, not
# 'cleaned_text' -- confirm which one is intended.
rule = "-" * 60
for idx, row in sample_interactions.iterrows():
    text = row['text']
    preview = text[:80]
    interpretation = get_label(
        model_id="openai",
        text=text,
        api_func=query_openai_relationships,
        prompt_type="newsgroups_relationships",
    )
    print(f"Post {idx}: {preview}...")
    print(f"Interpretation:\n{interpretation}")
    print(rule)

LLM-ASSISTED INTERPRETATION (Newsgroups Sample):

Post 83:     Ok boys & girls, hang on; here we go!      Christ's Eternal Gospel          ...
Interpretation:
Based on the text provided, the social relationships can be identified as follows:

1. **Robinson & Robinson**
   - **Relationship**: Collaborative Partnership
   - **Description**: This pair appears to be working together on a topic related to the field of religious studies, possibly co-authors or collaborators on a project or publication.

2. **WS LaSor & RH Eisenman**
   - **Relationship**: Academic Peers
   - **Description**: Both individuals are mentioned in the context of religious texts and scholarly analysis, indicating they are likely colleagues or peers in the academic field, possibly engaging in scholarly discourse or research.

3. **RH Eisenman & Quamran**
   - **Relationship**: Researcher and Subject of Study
   - **Description**: RH Eisenman is associated with the Dead Sea Scrolls, which are linked to the site of Qu

## Anonymization of Newsgroups Researchers

Apply anonymization to protect researcher identity while preserving network structure.

In [16]:
# Add a hashed pseudonym column for each end of every interaction.
for role in ("source", "target"):
    interactions[f"anon_{role}"] = interactions[role].apply(anonymize_user)

# Side-by-side view of identifiers and their pseudonyms (first ten rows).
mapping_preview = interactions[['source', 'anon_source', 'target', 'anon_target']].head(10)

print("ANONYMIZATION (Newsgroups):", "", sep="\n")
print(mapping_preview)
print("", "Anonymization preserves network structure while protecting identity.", sep="\n")

ANONYMIZATION (Newsgroups):

          source anon_source         target anon_target
0  Researcher_09    8e63bbe2  Researcher_11    002f27a4
1  Researcher_17    a1c35218  Researcher_04    12035e93
2  Researcher_15    5264903f  Researcher_14    cdc051b2
3  Researcher_09    8e63bbe2  Researcher_07    55c33e28
4  Researcher_17    a1c35218  Researcher_08    03a3094f
5  Researcher_08    03a3094f  Researcher_02    b4c254e8
6  Researcher_11    002f27a4  Researcher_01    1cf82af9
7  Researcher_05    5c4c51f2  Researcher_02    b4c254e8
8  Researcher_01    1cf82af9  Researcher_06    01664e66
9  Researcher_15    5264903f  Researcher_10    28385db5

Anonymization preserves network structure while protecting identity.


## Network Construction from Newsgroups

Build a directed graph from the anonymized newsgroups interactions.

In [17]:
# Directed graph over the pseudonymised endpoints. A DiGraph keeps one edge per
# ordered pair, so repeated interactions between the same two nodes collapse
# into a single edge.
G = nx.from_pandas_edgelist(
    interactions,
    source='anon_source',
    target='anon_target',
    create_using=nx.DiGraph(),
)

node_count = G.number_of_nodes()
edge_count = G.number_of_edges()

print("Graph Statistics:")
print(f"Nodes: {node_count}")
print(f"Edges: {edge_count}")
print(f"Density: {nx.density(G):.3f}")
print()

# Print first 10 edges
print("Sample edges (anonymized):")
for edge in list(G.edges(data=True))[:10]:
    print(edge)

Graph Statistics:
Nodes: 20
Edges: 81
Density: 0.213

Sample edges (anonymized):
('8e63bbe2', '002f27a4', {})
('8e63bbe2', '55c33e28', {})
('8e63bbe2', '5c32df15', {})
('8e63bbe2', '5264903f', {})
('8e63bbe2', '12035e93', {})
('002f27a4', '1cf82af9', {})
('a1c35218', '12035e93', {})
('a1c35218', '03a3094f', {})
('a1c35218', '30f11a4b', {})
('a1c35218', 'cdc051b2', {})


## Network Visualization

Visualize the newsgroups social network using pyvis with labels only (no circles).

In [18]:
# Initialize Network
net = Network(height="500px", width="100%", directed=True, bgcolor="#ffffff")

# Add Nodes (labels only, no visible circles): a tiny white dot without border
# on a white background leaves just the label visible.
for node in G.nodes():
    net.add_node(
        node, 
        label=node, 
        shape='dot',
        size=1,
        color='#ffffff',
        borderWidth=0,
        font={'size': 12, 'color': 'black', 'align': 'center'}
    )
    
# Add Edges: grey, slightly curved arrows pointing at the target.
for source, target in G.edges():
    net.add_edge(
        source, 
        target, 
        color='#848484',
        arrows={'to': {'enabled': True, 'scaleFactor': 0.5}},
        smooth={'type': 'curvedCW', 'roundness': 0.2},
        font={'align': 'top', 'size': 12, 'color': 'blue'}
    )

# Physics and Rendering
net.set_options("""
var options = {
  "physics": {
    "barnesHut": { "gravitationalConstant": -3000, "springLength": 150 }
  }
}
""")

html_content = net.generate_html()
# Write the page as UTF-8 explicitly. Without an encoding, open() uses the
# platform default, which on some systems cannot represent every character and
# would either raise an error or produce a file the browser misreads.
with open("newsgroups_graph_s5.html", "w", encoding="utf-8") as f:
    f.write(html_content)

# Embed the saved page in the notebook output.
IPython.display.IFrame(src="newsgroups_graph_s5.html", width='100%', height='550px')