<div style="text-align: right;">© 2026 Moses Boudourides. All Rights Reserved.</div>

# LLMs for Qualitative and Mixed-Methods Social Network Analysis (SNA)
## Moses Boudourides

# Session 1: Motivation and Qualitative SNA Foundations

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from pyvis.network import Network
import os
import json
import sys
import hashlib
import random
from sklearn.datasets import fetch_20newsgroups
import IPython
from openai import OpenAI
# import google.generativeai as genai

In [2]:
# --- 1. & 2. KEY LOADING & INITIALIZATION ---

# Force Google to use REST to avoid ALTS/GCP credential errors
# NOTE(review): this setting only matters for the Gemini client, whose setup
# is currently commented out further down in this cell. The variable name
# suggests it governs mTLS rather than the REST/gRPC transport -- confirm
# against the Google client-library documentation.
os.environ["GOOGLE_API_USE_MTLS"] = "never" 

def get_api_key(file_path: str) -> "str | None":
    """Read an API key from a text file.

    Surrounding whitespace is stripped and every single or double quote
    character is removed, so a key stored as ``"sk-..."`` loads cleanly.

    Args:
        file_path: Path of the file that holds the key.

    Returns:
        The cleaned key, or ``None`` when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        return None
    # utf-8-sig drops a leading byte-order mark (written by some editors);
    # a BOM is not whitespace, so strip() would leave it inside the key.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        return f.read().strip().replace('"', "").replace("'", "")

# Read the OpenAI key from a local file; get_api_key returns None when the
# file does not exist.
oa_key = get_api_key("openai_key.txt")
# gem_key = get_api_key("gemini_key.txt")

# Initialize OpenAI
# NOTE(review): when oa_key is None the client presumably falls back to the
# OPENAI_API_KEY environment variable or raises -- confirm against the
# openai package documentation.
client_oa = OpenAI(api_key=oa_key)

# # Initialize Gemini using 'rest' transport to bypass gRPC/ALTS errors
# genai.configure(api_key=gem_key, transport='rest')

# # Dynamic Model Selection
# available_models = [m.name for m in genai.list_models() if 'generateContent' in m.supported_generation_methods]
# target_model = 'gemini-1.5-flash' if 'models/gemini-1.5-flash' in available_models else available_models[0].split('/')[-1]
# model_gemini = genai.GenerativeModel(target_model)

In [3]:
# --- 3. DATA & PERSISTENT QUERY STEP ---

# 1. Configuration & Data
# Instruction that is sent to the model together with every message
sys_prompt = "Classify this tie: 'Professional', 'Social', or 'Supportive'. Respond with ONLY the word."

# Toy corpus: one (sender, receiver, message) triple per directed interaction
_FIELDS = ("source", "target", "text")
_ROWS = [
    ("Alice", "Bob", "I need the report by 5 PM or there will be consequences."),
    ("Bob", "Alice", "I'm working on it, please don't worry."),
    ("Bob", "Charlie", "Hey man, do you want to grab a coffee?"),
    ("Alice", "Charlie", "Great job on the presentation!"),
    ("Mary", "Charlie", "What's up, dude?"),
]

# Same list-of-dicts shape as before, then the DataFrame used by later cells
data = [dict(zip(_FIELDS, row)) for row in _ROWS]
interactions = pd.DataFrame(data)

# 2. Persistence Logic
# Labels already obtained from a model are kept in a JSON file so re-running
# the notebook does not repeat (paid) API calls.
CACHE_FILE = "llm_cache.json"

try:
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        cache = json.load(f)
except FileNotFoundError:
    # First run: nothing cached yet
    cache = {}
except json.JSONDecodeError as err:
    # A truncated/corrupt file (e.g. from an interrupted write) should not
    # block the notebook; the labels are simply fetched again.
    print(f"Warning: ignoring unreadable cache file {CACHE_FILE!r} ({err}); starting with an empty cache.")
    cache = {}

def get_label(model_id, text, api_func):
    """Return the label for ``text``, calling the API only on a cache miss.

    Args:
        model_id: Short rater name (e.g. "openai") that namespaces cache keys.
        text: The message to classify.
        api_func: One-argument callable that sends ``text`` to the model and
            returns its label.

    Returns:
        The cached label when one exists, otherwise the fresh result of
        ``api_func(text)`` (which is stored in the cache and saved to disk).
    """
    # The key embeds the rater and the current global prompt, so changing
    # either one creates a new entry instead of reusing a stale label.
    cache_key = f"{model_id}_{sys_prompt}_{text}"

    if cache_key in cache:
        return cache[cache_key]

    # Cache miss: call the API and remember the answer
    result = api_func(text)
    cache[cache_key] = result

    # Save after every miss. Writing to a temporary file and swapping it in
    # with os.replace means an interrupted run cannot leave a half-written
    # (unparseable) cache file behind.
    tmp_path = f"{CACHE_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f)
    os.replace(tmp_path, CACHE_FILE)
    return result

# 3. API Execution Wrappers
def query_openai(text):
    """Classify ``text`` with GPT-4o and return the reply, stripped.

    The current global ``sys_prompt`` and the text are sent together as a
    single user message, with temperature 0 requested.
    """
    prompt = f"{sys_prompt}\nText: {text}"
    response = client_oa.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    reply = response.choices[0].message.content
    return reply.strip()

def query_gemini(text):
    """Classify ``text`` with the Gemini model and return the reply, stripped.

    Relies on a module-level ``model_gemini``. In this notebook its
    initialisation is commented out, so that setup has to be re-enabled
    before this function can be called.
    """
    prompt = f"{sys_prompt}\nText: {text}"
    response = model_gemini.generate_content(prompt)
    return response.text.strip()

# 4. Run Process
print("Processing (Syncing Cache/API)...")

# Label every message with the OpenAI rater; get_label checks the cache
# first and only calls the API for texts it has not seen.
interactions['oa_label'] = [
    get_label("openai", message, query_openai)
    for message in interactions['text']
]

# interactions['gem_label'] = interactions['text'].apply(
#     lambda x: get_label("gemini", x, query_gemini)
# )

print("Status: All labels synchronized.")
interactions

Processing (Syncing Cache/API)...
Status: All labels synchronized.


Unnamed: 0,source,target,text,oa_label
0,Alice,Bob,I need the report by 5 PM or there will be con...,Professional
1,Bob,Alice,"I'm working on it, please don't worry.",Supportive
2,Bob,Charlie,"Hey man, do you want to grab a coffee?",Social
3,Alice,Charlie,Great job on the presentation!,Supportive
4,Mary,Charlie,"What's up, dude?",Social


In [4]:
# --- 4. GRAPH CONSTRUCTION ---

# STEP 1: Pick the rater whose labels go on the edges ('openai' or 'gemini')
selected_model = 'openai'

# Each rater's labels live in their own DataFrame column
col_map = {'openai': 'oa_label', 'gemini': 'gem_label'}
target_col = col_map[selected_model]

# STEP 2: Keep only the endpoints plus the chosen rater's column, exposed
# under the generic name 'label' so the graph code does not depend on the rater
edge_columns = ['source', 'target', target_col]
clean_interactions = interactions[edge_columns].rename(columns={target_col: 'label'})

# STEP 3: Build the directed graph; 'label' is the only edge attribute, so
# the other rater's labels never reach G
G = nx.from_pandas_edgelist(
    clean_interactions,
    source='source',
    target='target',
    edge_attr='label',
    create_using=nx.DiGraph(),
)

# Show every edge together with its attribute dict
for edge in G.edges(data=True):
    print(edge)

('Alice', 'Bob', {'label': 'Professional'})
('Alice', 'Charlie', {'label': 'Supportive'})
('Bob', 'Alice', {'label': 'Supportive'})
('Bob', 'Charlie', {'label': 'Social'})
('Mary', 'Charlie', {'label': 'Social'})


In [5]:
# Interactive pyvis rendering of the labelled interactions, written to
# social_graph.html and embedded in the notebook.

# 1. Initialize
net = Network(height="500px", width="100%", directed=True, bgcolor="#ffffff")

# 2. Add Nodes with Internal Labels
# Shape 'circle' or 'ellipse' puts the label inside the node.
# Sorting gives a stable insertion order; plain set iteration order for
# strings can differ from one Python session to the next.
node_names = sorted(set(interactions['source']) | set(interactions['target']))
for node in node_names:
    net.add_node(
        node,
        label=node,
        shape='circle',
        color='#f0f7ff',
        # These constraints act as the node 'size' here: raise 40 for
        # bigger nodes, lower it for smaller ones.
        widthConstraint=40,
        heightConstraint={'minimum': 40, 'valign': 'middle'},
        font={'size': 14, 'color': 'black', 'align': 'center'}
    )

# 3. Add Edges
# NOTE(review): labels are read from 'oa_label' directly, not from the rater
# selected in the graph-construction cell -- confirm this is intended.
for _, row in interactions.iterrows():
    label_text = row['oa_label']

    net.add_edge(
        row['source'],
        row['target'],
        label=label_text,
        color='#848484',
        arrows={'to': {'enabled': True, 'scaleFactor': 0.5}},
        smooth={'type': 'curvedCW', 'roundness': 0.2},
        font={'align': 'top', 'size': 12, 'color': 'blue'}
    )

# 4. Physics and Rendering
net.set_options("""
var options = {
  "physics": {
    "barnesHut": { "gravitationalConstant": -3000, "springLength": 150 }
  }
}
""")

html_content = net.generate_html()
# Write as UTF-8 explicitly: with the platform default encoding (e.g. cp1252
# on Windows) any non-ASCII character in the page raises UnicodeEncodeError.
with open("social_graph.html", "w", encoding="utf-8") as f:
    f.write(html_content)

IPython.display.IFrame(src="social_graph.html", width='100%', height='550px')

In [6]:
# --- CONFIGURATION ---
n = 20   # Number of nodes (researchers)
m = 100  # Number of edges (interactions/posts)

# Dataset Description (Formal Comment for Seminar)
# The 20 Newsgroups dataset is a collection of approximately 18,000 newsgroup posts 
# that originated in the early days of the internet (Usenet) and they can be 
# displayed as a social network (a directed weighted multigraph) among thousands 
# of unique nodes/researchers interacting/replying in the posts of the 20 newsgroups.
# Taken from sklearn.datasets.fetch_20newsgroups

sys_prompt = "Classify this post: 'Directive', 'Commissive', or 'Social'. Respond with ONLY the word."

# Generate a unique filename based on m and the prompt to avoid mixing samples.
# NOTE: n is not part of the hash, so changing only n reuses an existing
# snapshot; delete the CSV to regenerate with a different number of nodes.
config_hash = hashlib.md5(f"{m}_{sys_prompt}".encode()).hexdigest()[:8]
SNAPSHOT_FILE = f"news_snapshot_m{m}_{config_hash}.csv"

# CHECK IF WE ALREADY HAVE THE COMPLETE DATA
if os.path.exists(SNAPSHOT_FILE):
    print(f"✅ LOADING PERMANENT SNAPSHOT: {SNAPSHOT_FILE}")
    interactions = pd.read_csv(SNAPSHOT_FILE)
else:
    print("🚀 SNAPSHOT NOT FOUND. GENERATING NEW SAMPLE...")

    if n < 2:
        # Every post needs a target different from its source
        raise ValueError("n must be at least 2 to assign distinct source/target nodes")

    # 1. Fetch the training split (11,000+ posts) without headers, footers
    #    and quoted replies
    newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    full_df = pd.DataFrame({'text': newsgroups.data})

    # 2. Filter out near-empty posts and sample m of them (seeded)
    df = full_df[full_df['text'].str.strip().str.len() > 20].copy()
    subset = df.sample(n=m, random_state=42).reset_index(drop=True)

    # 3. Assign the Social Structure (Source/Target)
    # A dedicated seeded generator makes the synthetic network reproducible,
    # matching the seeded text sample, without touching the global RNG state.
    rng = random.Random(42)
    user_pool = [f"Researcher_{i:02d}" for i in range(n)]
    sources = [rng.choice(user_pool) for _ in range(m)]
    targets = [rng.choice([u for u in user_pool if u != s]) for s in sources]

    interactions = pd.DataFrame({
        "source": sources,
        "target": targets,
        "text": subset['text'].str[:300].replace('\n', ' ', regex=True)
    })

    # 4. Label via API (ONLY done if CSV doesn't exist)
    # Going through get_label stores each answer in the on-disk cache as it
    # arrives, so a failure part-way through does not discard the labels
    # already obtained. The cache key includes the current sys_prompt.
    print(f"Syncing {m} labels with GPT-4o (This will only happen once)...")
    interactions['oa_label'] = interactions['text'].apply(
        lambda post: get_label("openai", post, query_openai)
    )

    # 5. IMMEDIATELY SAVE
    interactions.to_csv(SNAPSHOT_FILE, index=False)
    print(f"💾 PERMANENTLY SAVED: {SNAPSHOT_FILE}")

# Count the nodes actually present: a loaded snapshot may have been created
# with a different n, and a random draw need not use every researcher.
n_nodes = len(set(interactions['source']) | set(interactions['target']))
print(f"\n--- READY: {len(interactions)} interactions between {n_nodes} nodes ---")

print(interactions['oa_label'].value_counts())
interactions.head()

✅ LOADING PERMANENT SNAPSHOT: news_snapshot_m100_6d4e403c.csv

--- READY: 100 interactions between 20 nodes ---
oa_label
Social        88
Directive     11
Commissive     1
Name: count, dtype: int64


Unnamed: 0,source,target,text,oa_label
0,Researcher_14,Researcher_11,In case you missed it on the news....the first...,Social
1,Researcher_01,Researcher_15,We have no way of knowing because we cann...,Social
2,Researcher_01,Researcher_08,The lengthy article you quote doesn't imply ...,Social
3,Researcher_12,Researcher_10,"The recent rise of nostalgia in this group, co...",Social
4,Researcher_05,Researcher_14,"# ## Absolutely nothing, seeing as there is no...",Social


In [7]:
# --- 4. GRAPH CONSTRUCTION ---

# STEP 1: Pick the rater whose labels go on the edges ('openai' or 'gemini')
selected_model = 'openai'

# Each rater's labels live in their own DataFrame column
col_map = {'openai': 'oa_label', 'gemini': 'gem_label'}
target_col = col_map[selected_model]

# STEP 2: Keep only the endpoints plus the chosen rater's column, exposed
# under the generic name 'label' so the graph code does not depend on the rater
edge_columns = ['source', 'target', target_col]
clean_interactions = interactions[edge_columns].rename(columns={target_col: 'label'})

# STEP 3: Build the directed graph; 'label' is the only edge attribute, so
# the other rater's labels never reach G
G = nx.from_pandas_edgelist(
    clean_interactions,
    source='source',
    target='target',
    edge_attr='label',
    create_using=nx.DiGraph(),
)

# Preview: print only the first ten edges with their attribute dicts
for edge in list(G.edges(data=True))[:10]:
    print(edge)

('Researcher_14', 'Researcher_11', {'label': 'Social'})
('Researcher_14', 'Researcher_08', {'label': 'Social'})
('Researcher_14', 'Researcher_09', {'label': 'Social'})
('Researcher_14', 'Researcher_10', {'label': 'Social'})
('Researcher_11', 'Researcher_00', {'label': 'Social'})
('Researcher_11', 'Researcher_16', {'label': 'Directive'})
('Researcher_01', 'Researcher_15', {'label': 'Social'})
('Researcher_01', 'Researcher_08', {'label': 'Social'})
('Researcher_01', 'Researcher_05', {'label': 'Social'})
('Researcher_01', 'Researcher_12', {'label': 'Social'})


In [8]:
# Circular pyvis view of the interactions: one coloured, weighted edge per
# (source, target, label) combination, with a hover tooltip.

# 1. SETUP COLORS
color_map = {
    'Directive': '#e74c3c', 'Commissive': '#2ecc71',
    'Social': '#3498db', 'Administrative': '#95a5a6'
}

# 2. AGGREGATE DATA
# We group by source/target/label to ensure weights are accurate per type
weighted_df = interactions.groupby(['source', 'target', 'oa_label']).size().reset_index(name='weight')

# 3. CIRCULAR LAYOUT
# G supplies the node set and the layout. NOTE: a DiGraph keeps a single edge
# per ordered pair, so for a pair with several label types G itself holds
# only the last row's weight/label.
G = nx.DiGraph()
for _, row in weighted_df.iterrows():
    G.add_edge(row['source'], row['target'], weight=row['weight'], label=row['oa_label'])

pos = nx.circular_layout(G, scale=400)

# 4. INITIALIZE PYVIS
net = Network(height='750px', width='100%', directed=True, notebook=True)

# 5. ADD NODES
# physics=False pins every node at its precomputed circular position
for node_id in G.nodes():
    net.add_node(
        node_id, label=node_id,
        x=pos[node_id][0], y=pos[node_id][1],
        shape='text', font={'size': 22}, physics=False
    )

# 6. ADD EDGES WITH INTERACTIVE HOVER
# Draw from weighted_df rather than from G so that every label type between
# the same two nodes gets its own edge instead of being overwritten.
parallel_count = {}
for _, row in weighted_df.iterrows():
    pair = (row['source'], row['target'])
    k = parallel_count.get(pair, 0)  # how many edges this pair already has
    parallel_count[pair] = k + 1

    weight = int(row['weight'])
    # Tooltip shown when hovering over the edge
    hover_text = f"Type: {row['oa_label']} | Count: {weight} messages"

    net.add_edge(
        row['source'], row['target'],
        color=color_map.get(row['oa_label'], '#bdc3c7'),  # grey for unknown labels
        value=weight,          # Visual thickness
        title=hover_text,
        # Bend each additional parallel edge a little more so they do not
        # lie exactly on top of one another
        smooth={'type': 'curvedCW', 'roundness': 0.15 + 0.15 * k}
    )

# 7. SAVE AND OPEN
net.show("hover_weighted_circle.html")

hover_weighted_circle.html
