<div style="text-align: right;">© 2026 Moses Boudourides. All Rights Reserved.</div>

# LLMs for Qualitative and Mixed-Methods Social Network Analysis (SNA)
## Moses Boudourides

# Session 2: Qualitative and Mixed-Methods SNA

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from pyvis.network import Network
import os
import json
import sys
import hashlib
import random
from sklearn.datasets import fetch_20newsgroups
import IPython
from openai import OpenAI
# import google.generativeai as genai

In [2]:
# API Key Loading
def get_api_key(file_path):
    """Read an API key from a plain-text file.

    Args:
        file_path: Path of the text file that holds the key.

    Returns:
        The key with surrounding whitespace and any single/double quote
        characters removed, or ``None`` when the path is not a regular file
        or the file holds no key after cleaning.
    """
    # isfile (rather than exists) keeps a directory path from reaching open().
    if not os.path.isfile(file_path):
        return None
    # utf-8-sig transparently drops a leading byte-order mark, which some
    # editors prepend and which would otherwise become part of the key.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        raw = f.read()
    key = raw.strip().replace('"', '').replace("'", "").strip()
    # An empty file is reported the same way as a missing one.
    return key or None

# OpenAI client, built from the key stored in openai_key.txt
# (get_api_key returns None when that file is absent).
oa_key = get_api_key("openai_key.txt")
client_oa = OpenAI(api_key=oa_key)

# JSON-backed label cache. Despite the earlier "Session 1" wording, the file
# used here is the session-2 cache file.
CACHE_FILE = "llm_cache_session2.json"
if not os.path.exists(CACHE_FILE):
    # First run: nothing on disk yet, start with an empty cache.
    cache = {}
else:
    with open(CACHE_FILE, "r") as f:
        cache = json.load(f)

def get_label(model_id, text, sys_prompt, api_func):
    """Return the label for ``text``, calling the API only on a cache miss.

    The cache key is the literal string ``"{model_id}_{sys_prompt}_{text}"``.
    On a miss, ``api_func(text, sys_prompt)`` is called, its result is stored
    in the module-level ``cache`` dict, and the whole dict is rewritten to
    ``CACHE_FILE`` as JSON.

    Args:
        model_id: Prefix that separates cache entries of different models.
        text: The text to label.
        sys_prompt: The instruction passed to ``api_func`` with the text.
        api_func: Callable invoked as ``api_func(text, sys_prompt)``.

    Returns:
        The cached or freshly obtained result of ``api_func``.
    """
    key = f"{model_id}_{sys_prompt}_{text}"
    try:
        return cache[key]
    except KeyError:
        pass  # not cached yet: fall through to the API call

    label = api_func(text, sys_prompt)
    cache[key] = label
    # Persist after every new label so finished calls survive an interruption.
    with open(CACHE_FILE, "w") as fh:
        json.dump(cache, fh)
    return label

def query_openai(text, sys_prompt):
    """Send ``sys_prompt`` and ``text`` to gpt-4o and return the stripped reply.

    Both pieces are joined into one message with role ``"user"``
    (``"{sys_prompt}\\nText: {text}"``); no separate system message is sent.
    The request uses ``temperature=0``.

    Args:
        text: The text to be labelled.
        sys_prompt: The instruction placed in front of the text.

    Returns:
        The content of the first choice, with surrounding whitespace removed.
    """
    prompt = f"{sys_prompt}\nText: {text}"
    completion = client_oa.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

In [3]:
# Session-2 label cache, loaded into its own dict (cache_s2).
# NOTE(review): this is the same path as CACHE_FILE above. get_label and
# get_label_s2 each dump their own dict to it, so whichever writes last
# replaces the other's entries on disk — confirm this sharing is intended.
CACHE_FILE_S2 = "llm_cache_session2.json"
if not os.path.exists(CACHE_FILE_S2):
    cache_s2 = {}
else:
    with open(CACHE_FILE_S2, "r") as f:
        cache_s2 = json.load(f)

def get_label_s2(model_id, text, sys_prompt, api_func):
    """Return the label for ``text`` using the hashed session-2 cache.

    The cache key is ``"{model_id}_"`` followed by the MD5 hex digest of
    ``sys_prompt + text``, so keys stay short whatever the text length.
    On a miss, ``api_func(text, sys_prompt)`` is called, the result is stored
    in the module-level ``cache_s2`` dict, and the whole dict is rewritten to
    ``CACHE_FILE_S2`` as JSON.

    Args:
        model_id: Prefix that separates cache entries of different models.
        text: The text to label.
        sys_prompt: The instruction passed to ``api_func`` with the text.
        api_func: Callable invoked as ``api_func(text, sys_prompt)``.

    Returns:
        The cached or freshly obtained result of ``api_func``.
    """
    payload = (sys_prompt + text).encode()
    digest = hashlib.md5(payload).hexdigest()
    key = f"{model_id}_{digest}"
    try:
        return cache_s2[key]
    except KeyError:
        pass  # not cached yet: fall through to the API call

    label = api_func(text, sys_prompt)
    cache_s2[key] = label
    # Persist after every new label so finished calls survive an interruption.
    with open(CACHE_FILE_S2, "w") as fh:
        json.dump(cache_s2, fh)
    return label

# --- 1. DATA PREPARATION: one ego, seven alters, one free-text note per tie ---
_EGO_NAME = "Steve Jobs"

# (alter, descriptive note) pairs; every tie shares the same ego.
_ALTER_NOTES = [
    ("Steve Wozniak", "He was the technical genius who co-founded Apple and built the first boards."),
    ("Tim Cook", "The operational expert who took over the company and scaled it globally."),
    ("John Sculley", "A CEO from Pepsi who had a brief, tumultuous tenure ending in a boardroom coup."),
    ("Paul Rand", "An outside contractor hired briefly to design a single corporate logo for NeXT."),
    ("Regis McKenna", "A public relations consultant who helped with early marketing but was not an employee."),
    ("Jony Ive", "The soulmate designer who shared Jobs's office and designed every major product."),
    ("Bob Dylan", "The singer he was a devoted fan of."),
]

ego_data_raw = [
    {"ego": _EGO_NAME, "alter": alter_name, "text": note}
    for alter_name, note in _ALTER_NOTES
]

# Edge-list frame: ego/alter become the source/target columns used downstream.
interactions = pd.DataFrame(ego_data_raw).rename(
    columns={'ego': 'source', 'alter': 'target'}
)

# --- 2. DISCOVERY PROMPTS ---
# p_cat  -> professional category of the alter
# p_emic -> meaning of the tie from the ego's point of view
# p_etic -> Core/Periphery classification of the tie
p_cat = "Identify the professional category or identity of this person. Respond with ONLY the title."
p_emic = "From the perspective of the Ego, what is the personal meaning/role of this tie? Respond with ONLY a short phrase."
p_etic = "Based on historical facts, classify this relationship as 'Core' or 'Periphery' in terms of daily organizational impact. Respond with ONLY the word."

print("LLM revealing categories, emic meanings, and etic positions...")


def _discover(prompt_input, prompt):
    """Label one input with gpt-4o through the session-2 cache."""
    return get_label_s2("openai", prompt_input, prompt, query_openai)


# Category and emic prompts see only the free-text note of each tie.
interactions['target_label'] = [_discover(note, p_cat) for note in interactions['text']]
interactions['emic_label'] = [_discover(note, p_emic) for note in interactions['text']]

# The etic prompt additionally receives both names of the dyad.
interactions['etic_label'] = [
    _discover(f"Relation: {src} & {tgt} | Context: {note}", p_etic)
    for src, tgt, note in zip(
        interactions['source'], interactions['target'], interactions['text']
    )
]

# Combined "emic (etic)" label for the visualization
interactions['oa_label'] = [
    f"{emic} ({etic})"
    for emic, etic in zip(interactions['emic_label'], interactions['etic_label'])
]

print("\nInteractions Dataframe Created with Discovered Labels:")
interactions

LLM revealing categories, emic meanings, and etic positions...

Interactions Dataframe Created with Discovered Labels:


Unnamed: 0,source,target,text,target_label,emic_label,etic_label,oa_label
0,Steve Jobs,Steve Wozniak,He was the technical genius who co-founded App...,Steve Wozniak,Symbol of personal achievement and identity.,Core,Symbol of personal achievement and identity. (...
1,Steve Jobs,Tim Cook,The operational expert who took over the compa...,CEO,Symbol of authority and achievement.,Core,Symbol of authority and achievement. (Core)
2,Steve Jobs,John Sculley,"A CEO from Pepsi who had a brief, tumultuous t...",CEO,Symbol of authority and status.,Periphery,Symbol of authority and status. (Periphery)
3,Steve Jobs,Paul Rand,An outside contractor hired briefly to design ...,Graphic Designer,Professional collaboration.,Periphery,Professional collaboration. (Periphery)
4,Steve Jobs,Regis McKenna,A public relations consultant who helped with ...,Consultant,A symbol of external validation and influence.,Periphery,A symbol of external validation and influence....
5,Steve Jobs,Jony Ive,The soulmate designer who shared Jobs's office...,Jony Ive,Creative partnership and identity.,Core,Creative partnership and identity. (Core)
6,Steve Jobs,Bob Dylan,The singer he was a devoted fan of.,Fan,Admiration and identity connection.,Periphery,Admiration and identity connection. (Periphery)


In [4]:
# 1. Initialize the pyvis network (directed: every tie points from source to target)
net = Network(height="500px", width="100%", directed=True, bgcolor="#ffffff")

# 2. Add nodes with interactive hover labels
EGO_NAME = "Steve Jobs"
all_nodes = set(interactions['source']).union(set(interactions['target']))

# Etic label per alter, taken from the first row in which the alter is a target.
# Built once here instead of filtering the DataFrame again for every node.
etic_by_alter = {}
for alter_name, etic_value in zip(interactions['target'], interactions['etic_label']):
    etic_by_alter.setdefault(alter_name, etic_value)

# sorted() makes the insertion order deterministic; iterating the bare set
# can change order from one kernel session to the next.
for node in sorted(all_nodes):
    is_ego = (node == EGO_NAME)
    node_color = '#ffcccb' if is_ego else '#f0f7ff'

    if is_ego:
        # The ego shows its name; hovering reveals its role in the graph.
        hover_label = "ego"
    else:
        # Alters show their name; hovering reveals the Core/Periphery label.
        # A node that never occurs as a target has no etic label to show.
        hover_label = f"Etic: {etic_by_alter.get(node, 'n/a')}"

    net.add_node(
        node,
        label=node,
        title=hover_label,
        shape='circle',
        color=node_color,
        widthConstraint=80,
        heightConstraint={'minimum': 80, 'valign': 'middle'},
        font={'size': 14, 'color': 'black', 'align': 'center'}
    )

# 3. Add edges; the emic meaning is only shown on hover (empty visible label)
for _, row in interactions.iterrows():
    hover_emic = f"Emic: {row['emic_label']}"

    net.add_edge(
        row['source'],
        row['target'],
        label="",
        title=hover_emic,
        color='green',
        arrows={'to': {'enabled': True, 'scaleFactor': 0.5}},
        smooth={'type': 'curvedCW', 'roundness': 0}
    )

# 4. Physics and rendering
net.set_options("""
var options = {
  "physics": {
    "barnesHut": { "gravitationalConstant": -3000, "springLength": 150 }
  },
  "interaction": {
    "hover": true
  }
}
""")

html_content = net.generate_html()
# Write with an explicit encoding: the platform default is not UTF-8 everywhere,
# and labels or hover text may contain non-ASCII characters.
with open("interactive_ego_graph.html", "w", encoding="utf-8") as f:
    f.write(html_content)

IPython.display.IFrame(src="interactive_ego_graph.html", width='100%', height='550px')

In [5]:
# --- CONFIGURATION ---
n = 20   # number of nodes (synthetic "Researcher_XX" users)
m = 100  # number of edges (sampled posts, one interaction each)
EDGE_SEED = 42  # seeds the random source/target assignment below

# Dataset description (formal comment for the seminar)
# The 20 Newsgroups dataset is a collection of approximately 18,000 newsgroup posts
# that originated in the early days of the internet (Usenet) and they can be
# displayed as a social network (a directed weighted multigraph) among thousands
# of unique nodes/researchers interacting/replying in the posts of the 20 newsgroups.
# Taken from sklearn.datasets.fetch_20newsgroups

sys_prompt = "Classify this post: 'Directive', 'Commissive', or 'Social'. Respond with ONLY the word."

# Snapshot filename derived from m and the prompt, to avoid mixing samples.
# NOTE(review): n is not part of the hash, so changing only n reuses an
# existing snapshot that was built with the previous n.
config_hash = hashlib.md5(f"{m}_{sys_prompt}".encode()).hexdigest()[:8]
SNAPSHOT_FILE = f"news_snapshot_m{m}_{config_hash}.csv"

# Reuse the labelled snapshot when it exists; otherwise build, label and save it.
if os.path.exists(SNAPSHOT_FILE):
    print(f"âœ… LOADING PERMANENT SNAPSHOT: {SNAPSHOT_FILE}")
    interactions = pd.read_csv(SNAPSHOT_FILE)
else:
    print("ðŸš€ SNAPSHOT NOT FOUND. GENERATING NEW SAMPLE...")

    # Each source needs at least one other user to address.
    if n < 2:
        raise ValueError("n must be at least 2 to assign a distinct target to every source")

    # 1. Fetch the training subset of 20 Newsgroups (noted above as 11,000+ posts).
    #    Headers, footers and quoted replies are stripped, leaving only body text.
    newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    full_df = pd.DataFrame({'text': newsgroups.data})

    # 2. Drop near-empty posts, then sample m of them reproducibly.
    df = full_df[full_df['text'].str.strip().str.len() > 20].copy()
    subset = df.sample(n=m, random_state=42).reset_index(drop=True)

    # 3. Assign the social structure (source/target).
    #    These ties are synthetic: each post gets a random sender and a random,
    #    different receiver from the user pool. A dedicated seeded generator
    #    makes a rebuilt snapshot identical from run to run.
    rng = random.Random(EDGE_SEED)
    user_pool = [f"Researcher_{i:02d}" for i in range(n)]
    sources = [rng.choice(user_pool) for _ in range(m)]
    targets = [rng.choice([u for u in user_pool if u != s]) for s in sources]

    interactions = pd.DataFrame({
        "source": sources,
        "target": targets,
        # Truncate to 300 characters and flatten newlines to keep prompts short.
        "text": subset['text'].str[:300].replace('\n', ' ', regex=True)
    })

    # 4. Label via the API (only done when the CSV does not exist).
    #    query_openai takes (text, sys_prompt), so the prompt must be passed
    #    explicitly; handing the bare function to apply() raises TypeError.
    print(f"Syncing {m} labels with GPT-4o (This will only happen once)...")
    interactions['oa_label'] = interactions['text'].apply(
        lambda post: query_openai(post, sys_prompt)
    )

    # 5. Save immediately so the labels are never requested twice.
    interactions.to_csv(SNAPSHOT_FILE, index=False)
    print(f"ðŸ’¾ PERMANENTLY SAVED: {SNAPSHOT_FILE}")

print(f"\n--- READY: {len(interactions)} interactions between {n} nodes ---")

print(interactions['oa_label'].value_counts())
interactions.head(10)

âœ… LOADING PERMANENT SNAPSHOT: news_snapshot_m100_6d4e403c.csv

--- READY: 100 interactions between 20 nodes ---
oa_label
Social        88
Directive     11
Commissive     1
Name: count, dtype: int64


Unnamed: 0,source,target,text,oa_label
0,Researcher_14,Researcher_11,In case you missed it on the news....the first...,Social
1,Researcher_01,Researcher_15,We have no way of knowing because we cann...,Social
2,Researcher_01,Researcher_08,The lengthy article you quote doesn't imply ...,Social
3,Researcher_12,Researcher_10,"The recent rise of nostalgia in this group, co...",Social
4,Researcher_05,Researcher_14,"# ## Absolutely nothing, seeing as there is no...",Social
5,Researcher_08,Researcher_14,Why are only those people in favor of the sy...,Social
6,Researcher_16,Researcher_12,"[ ... ] Here'a a copy, cdt: -----------------...",Social
7,Researcher_16,Researcher_11,A few days ago there was a posting in this gro...,Social
8,Researcher_06,Researcher_07,Yes this is a common problem. The leaks oc...,Social
9,Researcher_17,Researcher_11,Could someone please tell me if a 1/4 decoder ...,Social


In [6]:
# --- 2. EXTRACT FULL EGO-NETWORK (INCLUDING ALTER-ALTER) ---
# Ego = the node that appears most often as a source.
ego_node = interactions['source'].value_counts().idxmax()

# Alters = everyone who shares at least one interaction with the ego,
# in order of first appearance.
touches_ego = interactions['source'].eq(ego_node) | interactions['target'].eq(ego_node)
direct_interactions = interactions[touches_ego]
all_alters = [
    name
    for name in pd.unique(direct_interactions[['source', 'target']].values.ravel())
    if name != ego_node
]

# Keep at most the first 50 alters (first-seen order, not ranked) for readability.
selected_alters = all_alters[:50]
group_members = selected_alters + [ego_node]

# Keep every interaction whose two endpoints are both in the group,
# which also captures alter-alter ties.
inside_group = (
    interactions['source'].isin(group_members)
    & interactions['target'].isin(group_members)
)
subgraph_df = interactions[inside_group].copy()

# One row per (source, target, label); weight = number of such interactions.
weighted_ego_df = (
    subgraph_df
    .groupby(['source', 'target', 'oa_label'])
    .size()
    .reset_index(name='weight')
)

print(f"Ego: {ego_node}")
print(f"Alters found: {len(selected_alters)}")
# The figure printed here is the number of aggregated rows in weighted_ego_df.
print(f"Total interactions (including alter-alter): {len(weighted_ego_df)}")
weighted_ego_df.head(10)

Ego: Researcher_10
Alters found: 9
Total interactions (including alter-alter): 29


Unnamed: 0,source,target,oa_label,weight
0,Researcher_01,Researcher_05,Social,1
1,Researcher_01,Researcher_08,Social,1
2,Researcher_01,Researcher_12,Social,1
3,Researcher_01,Researcher_13,Social,1
4,Researcher_03,Researcher_06,Directive,1
5,Researcher_03,Researcher_10,Social,1
6,Researcher_03,Researcher_14,Social,1
7,Researcher_05,Researcher_12,Social,2
8,Researcher_05,Researcher_14,Directive,1
9,Researcher_05,Researcher_14,Social,1


In [7]:
# --- 2. EXTRACTION (STRICT EGONET PROTOCOL) ---
# The ego is pinned by name here instead of being derived from the data.
ego_node = "Researcher_10"

# Rows in which the ego is either sender or receiver.
ego_mask = interactions['source'].eq(ego_node) | interactions['target'].eq(ego_node)
ego_interactions = interactions[ego_mask]

# Everyone appearing in those rows, minus the ego itself (first-seen order).
all_contacts = [
    name
    for name in pd.unique(ego_interactions[['source', 'target']].values.ravel())
    if name != ego_node
]

# Group membership = ego plus all of its alters.
node_filter = {ego_node, *all_contacts}

# Keep any tie with both endpoints inside the group; this is what brings in
# the alter-to-alter ties.
both_inside = (
    interactions['source'].isin(node_filter)
    & interactions['target'].isin(node_filter)
)
subgraph_df = interactions[both_inside].copy()

# One row per (source, target, label); weight = number of such interactions.
weighted_ego_df = (
    subgraph_df
    .groupby(['source', 'target', 'oa_label'])
    .size()
    .reset_index(name='weight')
)

print(f"Ego: {ego_node}")
print(f"Alters found and mapped: {len(all_contacts)}")
print(f"Total edges (Ego-Alter + Alter-Alter): {len(weighted_ego_df)}")

Ego: Researcher_10
Alters found and mapped: 9
Total edges (Ego-Alter + Alter-Alter): 29


In [8]:
# --- 3. FINAL VISUALIZATION (SPACIOUS PHYSICS + HOVER + TEXT ALTERS) ---
net_ego = Network(height='750px', width='100%', directed=True, notebook=True)

# Edge colour per label; labels outside this map fall back to grey below.
color_map = {'Directive': '#e74c3c', 'Commissive': '#2ecc71', 'Social': '#3498db'}

# 1. Add nodes
nodes_in_subgraph = pd.unique(weighted_ego_df[['source', 'target']].values.ravel())

# Label of the first aggregated row in which each node is the target.
# Nodes that are never a target get no entry and a shorter hover text.
first_incoming_label = {}
for tgt_name, tie_label in zip(weighted_ego_df['target'], weighted_ego_df['oa_label']):
    first_incoming_label.setdefault(tgt_name, tie_label)

for node in nodes_in_subgraph:
    node_id = str(node)

    if node == ego_node:
        # EGO: filled circle, larger font.
        net_ego.add_node(
            node_id,
            label=node_id,
            title="ego",
            shape='circle',
            color='#ffcccb',
            widthConstraint=120,
            font={'size': 18, 'weight': 'bold'},
            mass=3  # heavier node, intended to keep the ego more central
        )
        continue

    if node in first_incoming_label:
        hover_label = f"Alter: {node} | Role: {first_incoming_label[node]}"
    else:
        hover_label = f"Alter: {node}"

    # ALTERS: text only (no circle), with the hover title attached.
    net_ego.add_node(
        node_id,
        label=node_id,
        title=hover_label,
        shape='text',
        font={'size': 18, 'color': '#2c3e50'}
    )

# 2. Add edges: width follows the aggregated weight, colour follows the label.
for edge in weighted_ego_df.itertuples(index=False):
    net_ego.add_edge(
        str(edge.source),
        str(edge.target),
        title=f"Type: {edge.oa_label} | Count: {edge.weight}",
        width=float(edge.weight),
        color=color_map.get(edge.oa_label, '#bdc3c7'),
        arrows={'to': {'enabled': True, 'scaleFactor': 0.5}},
        # A slight curvature keeps back-and-forth ties visually separate.
        smooth={'type': 'curvedCW', 'roundness': 0.15}
    )

# 3. Physics: tuned for space and legibility
net_ego.toggle_physics(True)
net_ego.set_options("""
var options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.2,
      "springLength": 250,
      "springConstant": 0.04
    },
    "maxVelocity": 50,
    "solver": "barnesHut",
    "timestep": 0.35,
    "stabilization": {"iterations": 150}
  },
  "interaction": {
    "hover": true,
    "tooltipDelay": 100,
    "navigationButtons": true
  }
}
""")

net_ego.show("newsgroup_ego_final.html")
IPython.display.IFrame(src="newsgroup_ego_final.html", width='100%', height='750px')

newsgroup_ego_final.html
