In [1]:
import networkx as nx
import spacy
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mutual_info_score
import numpy as np
from scipy.stats import pearsonr
import pandas as pd
from tqdm import tqdm
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/Supplementary data - responses and measures.csv')
df = df[~df['EmpathyQ_1'].isna()]
df = df[~df['Response'].isna()]

## Graph construction

## Discover patterns and concepts

In [4]:
def discover_empathy_words_tf_idf(df: pd.DataFrame, empathy_col: str, min_score: float=7.0):    
    high_empathy = df[df[empathy_col] >= min_score]['Response'].tolist()
    low_empathy = df[df[empathy_col] < min_score]['Response'].tolist()
    
    vectorizer = TfidfVectorizer(
        max_features=1000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2
    )
    
    all_responses = high_empathy + low_empathy
    tfidf_matrix = vectorizer.fit_transform(all_responses)
    feature_names = vectorizer.get_feature_names_out()
    
    high_tfidf = tfidf_matrix[:len(high_empathy)].mean(axis=0).A1
    low_tfidf = tfidf_matrix[len(high_empathy):].mean(axis=0).A1
    
    empathy_ratio = high_tfidf / (low_tfidf + 1e-8)
    
    top_indices = np.argsort(empathy_ratio)[-50:]
    empathy_words = [(feature_names[i], empathy_ratio[i]) for i in top_indices]
    
    return empathy_words

In [5]:
def discover_empathy_patterns_correlation(df):
    """Find patterns correlated with empathy scores"""
    
    vectorizer = TfidfVectorizer(
        max_features=500,
        stop_words='english',
        ngram_range=(1, 3)
    )
    
    tfidf_matrix = vectorizer.fit_transform(df['Response'])
    feature_names = vectorizer.get_feature_names_out()
    
    correlations = []
    for i, word in enumerate(feature_names):
        word_scores = tfidf_matrix[:, i].toarray().flatten()
        corr, p_value = pearsonr(word_scores, df['EmpathyQ_1'])
        
        if not np.isnan(corr) and p_value < 0.05:
            correlations.append((word, corr, p_value))
    
    correlations.sort(key=lambda x: abs(x[1]), reverse=True)
    return correlations[:30]

In [6]:
def discover_empathy_concepts_mutual_info(df, empathy_threshold=5.0):
    empathy_high = (df['EmpathyQ_1'] >= empathy_threshold).astype(int)
    
    vectorizer = TfidfVectorizer(
        max_features=300,
        stop_words='english',
        binary=True, 
        ngram_range=(1, 2)
    )
    
    feature_matrix = vectorizer.fit_transform(df['Response'])
    feature_names = vectorizer.get_feature_names_out()
    
    mi_scores = []
    for i in range(feature_matrix.shape[1]):
        feature_values = feature_matrix[:, i].toarray().flatten()
        mi_score = mutual_info_score(empathy_high, feature_values)
        mi_scores.append((feature_names[i], mi_score))
    mi_scores.sort(key=lambda x: x[1], reverse=True)
    return mi_scores[:25]

In [7]:
def discover_empathy_concepts_semantic(df, empathy_threshold=7.0):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    high_empathy_responses = df[df['EmpathyQ_1'] >= empathy_threshold]['Response'].tolist()
    low_empathy_responses = df[df['EmpathyQ_1'] < empathy_threshold]['Response'].tolist()
    
    high_embeddings = model.encode(high_empathy_responses)
    low_embeddings = model.encode(low_empathy_responses)
    high_centroid = np.mean(high_embeddings, axis=0)
    low_centroid = np.mean(low_embeddings, axis=0)
    
    all_words = []
    for response in df['Response']:
        words = response.lower().split()
        all_words.extend(words)
    
    unique_words = list(set(all_words))
    word_embeddings = model.encode(unique_words)
    
    similarities_high = cosine_similarity([high_centroid], word_embeddings)[0]
    similarities_low = cosine_similarity([low_centroid], word_embeddings)[0]
    
    # Words more similar to high empathy than low empathy
    empathy_diff = similarities_high - similarities_low
    
    word_scores = list(zip(unique_words, empathy_diff))
    word_scores.sort(key=lambda x: x[1], reverse=True)
    
    return word_scores[:30]

## Entity extraction

In [8]:
def extract_linguistic_patterns(doc):
    patterns = []
    
    # Question patterns
    if doc.text.strip().endswith('?'):
        patterns.append(('question', 1.0))
    
    # Apology patterns
    for token in doc:
        if token.lemma_ in ['sorry', 'apologize'] and token.pos_ in ['ADJ', 'VERB']:
            patterns.append(('apology_pattern', 1.0))
    
    # First person patterns
    first_person_count = sum(1 for token in doc if token.text.lower() in ['i', 'me', 'my'])
    if first_person_count > 0:
        patterns.append(('first_person', first_person_count / len(doc)))
    
    return patterns

In [9]:
def extract_empathy_entities(response_text, discovered_concepts):
    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(response_text)
    
    entities = {
        'empathy_concepts': [],
        'emotional_states': [],
        'linguistic_patterns': [],
        'named_entities': []
    }
    
    response_lower = response_text.lower()
    for concept, score in discovered_concepts:
        if concept in response_lower and score > 0.1:  # Threshold
            entities['empathy_concepts'].append((concept, score))
    
    # Extract other patterns
    entities['named_entities'] = [(ent.text, ent.label_) for ent in doc.ents]
    entities['linguistic_patterns'] = extract_linguistic_patterns(doc)
    
    return entities



# Build knowledge graph

In [10]:
def get_concept_coocurrence(graph, response_nodes):
    concept_cooccurrence = defaultdict(int)
    
    for response in response_nodes:
        concepts = [n for n in graph.neighbors(response) 
                   if graph.nodes[n]['type'] == 'empathy_concept']
        
        for i, c1 in enumerate(concepts):
            for c2 in concepts[i+1:]:
                concept_cooccurrence[(c1, c2)] += 1
    return concept_cooccurrence


In [11]:
def build_graph(df, top_concepts):
    graph = nx.Graph()
    
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        response_id = f"response_{idx}"
        empathy_score = row['EmpathyQ_1']
        
        graph.add_node(response_id, 
                  type='response',
                  empathy_score=empathy_score,
                  text=row['Response'],
                  cognitive=row.get('cognitive', 0),
                  affective=row.get('affective', 0),
                  motivational=row.get('motivational', 0))
        

        entities = extract_empathy_entities(row['Response'], top_concepts)
        
        # add concept nodes
        for concept, strength in entities['empathy_concepts']:
            concept_id = f"concept_{concept}"
            
            if not graph.has_node(concept_id):
                graph.add_node(concept_id, 
                          type='empathy_concept',
                          discovered_strength=dict(top_concepts).get(concept, 0))
            
            graph.add_edge(response_id, concept_id, 
                      weight=strength,
                      edge_type='contains_concept')
        
        # add linguistic pattern nodes
        for pattern, strength in entities['linguistic_patterns']:
            pattern_id = f"pattern_{pattern}"
            
            if not graph.has_node(pattern_id):
                graph.add_node(pattern_id, type='linguistic_pattern')
            
            graph.add_edge(response_id, pattern_id, 
                      weight=strength,
                      edge_type='uses_pattern')
    
    # add concept-concept relationships based on co-occurrence
    response_nodes = [n for n in graph.nodes() if graph.nodes[n]['type'] == 'response']
    concept_cooccurrence = get_concept_coocurrence(graph, response_nodes)
    
    for (c1, c2), cnt in concept_cooccurrence.items():
        if cnt >= 3:
            graph.add_edge(c1, c2, 
                      weight=cnt,
                      edge_type='co_occurs_with')
    
    print(f"Built knowledge graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
    return graph


In [12]:
def build_kg(df, empathy_col: str):
    """Build knowledge graph using automatically discovered concepts"""

    stat_empathy_words = discover_empathy_words_tf_idf(df, empathy_col)
    corr_patterns = discover_empathy_patterns_correlation(df)
    mi_concepts = discover_empathy_concepts_mutual_info(df)
    semantic_concepts = discover_empathy_concepts_semantic(df)

    # combination of the methods
    all_concepts = {}
    for word, ratio in stat_empathy_words:
        all_concepts[word] = all_concepts.get(word, 0) + ratio * 0.3

    for word, corr, _ in corr_patterns:
        all_concepts[word] = all_concepts.get(word, 0) + abs(corr) * 0.3

    for word, mi_score in mi_concepts:
        all_concepts[word] = all_concepts.get(word, 0) + mi_score * 0.2

    for word, sem_score in semantic_concepts:
        all_concepts[word] = all_concepts.get(word, 0) + max(0, sem_score) * 0.2

    sorted_concepts = sorted(all_concepts.items(), key=lambda x: x[1], reverse=True)
    top_concepts = sorted_concepts[:50]

    print(f"Top discovered empathy concepts: {[w for w, s in top_concepts[:10]]}")
    graph = build_graph(df, top_concepts)
    return graph, top_concepts


## Get graph

In [13]:
graph, discovered_concepts = build_kg(df, 'EmpathyQ_1')

print("Top discovered empathy indicators:")
for concept, score in discovered_concepts[:10]:
    print(f"  {concept}: {score:.3f}")

empathy_concepts = [n for n in graph.nodes() if graph.nodes[n]['type'] == 'empathy_concept']

Top discovered empathy concepts: ['absence', 'genuine', 'reflect', 'overcome', 'valuable', 'heartbreak', 'gesture', 'reflecting', 'presents', 'cousin']


100%|██████████| 2494/2494 [14:34<00:00,  2.85it/s]

Built knowledge graph with 2542 nodes and 6507 edges
Top discovered empathy indicators:
  absence: 115172.504
  genuine: 90643.850
  reflect: 85480.893
  overcome: 80632.178
  valuable: 77706.262
  heartbreak: 73059.487
  gesture: 66139.298
  reflecting: 63591.819
  presents: 61794.551
  cousin: 60979.989





# sample x responses from the graph for an easier presentation

In [14]:
def get_connected_nodes(graph, response_nodes):
    connected_nodes = set(response_nodes)
    for response_node in response_nodes:
        if response_node in graph:
            neighbors = list(graph.neighbors(response_node))
            connected_nodes.update(neighbors)
    return connected_nodes

In [15]:
levels = {
        'Very High (8-9)': [8, 9],
        'High (6-7)': [6, 7], 
        'Medium (4-5)': [4, 5],
        'Low (2-3)': [2, 3],
        'Very Low (0-1)': [0, 1]
}

level_colors = {
        'Very High (8-9)': '#8B0000',
        'High (6-7)': '#DC143C', 
        'Medium (4-5)': '#FF4500',
        'Low (2-3)': '#FFA500',
        'Very Low (0-1)': '#FFD700'
    }

In [16]:
def edges_by_levels(sampled_graph, df):
    edges_and_levels = {level: [] for level in level_colors.keys()}
    for edge in sampled_graph.edges():
        source, target = edge
        
        # which empathy level the source response belongs to
        if 'response_' in source:
            response_idx = int(source.split('_')[1])
            empathy_score = df.loc[response_idx, 'EmpathyQ_1']
            
            if empathy_score >= 8:
                level = 'Very High (8-9)'
            elif empathy_score >= 6:
                level = 'High (6-7)'
            elif empathy_score >= 4:
                level = 'Medium (4-5)'
            elif empathy_score >= 2:
                level = 'Low (2-3)'
            else:
                level = 'Very Low (0-1)'
                
            edges_and_levels[level].append(edge)
    return edges_and_levels

In [17]:
def create_sampled_empathy_graph(df, graph, samples_per_level=5, empathy_col='EmpathyQ_1'):
    empathy_scores = df[empathy_col].fillna(0)
    sampled_indices = []
    level_info = {}
        
    for level_name, (min_score, max_score) in levels.items():
        level_mask = (empathy_scores >= min_score) & (empathy_scores <= max_score)
        level_indices = df[level_mask].index.tolist()
        
        if len(level_indices) == 0:
            print(f"  {level_name}: No responses found")
            continue
            
        sample_size = min(samples_per_level, len(level_indices))
        if sample_size > 0:
            if len(level_indices) <= samples_per_level:
                selected_response = level_indices
            else:
                level_df = df.loc[level_indices].sort_values(empathy_col)
                step = len(level_df) // sample_size
                selected_response = []
                for i in range(sample_size):
                    idx = min(i * step, len(level_df) - 1)
                    selected_response.append(level_df.iloc[idx].name)
            
            sampled_indices.extend(selected_response)
            level_info[level_name] = {
                'indices': selected_response,
                'scores': [empathy_scores[idx] for idx in selected_response],
                'count': len(level_indices),
                'sampled': len(selected_response)
            }
            
            print(f"{level_name}: {len(selected_response)}/{len(level_indices)} sampled")
    
    
    response_nodes = [f"response_{idx}" for idx in sampled_indices]
    connected_nodes = get_connected_nodes(graph, response_nodes)
    sampled_graph = graph.subgraph(connected_nodes)
    print(f"Sampled graph: {sampled_graph.number_of_nodes()} nodes, {sampled_graph.number_of_edges()} edges")
    return sampled_graph, level_info, sampled_indices


In [18]:
def analyze_sampled_patterns(G_sampled, level_info):    
    concept_by_level = defaultdict(lambda: defaultdict(int))
    
    for level_name, info in level_info.items():
        if not info['indices']:
            continue
            
        level_response_nodes = [f"response_{idx}" for idx in info['indices']]
        
        for response_node in level_response_nodes:
            if response_node in G_sampled:
                neighbors = [n for n in G_sampled.neighbors(response_node) if 'concept_' in str(n)]
                for concept in neighbors:
                    clean_concept = concept.replace('concept_', '')
                    concept_by_level[level_name][clean_concept] += 1
    
    for level_name, concepts in concept_by_level.items():
        if concepts:
            print(f"\n {level_name}:")
            top_concepts = sorted(concepts.items(), key=lambda x: x[1], reverse=True)[:5]
            for concept, count in top_concepts:
                print(f"   • {concept}: {count} responses")
    
    all_concepts = defaultdict(int)
    for level_concepts in concept_by_level.values():
        for concept, count in level_concepts.items():
            all_concepts[concept] += count
    
    print(f"\nMOST POPULAR CONCEPTS OVERALL:")
    top_overall = sorted(all_concepts.items(), key=lambda x: x[1], reverse=True)[:8]
    for concept, count in top_overall:
        print(f"{concept}: {count} total uses")

def run_sampled_empathy_analysis(df, graph, samples_per_level=5):  
    sampled_graph, level_info, sampled_indices = create_sampled_empathy_graph(
        df, graph, samples_per_level=samples_per_level
    )
    
    analyze_sampled_patterns(sampled_graph, level_info)    
    return sampled_graph, level_info


In [19]:
sampled_graph, level_info = run_sampled_empathy_analysis(df, graph, samples_per_level=25)

Very High (8-9): 25/1687 sampled
High (6-7): 25/593 sampled
Medium (4-5): 25/117 sampled
Low (2-3): 25/66 sampled
Very Low (0-1): 25/31 sampled
Sampled graph: 156 nodes, 374 edges

 Very High (8-9):
   • stand: 8 responses
   • forget: 2 responses
   • act: 2 responses
   • feels: 2 responses
   • heartbreak: 2 responses

 High (6-7):
   • stand: 8 responses
   • genuine: 3 responses
   • act: 2 responses
   • perfect: 2 responses
   • reflect: 2 responses

 Medium (4-5):
   • stand: 12 responses
   • act: 4 responses
   • genuine: 4 responses
   • unexpectedly: 1 responses
   • milestone: 1 responses

 Low (2-3):
   • stand: 10 responses
   • act: 4 responses
   • heartbreak: 2 responses
   • forget: 2 responses
   • perfect: 1 responses

 Very Low (0-1):
   • stand: 6 responses
   • act: 2 responses
   • confidence: 1 responses
   • perfect: 1 responses
   • seek: 1 responses

MOST POPULAR CONCEPTS OVERALL:
stand: 44 total uses
act: 14 total uses
genuine: 9 total uses
heartbreak: 5 t

In [20]:
def create_colored_edges(G_sampled, df, level_info, pos):
    edge_traces = []
    edges_by_level = edges_by_levels(G_sampled, df)
    for level_name, edges in edges_by_level.items():
        if not edges:
            continue
            
        edge_x, edge_y = [], []
        for edge in edges:
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_x.extend([x0, x1, None])
            edge_y.extend([y0, y1, None])
        
        edge_traces.append(go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=1.5, color=level_colors[level_name]),
            hoverinfo='none',
            mode='lines',
            showlegend=False,
            name=f'{level_name}_edges'
        ))
    
    return edge_traces

In [21]:
def color_size_concept_nodes(G_sampled, concept_nodes):
    concept_sizes = []
    concept_colors = []
        
    for node in concept_nodes:
        discovery_strength = G_sampled.nodes[node].get('discovered_strength', 0)
        connectivity = G_sampled.degree(node)
            
        normalized_strength = min(discovery_strength / 10000, 1.0)
        size = max(normalized_strength * 30 + connectivity * 5, 15)
        concept_sizes.append(size)
            
        intensity = min(connectivity / 5, 1.0)
        concept_colors.append(intensity)
    return concept_sizes,concept_colors


def create_txt_concept_nodes(sampled_graph, df, concept_nodes):
    concept_hover = []
        
    for node in concept_nodes:
        clean_name = node.replace('concept_', '').title()
        discovery_strength = sampled_graph.nodes[node].get('discovered_strength', 0)
        connectivity = sampled_graph.degree(node)
            
        connected_responses = [n for n in sampled_graph.neighbors(node) if 'response_' in str(n)]
        levels_using = set()
            
        for response_node in connected_responses:
            idx = int(response_node.split('_')[1])
            empathy_score = df.loc[idx, 'EmpathyQ_1']
                
            if empathy_score >= 8:
                levels_using.add('Very High')
            elif empathy_score >= 6:
                levels_using.add('High')
            elif empathy_score >= 4:
                levels_using.add('Medium')
            elif empathy_score >= 2:
                levels_using.add('Low')
            else:
                levels_using.add('Very Low')
            
        txt = f"<b>{clean_name}</b><br>"
        txt += f"Discovery Strength: {discovery_strength:.0f}<br>"
        txt += f"Connected to {connectivity} responses<br>"
        txt += f"Used in levels: {', '.join(sorted(levels_using))}"
            
        concept_hover.append(txt)
    return concept_hover


In [22]:
def pattern_node_txt(sampled_graph, pattern_nodes):
    pattern_hover = []
    for node in pattern_nodes:
        clean_name = node.replace('pattern_', '').replace('_', ' ').title()
        connectivity = sampled_graph.degree(node)
            
        txt = f"<b>{clean_name}</b><br>"
        txt += f"Type: Linguistic Pattern<br>"
        txt += f"Used in {connectivity} responses<br>"
        txt += f"Indicates: {clean_name.lower()} usage in empathetic responses"
            
        pattern_hover.append(txt)
    return pattern_hover

In [23]:
level_y_positions = {
            'Very High (8-9)': 4,
            'High (6-7)': 2,
            'Medium (4-5)': 0,
            'Low (2-3)': -2,
            'Very Low (0-1)': -4
        }

In [38]:
def create_graph_vis(sampled_graph, df, level_info):
    def create_level_based_layout(G, level_info):
        pos = {}       
        for level_name, info in level_info.items():
            if not info['indices']:
                continue
                
            y_pos = level_y_positions[level_name]
            level_response_nodes = [f"response_{idx}" for idx in info['indices']]
            level_nodes_in_graph = [node for node in level_response_nodes if node in G]
            
            for i, node in enumerate(level_nodes_in_graph):
                x_pos = (i - len(level_nodes_in_graph)/2) * 3
                pos[node] = (x_pos, y_pos)
        
        concept_nodes = [n for n in G.nodes() if 'concept_' in str(n)]
        if concept_nodes:
            concept_subgraph = G.subgraph(concept_nodes)
            concept_pos = nx.spring_layout(concept_subgraph, k=2, iterations=50)
            for node, (x, y) in concept_pos.items():
                pos[node] = (x * 6, y * 3)
        
        pattern_nodes = [n for n in G.nodes() if 'pattern_' in str(n)]
        for i, node in enumerate(pattern_nodes):
            pos[node] = (-10, i * 2 - len(pattern_nodes))
            
        return pos
    
    pos = create_level_based_layout(sampled_graph, level_info)
    
    traces = []
    
    colored_edge_traces = create_colored_edges(sampled_graph, df, level_info, pos)
    traces.extend(colored_edge_traces)
    
    for level_name, info in level_info.items():
        if not info['indices']:
            continue
            
        level_response_nodes = [f"response_{idx}" for idx in info['indices']]
        level_nodes_in_graph = [node for node in level_response_nodes if node in sampled_graph]
        
        if not level_nodes_in_graph:
            continue
        
        response_x = [pos[node][0] for node in level_nodes_in_graph]
        response_y = [pos[node][1] for node in level_nodes_in_graph]
        
        response_texts = []
        response_hover = []
        response_sizes = []
        
        for node in level_nodes_in_graph:
            idx = int(node.split('_')[1])
            row = df.loc[idx]
            
            response_texts.append(f"R{idx}")
            
            connected_concepts = [n for n in sampled_graph.neighbors(node) if 'concept_' in str(n)]
            concept_names = [n.replace('concept_', '') for n in connected_concepts]
            
            txt = f"<b>Response {idx}</b><br>"
            txt += f"Empathy Level: {level_name}<br>"
            txt += f"Empathy Score: {row.get('EmpathyQ_1', 0):.2f}<br>"
                        
            txt += f"Connected Concepts: {len(connected_concepts)}<br>"
            if concept_names:
                txt += f"Key Words: {', '.join(concept_names[:3])}<br>"
            
            response_text = str(row['Response'])[:80] + "..." if len(str(row['Response'])) > 80 else str(row['Response'])
            txt += f"<br>Text: <i>{response_text}</i>"
            
            response_hover.append(txt)
            response_sizes.append(max(row.get('EmpathyQ_1', 5) * 4, 20))
        
        traces.append(go.Scatter(
            x=response_x, y=response_y,
            mode='markers+text',
            marker=dict(
                size=response_sizes,
                color=level_colors[level_name],
                symbol='circle',  # Explicitly set symbol
                line=dict(width=3, color='white'),
                opacity=0.9
            ),
            text=response_texts,
            textposition="middle center",
            textfont=dict(size=10, color='white', family="Arial Black"),
            hovertext=response_hover,
            hoverinfo='text',
            name=f'{level_name} Responses ({len(level_nodes_in_graph)})',
            legendgroup="responses",
            showlegend=True
        ))
    
    concept_nodes = [n for n in sampled_graph.nodes() if 'concept_' in str(n)]
    if concept_nodes:
        concept_x = [pos[node][0] for node in concept_nodes]
        concept_y = [pos[node][1] for node in concept_nodes]
        
        concept_sizes, concept_colors = color_size_concept_nodes(sampled_graph, concept_nodes)
        
        concept_text = [node.replace('concept_', '') for node in concept_nodes]
        concept_hover = create_txt_concept_nodes(sampled_graph, df, concept_nodes)
        
        traces.append(go.Scatter(
            x=concept_x, y=concept_y,
            mode='markers+text',
            marker=dict(
                size=concept_sizes,
                color=concept_colors,
                colorscale='Viridis',
                symbol='diamond',
                line=dict(width=2, color='white'),
                opacity=0.8,
                showscale=False
            ),
            text=concept_text,
            textposition="top center",
            textfont=dict(size=11, color='darkslategray', family="Arial"),
            hovertext=concept_hover,
            hoverinfo='text',
            name='Empathy Concepts (discovered words)',
            legendgroup="concepts",
            showlegend=True
        ))
    
    pattern_nodes = [n for n in sampled_graph.nodes() if 'pattern_' in str(n)]
    if pattern_nodes:
        pattern_x = [pos[node][0] for node in pattern_nodes]
        pattern_y = [pos[node][1] for node in pattern_nodes]
        pattern_text = [node.replace('pattern_', '').replace('_', ' ').title() for node in pattern_nodes]
        
        pattern_hover = pattern_node_txt(sampled_graph, pattern_nodes)
        
        traces.append(go.Scatter(
            x=pattern_x, y=pattern_y,
            mode='markers+text',
            marker=dict(
                size=30,
                color='cornflowerblue',
                symbol='square',  # Explicitly set symbol
                line=dict(width=3, color='white'),
                opacity=0.8
            ),
            text=pattern_text,
            textposition="middle right",
            textfont=dict(size=12, color='navy', family="Arial Bold"),
            hovertext=pattern_hover,
            hoverinfo='text',
            name='communication styles (linguistic patterns)',
            legendgroup="patterns",
            showlegend=True
        ))
        
    
    fig = go.Figure(
        data=traces,
        layout=go.Layout(
            title=dict(
                text='Responses and Empathy levels Knowledge Graph<br>',
                x=0.5,
                font=dict(size=22, color='darkslategray')
            ),
            showlegend=True,
            legend=dict(
                x=0.02, y=0.98,
                bgcolor="rgba(255,255,255,0.95)",
                bordercolor="darkgray",
                borderwidth=2,
                font=dict(size=12),
                title=dict(
                    text="<b>GRAPH ELEMENTS</b><br><i>Click to show/hide</i>",
                    font=dict(size=14, color="darkblue")
                ),
                grouptitlefont=dict(size=13)
            ),
            hovermode='closest',
            margin=dict(b=120, l=60, r=220, t=120),
            annotations=[
                dict(
                    text="<b>SHAPE GUIDE</b><br>" +
                         "● <b>Circles</b> = responses (colored by empathy level)<br>" +
                         "◆ <b>Diamonds</b> = empathy concepts (discovered words)<br>" +
                         "■ <b>Squares</b> = communication styles (linguistic patterns)<br>" +
                         "<br><b>SIZE MEANING</b><br>" +
                         "Larger = Higher empathy score or stronger discovery",
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=1.02, y=0.5,
                    xanchor='left', yanchor='middle',
                    font=dict(color="darkblue", size=11),
                    bgcolor="rgba(240,248,255,0.9)",
                    bordercolor="steelblue",
                    borderwidth=2,
                    align="left"
                ),
            ],
            xaxis=dict(
                showgrid=True, 
                gridwidth=1, 
                gridcolor='rgba(128,128,128,0.2)', 
                zeroline=False, 
                showticklabels=False,
                title=""
            ),
            yaxis=dict(
                showgrid=True, 
                gridwidth=1, 
                gridcolor='rgba(128,128,128,0.2)', 
                zeroline=False, 
                showticklabels=False,
                title=""
            ),
            plot_bgcolor='rgba(248,249,250,0.9)',
            paper_bgcolor='white',
            width=1700,
            height=1000
        )
    )
    
    return fig

graph_fig = create_graph_vis(sampled_graph, df, level_info)
graph_fig.show()

In [39]:
import plotly.io as pio
pio.write_html(graph_fig, file='../index.html', auto_open=False)
