In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm
from langchain_openai import AzureOpenAIEmbeddings
import json
import numpy as np
import torch
import time
import random
import ast
import matplotlib.pyplot as plt
import os

Let's fetch the dataframes.

In [3]:
from data_extractor import Extractor

extractor_instance = Extractor()
extractor_instance.extract(location="./data/results/df_combined.csv")  # This saves the CSV

# Load the saved CSV
df_combined = pd.read_csv("./data/results/df_combined.csv")
print("Total COLLABORATION relationships:", len(df_combined[df_combined['relationship type'] == 'COLLABORATION']))
print("\nSample COLLABORATION relationships:")
print(df_combined[df_combined['relationship type'] == 'COLLABORATION'].head())
# Now filter for innovations only for similarity comparison
innovations_only = df_combined[df_combined["source type"] == "Innovation"].copy()
print(f"Total relationships: {len(df_combined)}")
print(f"Innovation relationships only: {len(innovations_only)}")

Total COLLABORATION relationships: 6751

Sample COLLABORATION relationships:
      Document number      VAT id  \
39                 28  FI08932048   
1504                4  FI01111693   
1505                5  FI27256903   
1506                5  FI27256903   
1507                5  FI27256903   

                               relationship description   source id  \
39    Both Resq Club and Lunchie are digital platfor...  FI27254203   
1504  VTT Technical Research Centre of Finland and M...  FI26473754   
1505  VTT Technical Research Centre of Finland and N...  FI26473754   
1506  VTT Technical Research Centre of Finland and S...  FI26473754   
1507  Natural Resources Institute Finland (Luke) and...    temp_874   

       source type                           source english_id  \
39      Innovation                                   Resq Club   
1504  Organization    VTT Technical Research Centre of Finland   
1505  Organization    VTT Technical Research Centre of Finland   
1506  Org

In [6]:
from embedding_generator import EmbeddingGenerator, OpenAIEmbeddingProvider
import ast
import os

# Configuration
METHOD = "openai"
MODEL_KEY = "gpt-4.1-mini"

# Create temporary file for innovations only
innovations_file = "data/results/innovations_for_similarity.csv"
innovations_only.to_csv(innovations_file, index=False)

# Generate embeddings for innovations only
output_file = "data/results/innovations_embeddings.csv"
provider = OpenAIEmbeddingProvider(MODEL_KEY)
generator = EmbeddingGenerator(provider)

if os.path.exists(output_file):
    innovations_with_embeddings = pd.read_csv(output_file)
    print(f"Loaded {len(innovations_with_embeddings)} innovation embeddings")
else:
    innovations_with_embeddings = generator.generate_embeddings(innovations_file, output_file)

# Convert embeddings from string to list
innovations_with_embeddings["embedding"] = innovations_with_embeddings["embedding"].apply(ast.literal_eval)

2025-05-31 18:32:21,614 - INFO - Initialized OpenAI embedding model with key: gpt-4.1-mini


Loaded 6767 innovation embeddings


In [12]:
result_df.head()

Unnamed: 0.2,Unnamed: 0.1,Document number,VAT id,relationship description,source id,source type,source english_id,source description,relationship type,target id,...,target english_id,target description,Link Source Text,Source Text,text_to_compare,Unnamed: 0,Source Company,embedding,embedding_provider,embedding_dimension
0,0,VTT0,FI10292588,"FiR 1 nuclear research reactor was developed, ...",FiR 1,Innovation,FiR 1,FiR 1 is a Triga-type nuclear research reactor...,DEVELOPED_BY,FI26473754,...,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...,FiR 1 - FiR 1 is a Triga-type nuclear research...,,,"[-0.018910497426986694, 0.017533520236611366, ...",openai_gpt-4.1-mini,3072
1,1,VTT0,FI10292588,Centre for Nuclear Safety is being developed a...,Centre for Nuclear Safety,Innovation,Centre for Nuclear Safety,A modern research facility under construction ...,DEVELOPED_BY,FI26473754,...,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...,Centre for Nuclear Safety - A modern research ...,,,"[-0.009751166217029095, 0.006150579079985619, ...",openai_gpt-4.1-mini,3072
2,2,VTT3,FI08932048,The innovation approach 'Beyond the obvious' i...,Beyond the obvious,Innovation,Beyond the obvious,An innovation approach promising to provide so...,DEVELOPED_BY,FI26473754,...,VTT Technical Research Centre of Finland Ltd,A visionary research and innovation partner fo...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...,Beyond the obvious - An innovation approach pr...,,,"[-0.02210899069905281, -0.002120873425155878, ...",openai_gpt-4.1-mini,3072
3,3,VTT4,FI01111693,Data-Driven Bioeconomy project is developed by...,Data-Driven Bioeconomy project,Innovation,Data-Driven Bioeconomy project,An innovation using Big Data for sustainable u...,DEVELOPED_BY,FI26473754,...,VTT Technical Research Centre of Finland,A Finnish research and innovation partner work...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...,Data-Driven Bioeconomy project - An innovation...,,,"[-0.015033945441246033, 0.031899139285087585, ...",openai_gpt-4.1-mini,3072
4,5,VTT5,FI27256903,The innovation of renewable glue resins from s...,Alternatives to toxic phenol compounds used in...,Innovation,Alternatives to toxic phenol compounds used in...,A novel industrial scale method and products f...,DEVELOPED_BY,FI26473754,...,VTT Technical Research Centre of Finland,A leading research institution in Finland deve...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...,Alternatives to toxic phenol compounds used in...,,,"[-0.028945280238986015, -0.007764079142361879,...",openai_gpt-4.1-mini,3072


In [7]:
import torch
import numpy as np
from sentence_transformers import util
import time
from tqdm import tqdm

# Remove duplicates and prepare for similarity
innovations_dedup = innovations_with_embeddings.drop_duplicates(subset=["source description"]).reset_index(drop=True)

combined_embeddings = innovations_dedup["embedding"].tolist()
combined_texts = innovations_dedup["text_to_compare"].tolist()

# Compute similarity matrix (your existing code)
threshold = 0.80
embeddings_tensor = torch.stack([torch.tensor(e) for e in combined_embeddings])
cos_sim_matrix = util.cos_sim(embeddings_tensor, embeddings_tensor)

# Get similar pairs above threshold
triu_indices = torch.triu_indices(cos_sim_matrix.size(0), cos_sim_matrix.size(1), offset=1)
sim_scores = cos_sim_matrix[triu_indices[0], triu_indices[1]]
mask = sim_scores > threshold

i_indices = triu_indices[0][mask].tolist()
j_indices = triu_indices[1][mask].tolist()
scores = sim_scores[mask].tolist()

similar_pairs = [(i, j, score) for i, j, score in zip(i_indices, j_indices, scores)]
print(f"Found {len(similar_pairs)} similar pairs above threshold {threshold}")

Found 4496 similar pairs above threshold 0.8


In [8]:
def group_similar_pairs(similar_pairs):
    """Group pairs into connected components"""
    from collections import defaultdict
    
    graph = defaultdict(set)
    for i, j, score in similar_pairs:
        graph[i].add(j)
        graph[j].add(i)
    
    visited = set()
    groups = []
    
    def dfs(node, current_group):
        if node in visited:
            return
        visited.add(node)
        current_group.add(node)
        for neighbor in graph[node]:
            dfs(neighbor, current_group)
    
    for node in graph:
        if node not in visited:
            current_group = set()
            dfs(node, current_group)
            if current_group:
                groups.append(current_group)
    
    return groups

# Group the similar innovations
innovation_groups = group_similar_pairs(similar_pairs)
print(f"Found {len(innovation_groups)} groups of similar innovations")

# Show sample groups
for i, group in enumerate(innovation_groups[:3]):
    print(f"\nGroup {i+1} (size: {len(group)}):")
    for idx in sorted(group):
        print(f"  {innovations_dedup.iloc[idx]['source english_id']}")

Found 632 groups of similar innovations

Group 1 (size: 12):
  FiR 1
  FiR 1 -tutkimusreaktorin käytöstä poisto
  FiR1-ydinreaktorin purkuprojekti
  FiR1-ydinreaktori
  FiR 1 -tutkimusreaktori
  FiR1 reactor decommissioning project
  First nuclear decommissioning project in Finland at FiR1 reactor
  FiR1 nuclear reactor decommissioning
  FiR1 research nuclear reactor decommissioning
  FiR 1 -tutkimusreaktori
  FiR1 research nuclear reactor decommissioning project
  decommissioning the FiR1 research nuclear reactor

Group 2 (size: 2):
  AARRE project
  AARRE-projekti

Group 3 (size: 6):
  bio-based packaging solution made of cellulose
  bio-based packaging solution
  biodegradable packaging materials based on a fibrous web
  transparent cellulose film
  transparent cellulose film
  Transparent cellulose film by VTT


In [None]:
def get_all_organizations_for_innovation_group(group_indices, innovations_dedup, df_combined):
    """Get all organizations involved with an innovation group from the full dataset (refined)"""
    
    # Get all innovation names AND IDs in this group for more robust matching
    group_innovation_source_ids = set()
    group_innovation_english_ids = set()
    for idx in group_indices:
        # Assuming 'source id' in innovations_dedup refers to the innovation's own ID
        group_innovation_source_ids.add(innovations_dedup.iloc[idx]['source id']) 
        group_innovation_english_ids.add(innovations_dedup.iloc[idx]['source english_id'])
    
    # --- CHANGE 1: Broaden relevant document scope ---
    # Get document numbers where these innovations appear as EITHER source or target
    doc_numbers_source_mention = df_combined[
        df_combined['source id'].isin(group_innovation_source_ids) | \
        df_combined['source english_id'].isin(group_innovation_english_ids)
    ]['Document number'].unique()

    doc_numbers_target_mention = df_combined[
        df_combined['target id'].isin(group_innovation_source_ids) | \
        df_combined['target english_id'].isin(group_innovation_english_ids)
    ]['Document number'].unique()
    
    relevant_doc_numbers = set(list(doc_numbers_source_mention) + list(doc_numbers_target_mention))
    # --- END CHANGE 1 ---

    # Find ALL relationships from these RELEVANT documents
    relationships_in_relevant_docs = df_combined[
        df_combined['Document number'].isin(relevant_doc_numbers)
    ].copy() # Use .copy() to avoid SettingWithCopyWarning
    
    organizations = {}
    
    # --- CHANGE 2: Helper function to add/update orgs, checking type and handling NaN ---
    def add_or_update_org(org_id, org_name, org_type, role_flag_key, mention_info):
        if org_type != 'Organization' or pd.isna(org_id): # CRITICAL: Only process actual organizations with IDs
            # print(f"DEBUG: Skipping non-org or NaN ID: {org_name} ({org_id}), Type: {org_type}")
            return

        name_to_use = org_name
        if org_id in organizations: # If org already exists
            current_name = organizations[org_id]['name']
            # Prefer new name if current is NaN/placeholder, or if new is not 'temp_' and current is
            if pd.isna(current_name) or str(current_name).startswith("Org_ID_") or \
               (not pd.isna(org_name) and not str(org_name).startswith("temp_") and str(current_name).startswith("temp_")):
                if not pd.isna(org_name):
                    name_to_use = org_name
                else: # both current and new are bad, keep current if not NaN
                    name_to_use = current_name if not pd.isna(current_name) else f"Org_ID_{org_id}"
            else: # current name is good, keep it
                name_to_use = current_name
        elif pd.isna(name_to_use): # New org, but name is NaN
             name_to_use = f"Org_ID_{org_id}"


        if org_id not in organizations:
            organizations[org_id] = {
                'name': name_to_use,
                'developed_by': False,
                'collaboration': False,
                'mentions': []
            }
        elif pd.isna(organizations[org_id]['name']) and not pd.isna(name_to_use): # Update if existing name was NaN
            organizations[org_id]['name'] = name_to_use
        elif not pd.isna(name_to_use) and not str(name_to_use).startswith("temp_") and \
             str(organizations[org_id]['name']).startswith("temp_"): # Prefer non-temp name
            organizations[org_id]['name'] = name_to_use


        organizations[org_id][role_flag_key] = True
        organizations[org_id]['mentions'].append(mention_info)
    # --- END CHANGE 2 ---

    # Store IDs of organizations that are developers of this group's innovations
    developer_org_ids_for_this_group = set()

    # First pass: Identify developers and direct Org-Innovation collaborations
    for _, row in relationships_in_relevant_docs.iterrows():
        is_source_innovation_in_group = (
            (row['source id'] in group_innovation_source_ids or row['source english_id'] in group_innovation_english_ids) and
            row['source type'] == 'Innovation'
        )
        is_target_innovation_in_group = (
            (row['target id'] in group_innovation_source_ids or row['target english_id'] in group_innovation_english_ids) and
            row['target type'] == 'Innovation'
        )

        # DEVELOPED_BY: Innovation (from group) -> Organization (developer)
        if row['relationship type'] == 'DEVELOPED_BY' and is_source_innovation_in_group:
            mention = {
                'innovation_name': row['source english_id'],
                'relationship_type': 'DEVELOPED_BY',
                'document_number': row['Document number']
            }
            add_or_update_org(row['target id'], row['target english_id'], row['target type'], 'developed_by', mention)
            if row['target type'] == 'Organization' and not pd.isna(row['target id']):
                 developer_org_ids_for_this_group.add(row['target id'])
        
        # COLLABORATION: Innovation (from group) --COLLABORATION--> Organization
        elif row['relationship type'] == 'COLLABORATION' and is_source_innovation_in_group and row['target type'] == 'Organization':
            mention = {
                'innovation_name': row['source english_id'], # The innovation
                'relationship_type': 'COLLABORATION_WITH_INNOVATION',
                'document_number': row['Document number'],
                'other_entity_in_collab': row['target english_id']
            }
            add_or_update_org(row['target id'], row['target english_id'], row['target type'], 'collaboration', mention)

        # COLLABORATION: Organization --COLLABORATION--> Innovation (from group)
        elif row['relationship type'] == 'COLLABORATION' and row['source type'] == 'Organization' and is_target_innovation_in_group:
            mention = {
                'innovation_name': row['target english_id'], # The innovation
                'relationship_type': 'COLLABORATION_WITH_INNOVATION',
                'document_number': row['Document number'],
                'other_entity_in_collab': row['source english_id']
            }
            add_or_update_org(row['source id'], row['source english_id'], row['source type'], 'collaboration', mention)
            
    # Second pass for Org-Org collaborations, only if one is a known developer of THIS group
    for _, row in relationships_in_relevant_docs.iterrows():
        if row['relationship type'] == 'COLLABORATION' and \
           row['source type'] == 'Organization' and row['target type'] == 'Organization':
            
            source_is_dev_for_group = row['source id'] in developer_org_ids_for_this_group
            target_is_dev_for_group = row['target id'] in developer_org_ids_for_this_group

            # If Org1 (developer for this group) collaborates with Org2, Org2 is a collaborator for the group.
            if source_is_dev_for_group:
                mention = {
                    'innovation_context': f"Via dev: {row['source english_id']}",
                    'relationship_type': 'COLLABORATION_WITH_DEVELOPER',
                    'document_number': row['Document number'],
                    'other_entity_in_collab': row['source english_id'] # The developer
                }
                add_or_update_org(row['target id'], row['target english_id'], row['target type'], 'collaboration', mention)
            
            # If Org2 (developer for this group) collaborates with Org1, Org1 is a collaborator for the group.
            if target_is_dev_for_group:
                mention = {
                    'innovation_context': f"Via dev: {row['target english_id']}",
                    'relationship_type': 'COLLABORATION_WITH_DEVELOPER',
                    'document_number': row['Document number'],
                    'other_entity_in_collab': row['target english_id'] # The developer
                }
                add_or_update_org(row['source id'], row['source english_id'], row['source type'], 'collaboration', mention)
    
    return organizations

def create_unified_innovations(innovation_groups, innovations_dedup, df_combined):
    """Create final unified innovation records"""
    
    unified_innovations = []
    
    for group_idx, group in enumerate(innovation_groups):
        if len(group) == 1:  # Single innovation, not a duplicate group
            continue
            
        # Get canonical innovation (longest description)
        group_list = list(group)
        canonical_idx = max(group_list, 
                          key=lambda x: len(innovations_dedup.iloc[x]['source description'] or ''))
        canonical = innovations_dedup.iloc[canonical_idx]
        
        # Get all organizations involved with this innovation group
        organizations = get_all_organizations_for_innovation_group(group, innovations_dedup, df_combined)
        
# ADD THE DEBUG CODE RIGHT HERE:
        #print(f"\n🔍 DEBUG: Processing group {group_idx}")
        vtt_org = organizations.get('FI26473754', {})
       # print(f"🔍 DEBUG: VTT org data: {vtt_org}")
        vtt_roles = []
        if vtt_org.get('developed_by', False):
            vtt_roles.append('developer')
            #print(f"🔍 DEBUG: Added VTT as developer")
        if vtt_org.get('collaboration', False):
            vtt_roles.append('collaborator')
            #print(f"🔍 DEBUG: Added VTT as collaborator")
       # print(f"🔍 DEBUG: Final VTT roles: {vtt_roles}")

        # Separate developers and collaborators
        developers = [org for org_id, org in organizations.items() if org['developed_by']]
        collaborators = [org for org_id, org in organizations.items() if org['collaboration']]
        
        # FIXED: Determine VTT's role(s) properly
        vtt_org = organizations.get('FI26473754', {})
        vtt_roles = []
        if vtt_org.get('developed_by', False):
            vtt_roles.append('developer')
        if vtt_org.get('collaboration', False):
            vtt_roles.append('collaborator')
        
        unified = {
            'group_id': f"innovation_group_{group_idx}",
            'canonical_name': canonical['source english_id'] + " **TEST_MODIFIED**",  # Debug text
            'canonical_description': canonical['source description'],
            'aliases': [innovations_dedup.iloc[idx]['source english_id'] for idx in group],
            'total_mentions': len(group),
            'developers': developers,
            'collaborators': collaborators,
            'vtt_role': '+'.join(vtt_roles) if vtt_roles else 'unknown',  # FIXED
            'document_numbers': list(set([mention['document_number'] for org in organizations.values() for mention in org['mentions']])),
            #'debug_test': "FUNCTION_WAS_EXECUTED"  # Debug field
        }
        
        unified_innovations.append(unified)
    
    return unified_innovations

# Create the final unified innovations
unified_innovations = create_unified_innovations(innovation_groups, innovations_dedup, df_combined)


print(f"Created {len(unified_innovations)} unified innovation records")

Created 632 unified innovation records


In [22]:
import json

# Save results
with open('data/results/unified_innovations_3.json', 'w') as f:
    json.dump(unified_innovations, f, indent=2)

# Display sample results
print("\nSample Unified Innovations:")
for innovation in unified_innovations[:3]:
    print(f"\n{'='*60}")
    print(f"Innovation: {innovation['canonical_name']}")
    print(f"Aliases: {innovation['aliases']}")
    print(f"VTT Role: {innovation['vtt_role']}")
    print(f"Developers: {[dev['name'] for dev in innovation['developers']]}")
    print(f"Collaborators: {[collab['name'] for collab in innovation['collaborators']]}")
    print(f"Total mentions: {innovation['total_mentions']}")

# Create final clean dataset
final_df = pd.DataFrame(unified_innovations)
final_df.to_csv('data/results/vtt_unified_innovations.csv', index=False)
print(f"\nSaved {len(final_df)} unified innovations to CSV")


Sample Unified Innovations:

Innovation: FiR1-ydinreaktorin purkuprojekti **TEST_MODIFIED**
Aliases: ['FiR 1', 'FiR1 nuclear reactor decommissioning', 'First nuclear decommissioning project in Finland at FiR1 reactor', 'FiR1 research nuclear reactor decommissioning', 'FiR1 research nuclear reactor decommissioning project', 'decommissioning the FiR1 research nuclear reactor', 'FiR1 reactor decommissioning project', 'FiR 1 -tutkimusreaktori', 'FiR1-ydinreaktorin purkuprojekti', 'FiR1-ydinreaktori', 'FiR 1 -tutkimusreaktorin käytöstä poisto', 'FiR 1 -tutkimusreaktori']
VTT Role: developer+collaborator
Developers: ['VTT Technical Research Centre of Finland Ltd.', 'Fortum']
Collaborators: ['VTT Technical Research Centre of Finland Ltd.', 'Valio', 'University of Helsinki', 'Fortum']
Total mentions: 12

Innovation: AARRE-projekti **TEST_MODIFIED**
Aliases: ['AARRE-projekti', 'AARRE project']
VTT Role: developer+collaborator
Developers: ['VTT']
Collaborators: ['VTT', 'Finnish Environment Inst