# Collaboration Recommender System
## Recommend potential collaborators who have never worked together

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load and Explore Data

In [2]:
# Read the scientist roster (one row per researcher profile) and preview it
scientists_csv = '../collect_uam_data/data/scientists_with_identifiers.csv'
scientists_df = pd.read_csv(scientists_csv)

print(f"Total scientists: {len(scientists_df)}")
print(f"Columns: {scientists_df.columns.tolist()}")
scientists_df.head()

Total scientists: 164
Columns: ['profile_id', 'full_name', 'academic_title', 'first_name', 'last_name', 'position', 'profile_url', 'image_url', 'affiliations', 'orcid', 'google_scholar_id', 'google_scholar_url', 'scopus_id', 'scopus_url', 'europepmc', 'crossref', 'researchgate', 'other_links']


Unnamed: 0,profile_id,full_name,academic_title,first_name,last_name,position,profile_url,image_url,affiliations,orcid,google_scholar_id,google_scholar_url,scopus_id,scopus_url,europepmc,crossref,researchgate,other_links
0,UAM12847,prof. dr hab. Wojciech Buszkowski,prof. dr hab.,Wojciech,Buszkowski,profesor Senior,https://researchportal.amu.edu.pl/info/author/...,https://researchportal.amu.edu.pl/javax.faces....,Wydział Matematyki i Informatyki; Szkoła Nauk ...,0000-0002-2950-0864,C2zchKsAAAAJ,https://scholar.google.pl/citations?user=C2zch...,via_orcid,https://www.scopus.com/results/authorNamesList...,http://europepmc.org/search?query=%28AUTHORID%...,https://search.crossref.org/?q=0000-0002-2950-...,,
1,D448311-DD-Mat,dr Andrzej Kokosza,dr,Andrzej,Kokosza,,https://researchportal.amu.edu.pl/info/author/...,https://researchportal.amu.edu.pl/javax.faces....,Wydział Matematyki i Informatyki; Szkoła Nauk ...,0000-0002-8578-8997,,,via_orcid,https://www.scopus.com/results/authorNamesList...,http://europepmc.org/search?query=%28AUTHORID%...,https://search.crossref.org/?q=0000-0002-8578-...,,
2,D433412-DD-Mat,dr inż. Piotr Mizerka,dr inż.,Piotr,Mizerka,adiunkt,https://researchportal.amu.edu.pl/info/author/...,https://researchportal.amu.edu.pl/javax.faces....,Wydział Matematyki i Informatyki; Szkoła Nauk ...,0000-0001-5712-8513,,,via_orcid,https://www.scopus.com/results/authorNamesList...,http://europepmc.org/search?query=%28AUTHORID%...,https://search.crossref.org/?q=0000-0001-5712-...,,
3,UAM106647,dr Hubert Przybycień,dr,Hubert,Przybycień,adiunkt,https://researchportal.amu.edu.pl/info/author/...,https://researchportal.amu.edu.pl/javax.faces....,Wydział Matematyki i Informatyki; Szkoła Nauk ...,0000-0002-7125-6113,,,via_orcid,https://www.scopus.com/results/authorNamesList...,http://europepmc.org/search?query=%28AUTHORID%...,https://search.crossref.org/?q=0000-0002-7125-...,,
4,UAM206f981f3b9d4727bad9ad900a92dca1,mgr Robert Marcin Bendun,mgr,Robert Marcin,Bendun,asystent,https://researchportal.amu.edu.pl/info/author/...,https://researchportal.amu.edu.pl/javax.faces....,Wydział Matematyki i Informatyki; Szkoła Nauk ...,0009-0008-1117-2875,,,via_orcid,https://www.scopus.com/results/authorNamesList...,http://europepmc.org/search?query=%28AUTHORID%...,https://search.crossref.org/?q=0009-0008-1117-...,,


In [3]:
# Read the publication table (titles, abstracts, co-author fields) and preview it
publications_csv = '../abstracts/data/titles_with_abstracts.csv'
publications_df = pd.read_csv(publications_csv)

print(f"\nTotal publications: {len(publications_df)}")
print(f"Columns: {publications_df.columns.tolist()}")
publications_df.head()


Total publications: 3440
Columns: ['main_author_orcid', 'openalex_id', 'title', 'publication_year', 'publication_date', 'doi', 'type', 'cited_by_count', 'journal', 'topics', 'co_authors', 'co_author_orcids', 'num_co_authors', 'abstract', 'keywords']


Unnamed: 0,main_author_orcid,openalex_id,title,publication_year,publication_date,doi,type,cited_by_count,journal,topics,co_authors,co_author_orcids,num_co_authors,abstract,keywords
0,0000-0002-2950-0864,W1976366395,On Action Logic: Equational Theories of Action...,2006,2006-08-23,https://doi.org/10.1093/logcom/exl036,article,34,Journal of Logic and Computation,"Advanced Algebra and Logic; Logic, Reasoning, ...",,,0,Journal Article On Action Logic: Equational Th...,Action (physics); Computer science; Mathematic...
1,0000-0002-2950-0864,W2149439075,"Infinitary Action Logic: Complexity, Models an...",2008,2008-05-23,https://doi.org/10.1007/s11225-008-9116-7,article,18,Studia Logica,Advanced Algebra and Logic; semigroups and aut...,,,0,Action logic of Pratt [21] can be presented as...,Axiom; Mathematics; Monoid; Action (physics); ...
2,0000-0002-2950-0864,W2012920920,Categorial grammars determined from linguistic...,1990,1990-12-01,https://doi.org/10.1007/bf00370157,article,97,Studia Logica,Natural Language Processing Techniques; Syntax...,Gerald Penn,0000-0003-3553-8305,1,We provide an algorithm for determining a cate...,Unification; Rule-based machine translation; C...
3,0000-0002-2950-0864,W1770609059,Lambek Grammars Based on Pregroups,2001,2001-01-01,https://doi.org/10.1007/3-540-48199-0_6,book-chapter,83,Lecture notes in computer science,Natural Language Processing Techniques; semigr...,,,0,Lambek [14] introduces pregroups as a new fram...,Rule-based machine translation; Tree-adjoining...
4,0000-0002-2950-0864,W1996712532,Interpolation and FEP for logics of residuated...,2010,2010-01-17,https://doi.org/10.1093/jigpal/jzp094,article,36,Logic Journal of IGPL,"Logic, programming, and type systems; Advanced...",,,0,Journal Article Interpolation and FEP for logi...,Interpolation (computer graphics); Mathematics...


## 2. Explore Co-author Structure

In [4]:
# Print the co-author related fields of the first (up to) 10 publications
# to see how co-authors are encoded in the raw data.
print("Sample co-author data:")
for idx in range(min(10, len(publications_df))):
    row = publications_df.iloc[idx]
    print(f"\nRow {idx}:")
    print(f"  Main author ORCID: {row['main_author_orcid']}")
    print(f"  Co-authors: {row['co_authors']}")
    print(f"  Co-author ORCIDs: {row['co_author_orcids']}")
    print(f"  Num co-authors: {row['num_co_authors']}")
    # A missing title is read as NaN (a float), which cannot be sliced;
    # convert to str first so the preview never raises.
    print(f"  Title: {str(row['title'])[:80]}...")

Sample co-author data:

Row 0:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: nan
  Co-author ORCIDs: nan
  Num co-authors: 0
  Title: On Action Logic: Equational Theories of Action Algebras...

Row 1:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: nan
  Co-author ORCIDs: nan
  Num co-authors: 0
  Title: Infinitary Action Logic: Complexity, Models and Grammars...

Row 2:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: Gerald Penn
  Co-author ORCIDs: 0000-0003-3553-8305
  Num co-authors: 1
  Title: Categorial grammars determined from linguistic data by unification...

Row 3:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: nan
  Co-author ORCIDs: nan
  Num co-authors: 0
  Title: Lambek Grammars Based on Pregroups...

Row 4:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: nan
  Co-author ORCIDs: nan
  Num co-authors: 0
  Title: Interpolation and FEP for logics of residuated algebras...

Row 5:
  Main author ORCID: 0000-0002-2950-0864
  Co-authors: nan


In [5]:
# Inspect the Python type of the first value in each co-author column
print("\nData types:")
print(f"co_authors type: {type(publications_df['co_authors'].iloc[0])}")
print(f"co_author_orcids type: {type(publications_df['co_author_orcids'].iloc[0])}")

# Look at a handful of rows that do have co-authors to spot the separator in use
sample_with_coauthors = publications_df.query("num_co_authors > 0").head(5)
print("\nSamples with co-authors:")
for idx, row in sample_with_coauthors.iterrows():
    print(f"\n{row['co_authors']}")
    print(f"{row['co_author_orcids']}")


Data types:
co_authors type: <class 'float'>
co_author_orcids type: <class 'float'>

Samples with co-authors:

Gerald Penn
0000-0003-3553-8305

Mirosława Kołowska-Gawiejnowicz
0000-0003-1333-6438

Ewa Orłowska
0000-0002-7931-2351

Zhe Lin
0000-0002-3828-8754

Wojtek Pałubicki; Agata Burian
0000-0002-2374-346X; 0000-0002-8912-483X


## 3. Build Collaboration Graph

In [6]:
# Create a mapping of ORCID to full name.
# Rows without an ORCID are skipped: a missing value would otherwise end up as a
# NaN dictionary key, which later becomes a graph node, gets an embedding and can
# show up as a recommendation candidate even though it identifies nobody.
# If the same ORCID appears on several rows, the last row's name wins.
scientists_with_orcid = scientists_df.dropna(subset=['orcid'])
orcid_to_name = dict(zip(scientists_with_orcid['orcid'], scientists_with_orcid['full_name']))
print(f"Total scientists with ORCID: {len(orcid_to_name)}")

missing_orcid_count = len(scientists_df) - len(scientists_with_orcid)
if missing_orcid_count:
    print(f"Skipped {missing_orcid_count} scientists without an ORCID")

Total scientists with ORCID: 161


In [7]:
# Start an undirected collaboration graph with one node per known scientist;
# each node is keyed by ORCID and carries the scientist's full name as an attribute.
G = nx.Graph()
G.add_nodes_from(
    (orcid, {'name': name}) for orcid, name in orcid_to_name.items()
)

print(f"Graph initialized with {G.number_of_nodes()} nodes")

Graph initialized with 161 nodes


In [8]:
# Parse co-authorship and add edges
# This will determine if co_author_orcids is a list or single value

def parse_coauthor_orcids(orcid_str):
    """Split a raw co-author ORCID field into a list of ORCID strings.

    The field may be missing (NaN/None), empty, a single ORCID, or several
    ORCIDs separated by semicolons and/or commas. Both separators are honoured
    at the same time, so a mixed value such as ``"a; b, c"`` yields three
    entries rather than leaving ``"b, c"`` fused together.

    Args:
        orcid_str: Raw cell value from the ``co_author_orcids`` column.

    Returns:
        list[str]: ORCIDs in their original order, stripped of surrounding
        whitespace, with empty fragments removed. An empty list when the
        value is missing or blank.
    """
    if pd.isna(orcid_str):
        return []

    # ORCIDs never contain ';' or ',', so both can be treated as one separator.
    fragments = str(orcid_str).replace(',', ';').split(';')
    return [fragment.strip() for fragment in fragments if fragment.strip()]

# Add edges from co-authorship.
# Edge weight = number of distinct works the two scientists share.

# Start from a graph without edges so that re-running this cell does not
# keep adding to the weights left over from a previous run.
G.remove_edges_from(list(G.edges()))

edge_count = 0
# (author pair, work id) combinations already reflected in an edge weight.
# A work can be listed once under each of its authors' rows; without this
# guard such a work would be counted twice for the same pair.
counted_pair_works = set()

for idx, row in publications_df.iterrows():
    main_orcid = row['main_author_orcid']
    # A row whose main author is not one of our scientists can never yield an edge
    if main_orcid not in orcid_to_name:
        continue

    # Fall back to the row position when the work has no OpenAlex id,
    # so such rows are still counted (each on its own).
    work_id = row['openalex_id'] if pd.notna(row['openalex_id']) else ('row', idx)

    for coauthor_orcid in parse_coauthor_orcids(row['co_author_orcids']):
        # Only connect two *different* scientists from our list (no self-loops)
        if coauthor_orcid == main_orcid or coauthor_orcid not in orcid_to_name:
            continue

        pair_work = (frozenset((main_orcid, coauthor_orcid)), work_id)
        if pair_work in counted_pair_works:
            continue
        counted_pair_works.add(pair_work)

        if G.has_edge(main_orcid, coauthor_orcid):
            # Increment collaboration count
            G[main_orcid][coauthor_orcid]['weight'] += 1
        else:
            G.add_edge(main_orcid, coauthor_orcid, weight=1)
            edge_count += 1

print(f"\nCollaboration graph built:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Density: {nx.density(G):.4f}")


Collaboration graph built:
  Nodes: 161
  Edges: 110
  Density: 0.0085


## 4. Aggregate Publications per Scientist (Titles Only)

In [9]:
# Build one text blob per scientist: all of their (non-missing) publication
# titles, in table order, joined by single spaces. Scientists who never appear
# as a main author get an empty string.
titles_by_author = (
    publications_df
    .dropna(subset=['title'])
    .groupby('main_author_orcid', sort=False)['title']
    .agg(" ".join)
    .to_dict()
)
scientist_texts = {orcid: titles_by_author.get(orcid, "") for orcid in orcid_to_name}

print(f"Collected texts for {len(scientist_texts)} scientists")
print(f"Scientists with publications: {sum(1 for text in scientist_texts.values() if text.strip())}")

Collected texts for 161 scientists
Scientists with publications: 115


In [10]:
# Preview the aggregated text of the first scientist in the mapping
sample_orcid = list(scientist_texts)[0]

print(f"\nSample scientist: {orcid_to_name[sample_orcid]}")
print(f"Text length: {len(scientist_texts[sample_orcid])} chars")
print(f"Text preview: {scientist_texts[sample_orcid][:200]}...")


Sample scientist: prof. dr hab. Wojciech Buszkowski
Text length: 2482 chars
Text preview: On Action Logic: Equational Theories of Action Algebras Infinitary Action Logic: Complexity, Models and Grammars Categorial grammars determined from linguistic data by unification Lambek Grammars Base...


## 5. Generate Embeddings

In [11]:
# Load the sentence-transformers model named 'allenai-specter'; it is used
# below to turn each scientist's concatenated titles into an embedding vector.
model = SentenceTransformer('allenai-specter')
print("Model loaded: allenai-specter")

Model loaded: allenai-specter


In [12]:
# Fix the ORCID order once; row i of `embeddings` belongs to orcid_list[i].
orcid_list = list(scientist_texts)

# Scientists with no title text are encoded from a fixed placeholder string,
# so every one of them receives the same embedding.
text_list = [
    text if text.strip() else "No publications"
    for text in scientist_texts.values()
]

# Encode all texts in one call
embeddings = model.encode(text_list, show_progress_bar=True)
print(f"\nGenerated embeddings: {embeddings.shape}")

Batches: 100%|██████████| 6/6 [01:06<00:00, 11.07s/it]


Generated embeddings: (161, 768)





In [13]:
# Map each ORCID to its row position in `embeddings`
orcid_to_idx = dict(zip(orcid_list, range(len(orcid_list))))
print(f"Created index mapping for {len(orcid_to_idx)} scientists")

Created index mapping for 161 scientists


## 6. Recommendation System: Find Non-Collaborators by Similarity

In [17]:
def recommend_collaborators(target_orcid, top_n=10):
    """Recommend potential collaborators for a target scientist.

    Candidates are all scientists in ``orcid_list`` except the target and the
    target's current neighbours in the collaboration graph ``G``. They are
    ranked by cosine similarity between their embedding and the target's.

    Note: scientists without any title text are embedded from the placeholder
    string "No publications", so their similarity scores carry no topical
    meaning; the ``num_publications`` column helps to spot such rows.

    Args:
        target_orcid: ORCID of the scientist to recommend for.
        top_n: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame with columns ``orcid``, ``name``, ``similarity`` and
        ``num_publications``, sorted by descending similarity (the columns are
        present even when there are no candidates), or ``None`` if
        ``target_orcid`` has no embedding.
    """
    if target_orcid not in orcid_to_idx:
        return None

    # Similarity of the target's embedding to every scientist's embedding
    target_embedding = embeddings[orcid_to_idx[target_orcid]].reshape(1, -1)
    similarities = cosine_similarity(target_embedding, embeddings)[0]

    # Get existing collaborators (neighbors in graph)
    existing_collaborators = set(G.neighbors(target_orcid)) if G.has_node(target_orcid) else set()

    # Count main-author publications once, instead of filtering the whole
    # publications table again for every candidate.
    publication_counts = publications_df['main_author_orcid'].value_counts().to_dict()

    recommendations = [
        {
            'orcid': orcid,
            'name': orcid_to_name[orcid],
            'similarity': similarities[idx],
            'num_publications': publication_counts.get(orcid, 0),
        }
        for idx, orcid in enumerate(orcid_list)
        # Skip self and existing collaborators
        if orcid != target_orcid and orcid not in existing_collaborators
    ]

    # Stable sort: candidates with equal similarity keep their orcid_list order
    recommendations.sort(key=lambda rec: rec['similarity'], reverse=True)

    return pd.DataFrame(
        recommendations[:top_n],
        columns=['orcid', 'name', 'similarity', 'num_publications'],
    )

In [18]:
# Test the recommendation system with Patryk Żywica
test_orcid = "0000-0003-3542-8982"  # Patryk Żywica
test_name = orcid_to_name.get(test_orcid, "Unknown")

print(f"Recommendations for: {test_name}")
print(f"ORCID: {test_orcid}")

# Show existing collaborators. A scientist who is in the graph but has no
# neighbours is reported the same way as one who is not in the graph at all.
existing_collab = list(G.neighbors(test_orcid)) if G.has_node(test_orcid) else []
if existing_collab:
    print(f"\nExisting collaborators ({len(existing_collab)}):")
    for collab_orcid in existing_collab[:5]:
        print(f"  - {orcid_to_name[collab_orcid]}")
    if len(existing_collab) > 5:
        print(f"  ... and {len(existing_collab) - 5} more")
else:
    print("\nNo existing collaborators found")

# Show recommendations. recommend_collaborators returns None for an ORCID
# without an embedding, so say so instead of printing a heading over nothing.
recommendations = recommend_collaborators(test_orcid, top_n=10)
if recommendations is None:
    print(f"\nNo embedding available for ORCID {test_orcid}; cannot recommend collaborators")
else:
    # Use the actual row count: fewer than 10 candidates may exist
    print(f"\nTop {len(recommendations)} Recommended Potential Collaborators:")
recommendations

Recommendations for: prof. UAM dr hab. Patryk Żywica
ORCID: 0000-0003-3542-8982

Existing collaborators (4):
  - prof. UAM dr hab. Krzysztof Dyczkowski
  - dr Andrzej Wójtowicz
  - dr inż. Anna Pankowska
  - dr Joanna Siwek

Top 10 Recommended Potential Collaborators:


Unnamed: 0,orcid,name,similarity,num_publications
0,0000-0002-0608-0801,dr inż. Dawid Ewald,0.835639,20
1,0000-0002-3445-2422,prof. dr hab. Tomasz Kubiak,0.744609,35
2,0000-0002-9969-5257,prof. UAM dr hab. Tomasz Górecki,0.743077,114
3,0000-0002-2442-8816,prof. UAM dr hab. Łukasz Smaga,0.730811,66
4,0000-0002-6132-651X,Bartłomiej Grzelak,0.729985,9
5,0000-0003-0015-9348,prof. UAM dr hab. Jerzy Grzybowski,0.720387,45
6,0000-0002-0777-9163,prof. UAM dr hab. Waldemar Wołyński,0.717949,43
7,0000-0002-1648-6987,prof. dr hab. Stanisław Gawiejnowicz,0.713853,54
8,0000-0002-1186-9612,prof. UAM dr hab. Jacek Marciniak,0.70845,29
9,0000-0002-6185-6115,dr inż. Marcin Michał Szczepański,0.706726,5


## 7. Graph Summary Statistics

In [16]:
# Print summary statistics of the collaboration graph (nothing is written to disk).
print("Graph Statistics:")
print(f"  Total scientists: {G.number_of_nodes()}")
print(f"  Total collaborations: {G.number_of_edges()}")
# Each edge touches two scientists, hence the factor 2 (average node degree).
print(f"  Average collaborators per scientist: {2 * G.number_of_edges() / G.number_of_nodes():.2f}")
print(f"  Connected components: {nx.number_connected_components(G)}")

Graph Statistics:
  Total scientists: 161
  Total collaborations: 110
  Average collaborators per scientist: 1.37
  Connected components: 90
