In [18]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [3]:
# Read the publication table and the scientist roster from their CSV exports.
titles_df = pd.read_csv("../abstracts/data/titles_with_abstracts.csv")
scientists_df = pd.read_csv("../collect_uam_data/data/scientists_with_identifiers.csv")

# Keep only the scientists whose ORCID occurs as a main author in the publication table.
has_publications = scientists_df["orcid"].isin(titles_df["main_author_orcid"])
scientists_df = scientists_df[has_publications]

print(f"Scientists available: {len(scientists_df)}")

Scientists available: 115


In [4]:
def build_author_corpora(titles_df, scientists_df, use_abstracts: bool):
    """Build one concatenated text document per scientist.

    For every row of ``scientists_df`` the works in ``titles_df`` whose
    ``main_author_orcid`` equals the scientist's ``orcid`` are collected and
    their titles (and, optionally, abstracts) are joined with single spaces.

    Parameters
    ----------
    titles_df : pandas.DataFrame
        Must contain ``main_author_orcid`` and ``title`` columns. An
        ``abstract`` column is used when ``use_abstracts`` is true; if that
        column is absent the result is the same as for titles only.
    scientists_df : pandas.DataFrame
        Must contain ``orcid`` and ``full_name`` columns.
    use_abstracts : bool
        When true, each work contributes its title followed by its abstract.

    Returns
    -------
    (texts, labels) : tuple[list[str], list]
        ``texts[i]`` is the combined document of the scientist named
        ``labels[i]``, in ``scientists_df`` row order. Scientists whose
        combined text is empty are omitted, so the two modes can return
        lists of different length for the same inputs.
    """
    text_columns = ["title"]
    if use_abstracts and "abstract" in titles_df.columns:
        text_columns.append("abstract")

    # Group the works once instead of re-scanning titles_df for every scientist.
    # Row order inside each group follows titles_df, so chunk order is unchanged.
    works_by_orcid = {
        orcid: works[text_columns]
        for orcid, works in titles_df.groupby("main_author_orcid", sort=False)
    }

    texts = []
    labels = []

    for orcid, name in zip(scientists_df["orcid"], scientists_df["full_name"]):
        works = works_by_orcid.get(orcid)
        if works is None:
            continue

        # Missing values are skipped rather than joined as empty strings,
        # which would leave doubled spaces in the combined document.
        chunks = [
            str(value)
            for row in works.itertuples(index=False, name=None)
            for value in row
            if pd.notna(value)
        ]

        combined_text = " ".join(chunks).strip()

        if combined_text:
            texts.append(combined_text)
            labels.append(name)

    return texts, labels


# Titles-only corpus; `labels` names the scientist behind each text.
texts_titles, labels = build_author_corpora(
    titles_df, scientists_df, use_abstracts=False
)

# Titles + abstracts corpus built from the same scientists.
texts_titles_abstracts, labels_titles_abstracts = build_author_corpora(
    titles_df, scientists_df, use_abstracts=True
)

# Later cells pair the two corpora row by row and reuse `labels` for both.
# build_author_corpora drops scientists with empty text, so a scientist who has
# abstracts but no usable title would appear only in the second corpus and
# shift every following row. Fail loudly instead of plotting misaligned data.
if len(texts_titles_abstracts) != len(texts_titles):
    raise ValueError(
        "Titles-only and titles+abstracts corpora cover different scientists "
        f"({len(texts_titles)} vs {len(texts_titles_abstracts)}); "
        "they cannot be compared row by row."
    )

print(f"Scientists used: {len(labels)}")


Scientists used: 115


In [5]:
# Sentence-embedding model used for both corpus variants.
model = SentenceTransformer("allenai-specter")

# Encode the titles-only corpus first, then the titles+abstracts corpus.
# NOTE(review): each text is the concatenation of all of a scientist's works;
# if the model truncates inputs to a maximum sequence length, long corpora are
# only partially embedded - confirm against model.max_seq_length.
emb_titles, emb_titles_abs = (
    model.encode(corpus, show_progress_bar=True)
    for corpus in (texts_titles, texts_titles_abstracts)
)


Batches: 100%|██████████| 4/4 [01:00<00:00, 15.00s/it]
Batches: 100%|██████████| 4/4 [01:20<00:00, 20.08s/it]


In [45]:
# Fit the clustering on the titles-only embeddings; the fixed random_state
# keeps the cluster numbering stable between runs.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
clusters_titles = kmeans.fit_predict(emb_titles)

# For titles+abstracts, assign each point to the nearest centroid of the
# titles-only model. KMeans.predict performs exactly this nearest-centre
# lookup, so no manual distance matrix is needed.
clusters_titles_abs = kmeans.predict(emb_titles_abs)

In [46]:
# Fit one 2-D projection on both embedding sets stacked together, so the
# titles-only and titles+abstracts points share the same axes.
# random_state is fixed because scikit-learn may select a randomized SVD
# solver depending on the input shape; without a seed the coordinates are not
# guaranteed to be identical between runs.
pca = PCA(n_components=2, random_state=42)
pca.fit(np.vstack([emb_titles, emb_titles_abs]))

coords_titles = pca.transform(emb_titles)
coords_titles_abs = pca.transform(emb_titles_abs)


In [47]:
def _projection_frame(coords, clusters):
    """Return a plotting table with PC1/PC2, scientist name and cluster id per row."""
    return pd.DataFrame({
        "PC1": coords[:, 0],
        "PC2": coords[:, 1],
        "Scientist": labels,
        "Cluster": clusters,
    })


# One table per corpus variant; both reuse the same `labels` ordering.
df_titles = _projection_frame(coords_titles, clusters_titles)
df_titles_abs = _projection_frame(coords_titles_abs, clusters_titles_abs)


In [48]:
# Titles-only PCA scatter: one trace per cluster so every cluster gets its own
# colour and legend entry.
fig_titles = go.Figure()

for cluster_id in sorted(df_titles["Cluster"].unique()):
    cluster_data = df_titles[df_titles["Cluster"] == cluster_id]
    fig_titles.add_trace(go.Scatter(
        x=cluster_data["PC1"],
        y=cluster_data["PC2"],
        mode="markers",
        marker=dict(size=8),
        name=f"Cluster {cluster_id}",
        text=cluster_data["Scientist"],
        hovertemplate="<b>%{text}</b><br>PC1: %{x}<br>PC2: %{y}<extra></extra>",
        customdata=cluster_data["Scientist"]
    ))

fig_titles.update_layout(
    title="PCA — Titles only",
    template="plotly_dark"
)

# Render the figure to a standalone HTML page (plotly.js loaded from the CDN).
html_content = fig_titles.to_html(include_plotlyjs='cdn')

# Serialise the scientist names as a real JavaScript array literal.
# json.dumps produces valid JS string escapes (Python's list repr does not in
# general), set() removes duplicate names from the suggestion list, and
# escaping "</" stops a name from closing the surrounding <script> element.
scientists_json = json.dumps(sorted(set(labels))).replace("</", "<\\/")

# Search box overlay: typing filters the names, clicking a suggestion enlarges
# and outlines the matching marker, "Reset" restores the default marker style.
search_bar_html = """
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%); z-index: 1000; background: rgba(0,0,0,0.8); padding: 10px; border-radius: 5px;">
    <input type="text" id="scientistSearch" placeholder="Search scientist..." 
           style="padding: 5px; width: 250px; font-size: 14px; border: 1px solid #666; background: #222; color: white; border-radius: 3px;">
    <button onclick="resetHighlight()" style="padding: 5px 10px; margin-left: 5px; background: #444; color: white; border: 1px solid #666; border-radius: 3px; cursor: pointer;">Reset</button>
    <div id="suggestions" style="margin-top: 5px; max-height: 200px; overflow-y: auto; background: #1a1a1a; border: 1px solid #666; border-radius: 3px; display: none;"></div>
</div>

<script>
var scientists = """ + scientists_json + """;

var searchInput = document.getElementById('scientistSearch');
var suggestionsDiv = document.getElementById('suggestions');

searchInput.addEventListener('input', function() {
    var query = this.value.toLowerCase();
    suggestionsDiv.innerHTML = '';
    
    if (query.length === 0) {
        suggestionsDiv.style.display = 'none';
        return;
    }
    
    var matches = scientists.filter(function(name) {
        return name.toLowerCase().includes(query);
    });
    
    if (matches.length > 0) {
        suggestionsDiv.style.display = 'block';
        matches.forEach(function(name) {
            var div = document.createElement('div');
            div.textContent = name;
            div.style.padding = '5px 10px';
            div.style.cursor = 'pointer';
            div.style.color = 'white';
            div.onmouseover = function() { this.style.background = '#444'; };
            div.onmouseout = function() { this.style.background = 'transparent'; };
            div.onclick = function() {
                highlightScientist(name);
                searchInput.value = name;
                suggestionsDiv.style.display = 'none';
            };
            suggestionsDiv.appendChild(div);
        });
    } else {
        suggestionsDiv.style.display = 'none';
    }
});

function highlightScientist(scientistName) {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    var update = {
        'marker.size': [],
        'marker.line.width': [],
        'marker.line.color': []
    };
    
    for (var i = 0; i < gd.data.length; i++) {
        var trace = gd.data[i];
        var sizes = [];
        var lineWidths = [];
        var lineColors = [];
        
        for (var j = 0; j < trace.text.length; j++) {
            if (trace.text[j] === scientistName) {
                sizes.push(20);
                lineWidths.push(3);
                lineColors.push('yellow');
            } else {
                sizes.push(6);
                lineWidths.push(0);
                lineColors.push('');
            }
        }
        
        update['marker.size'].push(sizes);
        update['marker.line.width'].push(lineWidths);
        update['marker.line.color'].push(lineColors);
    }
    
    Plotly.restyle(gd, update);
}

function resetHighlight() {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    Plotly.restyle(gd, {
        'marker.size': 8,
        'marker.line.width': 0,
        'marker.line.color': ''
    });
    searchInput.value = '';
    suggestionsDiv.style.display = 'none';
}

// Close suggestions when clicking outside
document.addEventListener('click', function(event) {
    if (!searchInput.contains(event.target) && !suggestionsDiv.contains(event.target)) {
        suggestionsDiv.style.display = 'none';
    }
});
</script>
"""

# Insert search bar before the closing body tag
html_content = html_content.replace('</body>', search_bar_html + '</body>')

with open('pca_titles_only.html', 'w', encoding='utf-8') as f:
    f.write(html_content)

In [49]:
# Titles + abstracts PCA scatter: one trace per cluster so every cluster gets
# its own colour and legend entry.
fig_titles_abs = go.Figure()

for cluster_id in sorted(df_titles_abs["Cluster"].unique()):
    cluster_data = df_titles_abs[df_titles_abs["Cluster"] == cluster_id]
    fig_titles_abs.add_trace(go.Scatter(
        x=cluster_data["PC1"],
        y=cluster_data["PC2"],
        mode="markers",
        marker=dict(size=8),
        name=f"Cluster {cluster_id}",
        text=cluster_data["Scientist"],
        hovertemplate="<b>%{text}</b><br>PC1: %{x}<br>PC2: %{y}<extra></extra>",
        customdata=cluster_data["Scientist"]
    ))

fig_titles_abs.update_layout(
    title="PCA — Titles + Abstracts",
    template="plotly_dark"
)

# Render the figure to a standalone HTML page (plotly.js loaded from the CDN).
html_content = fig_titles_abs.to_html(include_plotlyjs='cdn')

# Serialise the scientist names as a real JavaScript array literal.
# json.dumps produces valid JS string escapes (Python's list repr does not in
# general), set() removes duplicate names from the suggestion list, and
# escaping "</" stops a name from closing the surrounding <script> element.
scientists_json = json.dumps(sorted(set(labels))).replace("</", "<\\/")

# Search box overlay: typing filters the names, clicking a suggestion enlarges
# and outlines the matching marker, "Reset" restores the default marker style.
search_bar_html = """
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%); z-index: 1000; background: rgba(0,0,0,0.8); padding: 10px; border-radius: 5px;">
    <input type="text" id="scientistSearch" placeholder="Search scientist..." 
           style="padding: 5px; width: 250px; font-size: 14px; border: 1px solid #666; background: #222; color: white; border-radius: 3px;">
    <button onclick="resetHighlight()" style="padding: 5px 10px; margin-left: 5px; background: #444; color: white; border: 1px solid #666; border-radius: 3px; cursor: pointer;">Reset</button>
    <div id="suggestions" style="margin-top: 5px; max-height: 200px; overflow-y: auto; background: #1a1a1a; border: 1px solid #666; border-radius: 3px; display: none;"></div>
</div>

<script>
var scientists = """ + scientists_json + """;

var searchInput = document.getElementById('scientistSearch');
var suggestionsDiv = document.getElementById('suggestions');

searchInput.addEventListener('input', function() {
    var query = this.value.toLowerCase();
    suggestionsDiv.innerHTML = '';
    
    if (query.length === 0) {
        suggestionsDiv.style.display = 'none';
        return;
    }
    
    var matches = scientists.filter(function(name) {
        return name.toLowerCase().includes(query);
    });
    
    if (matches.length > 0) {
        suggestionsDiv.style.display = 'block';
        matches.forEach(function(name) {
            var div = document.createElement('div');
            div.textContent = name;
            div.style.padding = '5px 10px';
            div.style.cursor = 'pointer';
            div.style.color = 'white';
            div.onmouseover = function() { this.style.background = '#444'; };
            div.onmouseout = function() { this.style.background = 'transparent'; };
            div.onclick = function() {
                highlightScientist(name);
                searchInput.value = name;
                suggestionsDiv.style.display = 'none';
            };
            suggestionsDiv.appendChild(div);
        });
    } else {
        suggestionsDiv.style.display = 'none';
    }
});

function highlightScientist(scientistName) {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    var update = {
        'marker.size': [],
        'marker.line.width': [],
        'marker.line.color': []
    };
    
    for (var i = 0; i < gd.data.length; i++) {
        var trace = gd.data[i];
        var sizes = [];
        var lineWidths = [];
        var lineColors = [];
        
        for (var j = 0; j < trace.text.length; j++) {
            if (trace.text[j] === scientistName) {
                sizes.push(20);
                lineWidths.push(3);
                lineColors.push('yellow');
            } else {
                sizes.push(6);
                lineWidths.push(0);
                lineColors.push('');
            }
        }
        
        update['marker.size'].push(sizes);
        update['marker.line.width'].push(lineWidths);
        update['marker.line.color'].push(lineColors);
    }
    
    Plotly.restyle(gd, update);
}

function resetHighlight() {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    Plotly.restyle(gd, {
        'marker.size': 8,
        'marker.line.width': 0,
        'marker.line.color': ''
    });
    searchInput.value = '';
    suggestionsDiv.style.display = 'none';
}

// Close suggestions when clicking outside
document.addEventListener('click', function(event) {
    if (!searchInput.contains(event.target) && !suggestionsDiv.contains(event.target)) {
        suggestionsDiv.style.display = 'none';
    }
});
</script>
"""

# Insert search bar before the closing body tag
html_content = html_content.replace('</body>', search_bar_html + '</body>')

with open('pca_titles_with_abstracts.html', 'w', encoding='utf-8') as f:
    f.write(html_content)

In [50]:
# Per-scientist displacement in PCA space: (x0, y0) is the titles-only position,
# (x1, y1) the titles+abstracts position, and (dx, dy) the difference.
disp_df = pd.DataFrame({
    "Scientist": labels,
    "dx": coords_titles_abs[:, 0] - coords_titles[:, 0],
    "dy": coords_titles_abs[:, 1] - coords_titles[:, 1],
    "x0": coords_titles[:, 0],
    "y0": coords_titles[:, 1],
    "x1": coords_titles_abs[:, 0],
    "y1": coords_titles_abs[:, 1],
})

fig_disp = go.Figure()

# One two-point line trace per scientist, from the titles-only position to the
# titles+abstracts position. The search script below identifies a trace by its
# hovertext, so each trace carries the scientist name there.
for _, row in disp_df.iterrows():
    fig_disp.add_trace(go.Scatter(
        x=[row.x0, row.x1],
        y=[row.y0, row.y1],
        mode="lines+markers",
        marker=dict(size=6),
        line=dict(width=1),
        hovertext=row.Scientist,
        showlegend=False,
        name=row.Scientist
    ))

fig_disp.update_layout(
    title="Semantic displacement: Titles → Titles + Abstracts",
    template="plotly_dark"
)

# Render the figure to a standalone HTML page (plotly.js loaded from the CDN).
html_content = fig_disp.to_html(include_plotlyjs='cdn')

# Serialise the scientist names as a real JavaScript array literal.
# json.dumps produces valid JS string escapes (Python's list repr does not in
# general), set() removes duplicate names from the suggestion list, and
# escaping "</" stops a name from closing the surrounding <script> element.
scientists_json = json.dumps(sorted(set(labels))).replace("</", "<\\/")

# Search box overlay: typing filters the names, clicking a suggestion thickens
# and recolours the matching displacement line, "Reset" restores the defaults.
search_bar_html = """
<div style="position: fixed; top: 10px; left: 50%; transform: translateX(-50%); z-index: 1000; background: rgba(0,0,0,0.8); padding: 10px; border-radius: 5px;">
    <input type="text" id="scientistSearch" placeholder="Search scientist..." 
           style="padding: 5px; width: 250px; font-size: 14px; border: 1px solid #666; background: #222; color: white; border-radius: 3px;">
    <button onclick="resetHighlight()" style="padding: 5px 10px; margin-left: 5px; background: #444; color: white; border: 1px solid #666; border-radius: 3px; cursor: pointer;">Reset</button>
    <div id="suggestions" style="margin-top: 5px; max-height: 200px; overflow-y: auto; background: #1a1a1a; border: 1px solid #666; border-radius: 3px; display: none;"></div>
</div>

<script>
var scientists = """ + scientists_json + """;

var searchInput = document.getElementById('scientistSearch');
var suggestionsDiv = document.getElementById('suggestions');

searchInput.addEventListener('input', function() {
    var query = this.value.toLowerCase();
    suggestionsDiv.innerHTML = '';
    
    if (query.length === 0) {
        suggestionsDiv.style.display = 'none';
        return;
    }
    
    var matches = scientists.filter(function(name) {
        return name.toLowerCase().includes(query);
    });
    
    if (matches.length > 0) {
        suggestionsDiv.style.display = 'block';
        matches.forEach(function(name) {
            var div = document.createElement('div');
            div.textContent = name;
            div.style.padding = '5px 10px';
            div.style.cursor = 'pointer';
            div.style.color = 'white';
            div.onmouseover = function() { this.style.background = '#444'; };
            div.onmouseout = function() { this.style.background = 'transparent'; };
            div.onclick = function() {
                highlightScientist(name);
                searchInput.value = name;
                suggestionsDiv.style.display = 'none';
            };
            suggestionsDiv.appendChild(div);
        });
    } else {
        suggestionsDiv.style.display = 'none';
    }
});

function highlightScientist(scientistName) {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    var updates = {
        'marker.size': [],
        'line.width': [],
        'line.color': []
    };
    
    for (var i = 0; i < gd.data.length; i++) {
        var trace = gd.data[i];
        if (trace.hovertext === scientistName) {
            updates['marker.size'].push(12);
            updates['line.width'].push(3);
            updates['line.color'].push('yellow');
        } else {
            updates['marker.size'].push(6);
            updates['line.width'].push(1);
            updates['line.color'].push('');
        }
    }
    
    Plotly.restyle(gd, updates);
}

function resetHighlight() {
    var gd = document.getElementsByClassName('plotly-graph-div')[0];
    Plotly.restyle(gd, {
        'marker.size': 6,
        'line.width': 1,
        'line.color': ''
    });
    searchInput.value = '';
    suggestionsDiv.style.display = 'none';
}

// Close suggestions when clicking outside
document.addEventListener('click', function(event) {
    if (!searchInput.contains(event.target) && !suggestionsDiv.contains(event.target)) {
        suggestionsDiv.style.display = 'none';
    }
});
</script>
"""

# Insert search bar before the closing body tag
html_content = html_content.replace('</body>', search_bar_html + '</body>')

with open('pca_semantic_displacement.html', 'w', encoding='utf-8') as f:
    f.write(html_content)