# 1. Counting synergies

In [1]:
import pandas as pd
from itertools import combinations

# Column order of the output table; also used so an empty result still has headers.
RESULT_COLUMNS = [
    "Researcher_A",
    "Researcher_B",
    "Shared_synergies_count",
    "Shared_synergies",
]


def split_terms(value):
    """Split a ';'-separated cell into a list of stripped, non-empty terms.

    Non-string cells (e.g. NaN for a missing value) yield an empty list.
    Empty items produced by trailing or doubled semicolons are dropped so
    that '' can never be counted as a term two researchers have in common.
    """
    if not isinstance(value, str):
        return []
    stripped = (part.strip() for part in value.split(';'))
    return [term for term in stripped if term]


def find_collaborations(data):
    """Return one record per ordered researcher pair that shares any term.

    Args:
        data: DataFrame with a 'Full Name' column and 'Themes_Clean' /
            'Expertise_Clean' columns that already hold lists of terms.

    Returns:
        A list of dicts keyed by RESULT_COLUMNS. Every overlapping pair is
        emitted twice (A->B and B->A). Pairs are formed from rows, not from
        names, so rows that happen to share a name are each compared using
        their own themes and expertise.
    """
    # Build the sets once per row instead of re-filtering the frame by name
    # for every pair.
    records = list(zip(
        data['Full Name'],
        (set(themes) for themes in data['Themes_Clean']),
        (set(expertise) for expertise in data['Expertise_Clean']),
    ))

    results = []
    for record_a, record_b in combinations(records, 2):
        name_a, themes_a, expertise_a = record_a
        name_b, themes_b, expertise_b = record_b

        # Find shared synergies (themes and expertise overlap)
        shared_themes = themes_a & themes_b
        shared_expertise = expertise_a & expertise_b
        # NOTE: a term shared both as a theme and as expertise is counted
        # twice here but listed once in the combined string below.
        overlap_count = len(shared_themes) + len(shared_expertise)
        if overlap_count == 0:
            continue

        # Combine themes and expertise for shared synergies
        shared_synergies = "; ".join(sorted(shared_themes | shared_expertise))

        # Add both directions for the pair
        for first, second in ((name_a, name_b), (name_b, name_a)):
            results.append({
                "Researcher_A": first,
                "Researcher_B": second,
                "Shared_synergies_count": overlap_count,
                "Shared_synergies": shared_synergies,
            })
    return results


# The guard keeps file I/O from running if this code is imported; in a
# notebook or when run as a script __name__ is "__main__", so the cell
# behaves as before and defines the same global variables.
if __name__ == "__main__":
    # Load data
    input_file = "Data_clean/08.researchers_with_themes_expertise_cleaned.csv"
    data = pd.read_csv(input_file)

    # Ensure Themes_Clean and Expertise_Clean are lists
    data['Themes_Clean'] = data['Themes_Clean'].apply(split_terms)
    data['Expertise_Clean'] = data['Expertise_Clean'].apply(split_terms)

    # Generate researcher pairs (symmetric handling)
    results = find_collaborations(data)

    # Create output DataFrame; explicit columns keep sorting and the CSV
    # header valid even when no pair overlaps.
    results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

    # Sort by researcher names and synergies count (descending for synergies count)
    results_df.sort_values(
        by=["Researcher_A", "Shared_synergies_count"],
        ascending=[True, False],
        inplace=True,
    )

    # Save to output file
    output_file = "Data_clean/09.potential_collaborations.csv"
    results_df.to_csv(output_file, index=False)

    print(f"Results saved to {output_file}")


ModuleNotFoundError: No module named 'pandas'

In [None]:
import pandas as pd
