In [1]:
from dotenv import load_dotenv
load_dotenv() ## load all the environment variables from .env
import glob
# import streamlit as st
import os
from PIL import Image
import google.generativeai as genai
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import textwrap
from typing import List, Dict, Tuple, Optional # For type hinting
import time
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

## Load Gemini model
model=genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21')

In [3]:
def get_gemini_response(input,image,user_prompt):
    response=model.generate_content([input,image[0],user_prompt])
    return response.text

## DATA Loading

In [4]:
import os
import json
import pandas as pd

data_directory = 'data/ranking_data/'
records = []

def safe_get(data, keys, default=None):
    """
    Safely get a nested key from a dictionary using a list of keys.
    Returns default if any key is missing.
    """
    for key in keys:
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            return default
    return data

for sub_dir in os.listdir(data_directory):
    print(f"Reading files in {sub_dir}...")
    sub_directory = os.path.join(data_directory, sub_dir)
    for filename in os.listdir(sub_directory):
        if filename.endswith('.json'):
            filepath = os.path.join(sub_directory, filename)
            try:
                with open(filepath, 'r') as file:
                    data = json.load(file)
            except Exception as e:
                print(f"Error reading {filepath}: {e}")
                continue

            # Extract award-level context information safely
            award_type = data.get("awd_istr_txt")
            award_title = data.get("awd_titl_txt")
            abstract = data.get("abst_narr_txt")
            org_name = data.get("org_long_name")
            org_name2 = data.get("org_long_name2")
            perf_inst_name = safe_get(data, ["perf_inst", "perf_inst_name"])
            
            # Extract program element and reference safely (checking if list exists)
            pgm_ele_list = data.get("pgm_ele")
            if isinstance(pgm_ele_list, list) and len(pgm_ele_list) > 0:
                program_element = pgm_ele_list[0].get("pgm_ele_long_name")
            else:
                program_element = None

            pgm_ref_list = data.get("pgm_ref")
            if isinstance(pgm_ref_list, list) and len(pgm_ref_list) > 0:
                program_reference = pgm_ref_list[0].get("pgm_ref_long_name")
            else:
                program_reference = None

            # Get investigator information, ensuring it's a list
            pi_list = data.get("pi")
            if not isinstance(pi_list, list):
                continue

            # Loop through each investigator in the file
            for pi in pi_list:
                record = {
                    "award_type": award_type,
                    "award_title": award_title,
                    "abstract": abstract,
                    "org_name": org_name,
                    "org_name2": org_name2,
                    "perf_inst_name": perf_inst_name,
                    "program_element": program_element,
                    "program_reference": program_reference,
                    "pi_id": pi.get("pi_id"),
                    "pi_full_name": pi.get("pi_full_name", "").strip() if pi.get("pi_full_name") else None,
                    "role": pi.get("proj_role_code2", "").strip() if pi.get("proj_role_code2") else None,
                    "department": pi.get("pi_dept_name"),
                    "email": pi.get("pi_email_addr"),
                    "start_date": pi.get("start_date")
                }
                records.append(record)

# Create a DataFrame from the records
df = pd.DataFrame(records)


Reading files in 2022...
Reading files in 2024...
Reading files in 2023...
Reading files in 2021...
Reading files in 2020...


In [5]:
df = df[df['role'].isin(['Co-Principal Investigator', 'Principal Investigator'])]
df.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,Co-Principal Investigator,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,Principal Investigator,,pratul.agarwal@okstate.edu,2022-08-03
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,Co-Principal Investigator,Department of Chemistry,,2024-08-29
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,Co-Principal Investigator,Computer Science,bhsu@ksu.edu,2022-08-03
7,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,Co-Principal Investigator,Engineering,rofleming@AState.edu,2024-08-29


In [6]:
df.shape

(83112, 14)

In [7]:
# df.to_csv('combined_data.csv')

In [56]:
df.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date,combined_text,leadership,experience_years,text_embedding
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,Co-Principal Investigator,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,Principal Investigator,,pratul.agarwal@okstate.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,Co-Principal Investigator,Department of Chemistry,,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,Co-Principal Investigator,Computer Science,bhsu@ksu.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
7,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,Co-Principal Investigator,Engineering,rofleming@AState.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."


## Scholer Identifier LLM

In [8]:
def filter_data_by_pi(df: pd.DataFrame, pi_ids: List[str]) -> pd.DataFrame:
    """
    Filters the DataFrame to include only rows matching the provided PI IDs.

    Args:
        df: The input DataFrame.
        pi_ids: A list of PI IDs (strings) to filter by.

    Returns:
        A DataFrame containing only the rows for the specified PI IDs.
    """
    print(f"Filtering DataFrame for PI IDs: {pi_ids}...")
    filtered = df[df['pi_id'].isin(pi_ids)].copy()
    # print(f"Found {len(filtered)} relevant entries.")
    return filtered

In [9]:
def format_pi_data_for_prompt(filtered_df: pd.DataFrame, pi_ids_to_format: List[str]) -> Tuple[str, Dict[str, str]]:
    """
    Formats the filtered PI data into a string suitable for the prompt context.

    Args:
        filtered_df: The DataFrame already filtered for relevant PIs.
        pi_ids_to_format: The original list of PI IDs requested, to ensure all are mentioned.


    Returns:
        A tuple containing:
            - formatted_data_string: A string with formatted details for each PI.
            - pi_names_dict: A dictionary mapping PI ID to PI full name.
    """
    # print("Formatting data for prompt...")
    formatted_data = ""
    pi_names = {} # Dictionary to store PI names

    if filtered_df.empty:
        print("Warning: Filtered DataFrame is empty. Formatting 'no data' message.")
        formatted_data = "No data could be retrieved for the specified potential collaborators.\n"
        for pi_id in pi_ids_to_format:
             pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name
        return formatted_data, pi_names

    # Iterate through the original list to ensure all requested PIs are accounted for
    for pi_id in pi_ids_to_format:
        pi_specific_data = filtered_df[filtered_df['pi_id'] == pi_id]

        if not pi_specific_data.empty:
            # Get consistent name and department from the first entry
            full_name = pi_specific_data['pi_full_name'].iloc[0]
            department = pi_specific_data['department'].iloc[0]
            pi_names[pi_id] = full_name

            formatted_data += f"--- Researcher: {full_name} (ID: {pi_id}) ---\n"
            formatted_data += f"Department: {department}\n"
            formatted_data += "Relevant Roles & Awards Found:\n"

            for index, row in pi_specific_data.iterrows():
                formatted_data += f"- Role: {row.get('role', 'N/A')}\n"
                formatted_data += f"  Award Title: {row.get('award_title', 'N/A')}\n"
                formatted_data += f"  Start Date: {row.get('start_date', 'N/A')}\n"
                abstract_preview = textwrap.shorten(row.get('abstract', 'N/A'), width=200, placeholder="...")
                formatted_data += f"  Abstract Snippet: {abstract_preview}\n"
                formatted_data += f"  Program Element/Reference: {row.get('program_element', 'N/A')} / {row.get('program_reference', 'N/A')}\n\n"
        else:
            # Handle case where a specific PI ID from the list had no data in the filtered df
            formatted_data += f"--- Researcher ID: {pi_id} ---\n"
            formatted_data += "No award data found in the provided dataset for this PI.\n\n"
            pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name

    # print("Data formatting complete.")
    return formatted_data, pi_names

In [10]:
def generate_recommendation_prompt(formatted_data_string: str, pi_names_dict: Dict[str, str], research_topic: str) -> str:
    """
    Generates the full prompt string for the Gemini model.

    Args:
        formatted_data_string: The formatted string containing PI details.
        pi_names_dict: A dictionary mapping PI ID to PI name.
        research_topic: The research topic for collaboration.

    Returns:
        The complete prompt string.
    """
    # print("Generating prompt...")
    collaborator_names_list = ", ".join(pi_names_dict.values())

    prompt = f"""
        Context:
        The following researchers ({collaborator_names_list}) are considering collaborating on a new research project focused on the topic '{research_topic}'. Below is information extracted from a database about their previous grants and roles:

        {formatted_data_string}

        Task:
        Based *only* on the information provided above, please analyze the qualifications, experience, and relevance of past work for each researcher ({collaborator_names_list}). Recommend which of these individuals would be the most suitable Principal Investigator (PI) to lead this new collaborative project on '{research_topic}'.

        Provide a detailed explanation for your recommendation. Consider factors apparent from the data, such as:
        - Direct relevance of their past research (award titles, abstracts, program elements) to the topic '{research_topic}'.
        - Demonstrated experience (e.g., number of awards listed, roles held like 'Principal Investigator').
        - Any indicators of leadership or seniority (e.g., award types like 'Career Award' if present, consistent PI roles).

        Please identify the suggested PI clearly by name.
        """
    # print("Prompt generated.")
    return prompt

In [11]:
# Please identify the suggested PI clearly by name and justify your choice thoroughly using specific evidence from the provided context. If the data is insufficient to make a strong recommendation for any particular candidate, please state that clearly as well.

In [12]:
def get_gemini_response(model: genai.GenerativeModel, prompt: str) -> Tuple[Optional[str], float]:
    """
    Sends the prompt to the Gemini model, streams the response, and measures time.

    Args:
        model: The configured Gemini model object.
        prompt: The prompt string to send to the model.

    Returns:
        A tuple containing:
            - The full response text as a string (or None if an error occurs).
            - The time taken for the API call in seconds.
    """
    # print("--- Sending Request to Gemini ---")
    start_time = time.time()
    full_response_text = ""
    contents = [prompt] # Prepare contents for the API

    try:
        responses = model.generate_content(contents, stream=True)
        # responses = model.generate_content(contents)

        # print("\n-------Response--------")
        for response in responses:
            # print(response.text, end="")
            full_response_text += response.text
        # print("\n-----------------------")
        PI_NAME = full_response_text.split('\n')[-1].strip()
        print(f"\nPI: {PI_NAME}")

        response_time = time.time() - start_time
        print(f"\nResponse generated in {response_time:.2f} seconds.")
        return full_response_text, response_time

    except AttributeError:
        response_time = time.time() - start_time
        print("\nError: 'model' object not found or not configured correctly.")
        print("Please ensure the 'model' variable holds your loaded Gemini model.")
        return None, response_time
    except Exception as e:
        response_time = time.time() - start_time
        print(f"\nAn error occurred during the API call: {e}")
        print(f"Attempt failed after {response_time:.2f} seconds.")
        return None, response_time

In [13]:
def recommend_pi(df: pd.DataFrame, model: genai.GenerativeModel, pi_ids: List[str], research_topic: str) -> Optional[str]:
    """
    Orchestrates the process of filtering data, formatting, generating prompt,
    and getting a PI recommendation from the Gemini model.

    Args:
        df: The main DataFrame.
        model: The configured Gemini model object.
        pi_ids: A list of PI IDs to consider.
        research_topic: The topic for collaboration.

    Returns:
        The recommendation text from the model, or None if an error occurred
        or essential steps failed.
    """
    print(f"\n--- Starting PI Recommendation Process for Topic: '{research_topic}' ---")

    # 1. Filter Data
    filtered_data = filter_data_by_pi(df, pi_ids)
    # Optional: Add a check here if you want to stop if no data is found at all
    # if filtered_data.empty:
    #     print("Stopping process as no data was found for any specified PI.")
    #     return None

    # 2. Format Data
    # Pass the original pi_ids list to ensure all are mentioned in formatting
    formatted_text, pi_names = format_pi_data_for_prompt(filtered_data, pi_ids)

    # 3. Generate Prompt
    prompt_text = generate_recommendation_prompt(formatted_text, pi_names, research_topic)

    # 4. Get Response
    recommendation, duration = get_gemini_response(model, prompt_text)

    # print(f"--- PI Recommendation Process Complete ({duration:.2f}s) ---")
    return recommendation

In [14]:
try:
    pi_ids_to_analyze = ['000025017', '000025762', '000030655']
    research_topic = 'STATISTICS'

    # --- Run the Recommendation Process ---
    recommendation_result = recommend_pi(df, model, pi_ids_to_analyze, research_topic)

    # Optional: Do something with the result
    if recommendation_result:
        # print("\n--- Final Recommendation Text ---")
        # print(recommendation_result) # Already printed during streaming
        pass # Result is already printed by get_gemini_response
    else:
        print("\nRecommendation could not be generated.")

except ImportError:
    print("Please install required libraries: pip install pandas google-generativeai")
except Exception as e:
    print(f"An error occurred during setup or execution: {e}")


--- Starting PI Recommendation Process for Topic: 'STATISTICS' ---
Filtering DataFrame for PI IDs: ['000025017', '000025762', '000030655']...

PI: Therefore, despite Dr. Bluestein having a greater number of overall awards and PI roles listed, Dr. MacEachern's direct, demonstrated expertise and PI experience specifically *in the field of 'STATISTICS'*, as shown by his grant titles, abstracts, and department, make him the most suitable candidate to lead a project on that topic according to the provided information.

Response generated in 15.55 seconds.


In [15]:
# Test with various topics dynamically.
for test_topic in ["knowledge graph", "AI", "Neuroscience", "STATISTICS"]:
    print("Testing with topic:", test_topic)
    recommendation_result = recommend_pi(df, model, pi_ids_to_analyze, test_topic)
    # print("Predicted PI:", pi_candidate)
    # print("Predicted Co-PIs:", co_pi_candidates)
    # print("Candidate Combined Scores:", scores)
    print("-" * 50)

Testing with topic: knowledge graph

--- Starting PI Recommendation Process for Topic: 'knowledge graph' ---
Filtering DataFrame for PI IDs: ['000025017', '000025762', '000030655']...

An error occurred during the API call: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.
Attempt failed after 15.69 seconds.
--------------------------------------------------
Testing with topic: AI

--- Starting PI Recommendation Process for Topic: 'AI' ---
Filtering DataFrame for PI IDs: ['000025017', '000025762', '000030655']...

PI: Therefore, considering the critical need for subject matter expertise in 'AI' combined with demonstrated PI experience, Steven N MacEachern is the most appropriate choice among the three, based solely on the provided data.

Response generated in 14.88 seconds.
-------------------------------------------

In [16]:
candidate_sets = [
    (['269948909', '000173003', '269886945'], 'STATISTICS'),
    (['269807623', '269794080', '269879497'], 'AI Circuits design'),
    (['269807623', '269794080', '269879497'], 'hardware software co-design'),
    (['269677663', '269988546', '270021884'], 'Trustworthy AI'),
    (['269677663', '269988546', '269814599'], 'Networking safety'),
    (['269811881', '269958535', '270083608'], 'Bioinformatics'),
    (['270082637', '269726900', '269963435'], 'Robotics'),
    (['270082637', '269726900', '270021884'], 'AI in Robotics'),
    (['269934201', '269769382', '269911544'], 'Algorithm'),
    (['269721983', '269928133', '000171581'], 'Data Science')
]

for pi_ids, topic in candidate_sets:
    recommendation_result = recommend_pi(df, model, pi_ids, topic) #recommend_pi(df, model, pi_ids_to_analyze, research_topic)
    # print("Predicted PI:", pi_candidate)
    # print("Predicted Co-PIs:", co_pi_candidates)
    # print("Candidate Combined Scores:", scores)
    print("-" * 50)
    # break


--- Starting PI Recommendation Process for Topic: 'STATISTICS' ---
Filtering DataFrame for PI IDs: ['269948909', '000173003', '269886945']...

PI: Therefore, Xin Zhang is the most suitable choice for Principal Investigator based on the evidence of direct relevance and demonstrated PI experience within the field of Statistics.

Response generated in 15.10 seconds.
--------------------------------------------------

--- Starting PI Recommendation Process for Topic: 'AI Circuits design' ---
Filtering DataFrame for PI IDs: ['269807623', '269794080', '269879497']...

PI: Therefore, Azadeh Davoodi's highly relevant research background, combined with her experience as a Principal Investigator, makes her the most qualified candidate among the three, based on the provided data, to lead a project specifically focused on 'AI Circuits design'.

Response generated in 10.18 seconds.
--------------------------------------------------

--- Starting PI Recommendation Process for Topic: 'hardware softw

In [92]:
candidate_sets = [
    (['269721983', '270082637', '269811881'], 'Data Science'),
    (['269721983', '270082637', '269811881'], 'Robotics'),
    (['269721983', '270082637', '269811881'], 'Bioinformatics')
]

for pi_ids, topic in candidate_sets:
    recommendation_result = recommend_pi(df, model, pi_ids, topic) #recommend_pi(df, model, pi_ids_to_analyze, research_topic)
    # print("Predicted PI:", pi_candidate)
    # print("Predicted Co-PIs:", co_pi_candidates)
    # print("Candidate Combined Scores:", scores)
    print("-" * 50)
    # break


--- Starting PI Recommendation Process for Topic: 'Data Science' ---
Filtering DataFrame for PI IDs: ['269721983', '270082637', '269811881']...

PI: Based *solely* on the provided grant information, Sofya Raskhodnikova's background is the most aligned with the topic 'Data Science'. Her role as a Co-Principal Investigator on a major "Foundations of Data Science Institute" grant directly demonstrates significant expertise and leadership experience within the core subject matter of the proposed project. While Jianlin Cheng has PI experience in an applied data science area (Bioinformatics) and Rodrigo O Spinola has PI experience in robotics, Sofya's explicit involvement in a foundational data science initiative makes her the most suitable candidate to lead a new collaborative project *specifically* on 'Data Science'.

Response generated in 16.25 seconds.
--------------------------------------------------

--- Starting PI Recommendation Process for Topic: 'Robotics' ---
Filtering DataFrame

In [90]:
# print(df[df.pi_id == '269721983'])

## Influencer - from a list of PI's

In [None]:
import pandas as pd
import textwrap
from typing import List, Dict, Tuple, Optional # For type hinting
import google.generativeai as genai # Assuming genai is already configured

# --- Helper Function to get Collaborators for specific awards ---
# (This is needed for format_influencer_data)
def get_collaborators_for_awards(df: pd.DataFrame, award_titles: List[str]) -> Dict[str, List[str]]:
    """
    Finds all PIs/Co-PIs associated with a list of award titles.

    Args:
        df: The main DataFrame.
        award_titles: A list of award titles.

    Returns:
        A dictionary where keys are award titles and values are lists of
        PI/Co-PI full names on that award.
    """
    collaborators = {}
    relevant_awards_df = df[df['award_title'].isin(award_titles)]
    for title in award_titles:
        # Filter for the specific award title and valid roles
        award_pis = relevant_awards_df[
            (relevant_awards_df['award_title'] == title) &
            (relevant_awards_df['role'].isin(['Principal Investigator', 'Co-Principal Investigator']))
        ]
        # Get unique names, handling potential missing names
        names = [name for name in award_pis['pi_full_name'].unique() if pd.notna(name)]
        collaborators[title] = names
    return collaborators

# --- New Function 1: Format Data for Influencer Prompt ---
def format_influencer_data(df: pd.DataFrame, pi_ids: List[str]) -> Tuple[str, Dict[str, str]]:
    """
    Formats data for selected PIs to highlight connections and field diversity
    suitable for an 'influencer' analysis prompt.

    Args:
        df: The main DataFrame containing award and PI information.
        pi_ids: A list of PI IDs to format data for.

    Returns:
        A tuple containing:
            - formatted_data_string: A string with formatted details for each PI,
              focusing on connections and fields.
            - pi_names_dict: A dictionary mapping PI ID to PI full name.
    """
    print(f"Formatting influencer data for PI IDs: {pi_ids}...")
    formatted_data = ""
    pi_names = {} # Dictionary to store PI names

    # Filter main df once for all relevant PIs to improve efficiency
    filtered_df = df[df['pi_id'].isin(pi_ids)].copy()

    if filtered_df.empty:
        print("Warning: No data found for any specified PI IDs.")
        formatted_data = "No data could be retrieved for the specified potential influencers.\n"
        for pi_id in pi_ids:
             pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name
        return formatted_data, pi_names

    # Iterate through the requested PI IDs
    for pi_id in pi_ids:
        pi_specific_data = filtered_df[filtered_df['pi_id'] == pi_id]

        if not pi_specific_data.empty:
            # Get consistent name from the first entry
            full_name = pi_specific_data['pi_full_name'].iloc[0]
            pi_names[pi_id] = full_name
            print(f"  Processing data for {full_name} ({pi_id})...")

            formatted_data += f"--- Potential Influencer: {full_name} (ID: {pi_id}) ---\n"

            # --- Project & Connection Analysis ---
            unique_award_titles = pi_specific_data['award_title'].unique()
            num_projects = len(unique_award_titles)
            formatted_data += f"Total Projects Involved In (as PI/Co-PI): {num_projects}\n"

            # Get all collaborators across these projects
            collaborators_by_award = get_collaborators_for_awards(df, list(unique_award_titles))
            all_collaborators = set()
            for title, names in collaborators_by_award.items():
                # Add collaborators, excluding the PI themselves
                all_collaborators.update(name for name in names if name != full_name)

            num_unique_collaborators = len(all_collaborators)
            formatted_data += f"Total Unique Collaborators (excluding self): {num_unique_collaborators}\n"
            # Optionally list some collaborators:
            collaborators_preview = ", ".join(list(all_collaborators)[:5]) # Preview first 5
            formatted_data += f"  Collaborators Sample: {collaborators_preview}{'...' if num_unique_collaborators > 5 else ''}\n"

            # --- Field Diversity Analysis ---
            unique_elements = pi_specific_data['program_element'].dropna().unique()
            unique_references = pi_specific_data['program_reference'].dropna().unique()
            all_fields = set(unique_elements) | set(unique_references)
            num_unique_fields = len(all_fields)
            formatted_data += f"Number of Unique Research Fields (Program Elements/References): {num_unique_fields}\n"
            # Optionally list some fields:
            fields_preview = ", ".join(list(all_fields)[:5]) # Preview first 5
            formatted_data += f"  Fields Sample: {fields_preview}{'...' if num_unique_fields > 5 else ''}\n\n"
            
            

            # --- Detailed Project List (Optional - can make prompt very long) ---
            # formatted_data += "  Projects Overview:\n"
            # for title in unique_award_titles:
            #    roles_on_project = pi_specific_data[pi_specific_data['award_title'] == title]['role'].unique()
            #    formatted_data += f"  - {title} (Roles: {', '.join(roles_on_project)})\n"
            #    formatted_data += f"    Collaborators on this project: {', '.join(c for c in collaborators_by_award.get(title, []) if c != full_name)}\n"
            # formatted_data += "\n"

        else:
            # Handle case where a specific PI ID from the list had no data
            formatted_data += f"--- Potential Influencer ID: {pi_id} ---\n"
            formatted_data += "No award data found in the provided dataset for this PI.\n\n"
            pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name

    print("Influencer data formatting complete.")
    return formatted_data, pi_names


# --- New Function 2: Generate Influencer Prompt ---
def generate_influencer_prompt(formatted_data_string: str, pi_names_dict: Dict[str, str]) -> str:
    """
    Generates the full prompt string for the Gemini model to identify influencers.

    Args:
        formatted_data_string: The formatted string containing PI details focused
                               on connections and fields.
        pi_names_dict: A dictionary mapping PI ID to PI name.

    Returns:
        The complete prompt string for influencer identification.
    """
    print("Generating influencer prompt...")
    candidate_names_list = ", ".join(pi_names_dict.values())

    prompt = f"""
        Context:
        You are an AI assistant analyzing research collaboration data to identify 'influencers'. An influencer is defined as a researcher who has significant connections within the network, demonstrated by:
        1.  Being involved (as PI or Co-PI) in a relatively high number of distinct projects/awards.
        2.  Having collaborated with a relatively high number of unique individuals.
        3.  Having experience across a diverse range of research fields (indicated by different Program Elements or Program References).

        Below is summarized data for potential influencers ({candidate_names_list}):

        {formatted_data_string}

        Task:
        Based *only* on the summarized information provided above, please analyze each researcher's profile according to the 'influencer' criteria (number of projects, number of unique collaborators, and field diversity).

        Rank these individuals ({candidate_names_list}) from most influential to least influential based on the definition provided.

        Provide a clear ranking and a concise justification for your ranking, referencing the specific metrics (project count, collaborator count, field count) for each researcher from the context provided.
    """
    print("Influencer prompt generated.")
    return prompt

# --- New Function 3: Identify Influencer using LLM ---
def identify_influencer_llm(df: pd.DataFrame, model: genai.GenerativeModel, pi_ids: List[str]) -> Optional[str]:
    """
    Orchestrates the process of formatting data, generating an influencer prompt,
    and getting a ranking from the Gemini model.

    Args:
        df: The main DataFrame.
        model: The configured Gemini model object.
        pi_ids: A list of PI IDs to consider as potential influencers.

    Returns:
        The influencer ranking text from the model, or None if an error occurred.
    """
    print(f"\n--- Starting Influencer Identification Process for PI IDs: {pi_ids} ---")

    # 1. Format Data for Influencer Analysis
    # Note: This function now focuses on connections and diversity metrics
    formatted_text, pi_names = format_influencer_data(df, pi_ids)
    print("-"*60)
    print("Formated data LLM: ", str(formatted_text)) # Print first 1000 characters for debugging
    print("-"*60)

    # Check if formatting yielded any usable data
    if not pi_names or all(name.startswith("PI ID") for name in pi_names.values()):
         print("Stopping process as no valid data could be formatted.")
         return "Could not generate influencer ranking due to lack of data for the specified PIs."

    # 2. Generate Influencer Prompt
    prompt_text = generate_influencer_prompt(formatted_text, pi_names)

    # 3. Get Response (using the existing get_gemini_response function)
    print("--- Sending Request to Gemini for Influencer Ranking ---")
    # Assuming get_gemini_response takes model and prompt, and returns (response_text, duration)
    # You might need to adapt this call slightly if your get_gemini_response has different args/return values
    ranking_result, duration = get_gemini_response(model, prompt_text) # Use your existing function

    if ranking_result:
        print(f"--- Influencer Identification Complete ({duration:.2f}s) ---")
        # The result is already printed by get_gemini_response during streaming usually
        return ranking_result
    else:
        print("--- Influencer Identification Failed ---")
        return "Failed to get influencer ranking from the model."


In [18]:
# --- Example Usage ---
# Assuming 'df' is your loaded DataFrame and 'model' is your configured Gemini model
try:
    # Example PI IDs known to have multiple projects/connections
    # Replace with IDs relevant to your analysis
    influencer_candidate_ids = ['000025762', '269811881', '269807623', '270021884'] # Example set

    # --- Run the Influencer Identification Process ---
    influencer_ranking_result = identify_influencer_llm(df, model, influencer_candidate_ids)

    # Optional: Print the final result again if not fully captured by streaming print
    if influencer_ranking_result:
        print("\n--- Final Influencer Ranking Text ---")
        print(influencer_ranking_result)
    else:
        print("\nInfluencer ranking could not be generated.")

except NameError as e:
    print(f"Error: Required variable not defined (e.g., 'df' or 'model'). Details: {e}")
except Exception as e:
    print(f"An error occurred during the influencer identification process: {e}")


--- Starting Influencer Identification Process for PI IDs: ['000025762', '269811881', '269807623', '270021884'] ---
Formatting influencer data for PI IDs: ['000025762', '269811881', '269807623', '270021884']...
  Processing data for Steven N MacEachern (000025762)...
  Processing data for Jianlin   Cheng (269811881)...
  Processing data for Dirk J Colbry (269807623)...
  Processing data for Guido F Montufar Cuartas (270021884)...
Influencer data formatting complete.
Generating influencer prompt...
Influencer prompt generated.
--- Sending Request to Gemini for Influencer Ranking ---

PI: 4.  **Jianlin Cheng:** Ranks lowest. He is tied for the lowest number of projects (2) and has the lowest number of unique collaborators (1). Although he is tied for the second highest number of unique research fields (4), his very low collaborator count indicates limited direct network connections, which is a key component of the influencer definition.

Response generated in 13.82 seconds.
--- Influenc

## Prepare Embading

In [19]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Combine relevant text columns into one (you may adjust columns as needed)
text_columns = [
    "award_type", "award_title", "abstract", 
    "org_name", "org_name2", "perf_inst_name", 
    "program_element", "program_reference"
]
df["combined_text"] = df[text_columns].astype(str).agg(" ".join, axis=1)

# a. Leadership indicator: 1 if role suggests prior leadership (e.g., contains "Principal Investigator")
df["leadership"] = df["role"].apply(lambda x: 1 if "Principal Investigator" in str(x) else 0)

# b. Experience in years: use start_date and a reference date (here we use today)
df["start_date"] = pd.to_datetime(df["start_date"], errors='coerce')
reference_date = datetime.now()  # or use a fixed project date
df["experience_years"] = (reference_date - df["start_date"]).dt.days / 365.25

# Load a pre-trained sentence transformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embedding for each award's combined text
df["text_embedding"] = df["combined_text"].apply(lambda x: embedder.encode(x))

# We assume each row has a researcher ID ("pi_id"). If a researcher has multiple rows, we aggregate.
# For aggregated text, we average the embeddings; for numeric features, we use appropriate aggregation.
award_counts = df.groupby("pi_id").size().reset_index(name="award_count")
df_grouped = df.groupby("pi_id").agg({
    "experience_years": "mean",       # average experience across awards
    "leadership": "max",              # if they have ever been a PI, mark as leadership
    "text_embedding": lambda embs: np.mean(np.stack(embs), axis=0)
}).reset_index()
df_grouped = df_grouped.merge(award_counts, on="pi_id", how="left")

# For later scoring, normalize the numeric features (experience and award_count)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_grouped[["exp_norm", "award_norm"]] = scaler.fit_transform(df_grouped[["experience_years", "award_count"]])

## Influencer by TOPIC

In [52]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Assuming SentenceTransformer 'embedder' and DataFrames 'df', 'df_grouped'
# are already loaded and computed as in your notebook (cells 17 & 18)

# --- New Function 1: Select Candidate PIs by Criterion ---
def select_candidate_pis(
    df: pd.DataFrame,
    df_grouped: pd.DataFrame,
    embedder, # Your SentenceTransformer model
    criterion_type: str, # "topic" or "department"
    criterion_value: str, # The actual topic or department name
    top_k: int = 10 # Number of top candidates to select
) -> List[str]:
    """
    Selects a list of candidate PI IDs based on a research topic or department.

    Args:
        df: The main DataFrame.
        df_grouped: DataFrame grouped by pi_id, containing aggregated info
                    and 'text_embedding'.
        embedder: The initialized SentenceTransformer model.
        criterion_type: Either 'topic' or 'department'.
        criterion_value: The specific topic string or department name.
        top_k: The maximum number of candidate IDs to return.

    Returns:
        A list of candidate PI IDs.
    """
    print(f"Selecting top {top_k} candidates based on {criterion_type}: '{criterion_value}'...")
    candidate_ids = []

    if criterion_type == "topic":
        if 'text_embedding' not in df_grouped.columns or embedder is None:
            print("Error: df_grouped with 'text_embedding' and embedder are required for topic search.")
            return []

        # Compute embedding for the research topic
        topic_emb = embedder.encode(criterion_value)

        # Calculate similarity between topic and all PIs in df_grouped
        all_embeddings = np.stack(df_grouped['text_embedding'].values)
        similarities = cosine_similarity([topic_emb], all_embeddings)[0]

        # Get indices of top k PIs sorted by similarity
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Get the corresponding PI IDs
        candidate_ids = df_grouped.iloc[top_indices]['pi_id'].tolist()

    elif criterion_type == "department":
        # Filter the main df by department (case-insensitive partial match)
        # You might want to refine this matching logic (e.g., exact match)
        dept_match_df = df[df['department'].str.contains(criterion_value, case=False, na=False)]

        if dept_match_df.empty:
            print(f"No PIs found matching department: '{criterion_value}'")
            return []

        # Get unique PI IDs from the matching departments
        unique_dept_pi_ids = dept_match_df['pi_id'].unique()

        # If more than top_k PIs, we can optionally rank them (e.g., by award count)
        # Here, we'll take the top_k based on award count from df_grouped
        if len(unique_dept_pi_ids) > top_k:
            candidate_subset = df_grouped[df_grouped['pi_id'].isin(unique_dept_pi_ids)]
            # Sort by 'award_count' (requires 'award_count' column in df_grouped)
            if 'award_count' in candidate_subset.columns:
                 ranked_candidates = candidate_subset.sort_values(by='award_count', ascending=False)
                 candidate_ids = ranked_candidates.head(top_k)['pi_id'].tolist()
            else: # Fallback if award_count isn't available
                 candidate_ids = list(unique_dept_pi_ids)[:top_k]
            print(f"  (Found {len(unique_dept_pi_ids)} PIs, selecting top {top_k} based on award count)")
        else:
            candidate_ids = list(unique_dept_pi_ids)

    else:
        print(f"Error: Invalid criterion_type '{criterion_type}'. Use 'topic' or 'department'.")
        return []

    print(f"Selected candidate PI IDs: {candidate_ids}")
    return candidate_ids

# --- New Function 2: Orchestrator for Criterion-Based Search ---
def find_influencers_by_criterion(
    df: pd.DataFrame,
    df_grouped: pd.DataFrame,
    embedder, # Your SentenceTransformer model
    model: genai.GenerativeModel, # Your Gemini model
    criterion_type: str,
    criterion_value: str,
    top_k_candidates: int = 10 # How many initial candidates to select
) -> Optional[str]:
    """
    Finds and ranks influencers based on a topic or department criterion.

    Args:
        df: Main DataFrame.
        df_grouped: Grouped DataFrame with embeddings and counts.
        embedder: SentenceTransformer model.
        model: Gemini model object.
        criterion_type: 'topic' or 'department'.
        criterion_value: The topic string or department name.
        top_k_candidates: Max number of candidates to select initially.

    Returns:
        The influencer ranking text from the model, or None/error message.
    """
    print(f"\n--- Starting Influencer Search by {criterion_type.capitalize()}: '{criterion_value}' ---")

    # 1. Select Candidate PIs based on the criterion
    candidate_pi_ids = select_candidate_pis(
        df, df_grouped, embedder, criterion_type, criterion_value, top_k=top_k_candidates
    )

    if not candidate_pi_ids:
        print("No candidates found for the specified criterion.")
        return f"Could not find potential influencers matching {criterion_type}: '{criterion_value}'."

    # 2. Proceed with the LLM analysis using the selected candidates
    # Reuse the 'identify_influencer_llm' logic (formatting, prompt, API call)
    # We pass the dynamically selected candidate_pi_ids
    print(f"\n--- Analyzing Selected Candidates for Influence ---")
    # (Assuming identify_influencer_llm structure remains similar)
    # This reuses the formatting, prompt generation and LLM call logic from before
    influencer_ranking_result = identify_influencer_llm(df, model, candidate_pi_ids)


    return influencer_ranking_result



In [21]:
# --- Example Usage ---
try:
    # --- Find Influencers by TOPIC ---
    topic_to_search = "AI Circuits design"
    topic_ranking = find_influencers_by_criterion(
        df, df_grouped, embedder, model,
        criterion_type="topic",
        criterion_value=topic_to_search,
        top_k_candidates=5 # Select top 5 PIs based on topic similarity first
    )
    if topic_ranking:
        print(f"\n--- Final Influencer Ranking for Topic '{topic_to_search}' ---")
        print(topic_ranking)

except NameError as e:
     print(f"Error: Required variable not defined (e.g., 'df', 'df_grouped', 'embedder', 'model'). Details: {e}")
except Exception as e:
     print(f"An error occurred during the influencer search process: {e}")


--- Starting Influencer Search by Topic: 'AI Circuits design' ---
Selecting top 5 candidates based on topic: 'AI Circuits design'...
Selected candidate PI IDs: ['269757420', '000166285', '269951871', '269674418', '269738150']

--- Analyzing Selected Candidates for Influence ---

--- Starting Influencer Identification Process for PI IDs: ['269757420', '000166285', '269951871', '269674418', '269738150'] ---
Formatting influencer data for PI IDs: ['269757420', '000166285', '269951871', '269674418', '269738150']...
  Processing data for Peng   Li (269757420)...
  Processing data for Andreas G Andreou (000166285)...
  Processing data for Dorit S Hochbaum (269951871)...
  Processing data for Alper   Atamturk (269674418)...
  Processing data for Charles B Pierre (269738150)...
Influencer data formatting complete.
Generating influencer prompt...
Influencer prompt generated.
--- Sending Request to Gemini for Influencer Ranking ---

PI: 3.  **Dorit S Hochbaum, Alper Atamturk, and Charles B Pier

In [218]:
scholar_data = pd.read_csv('data/scholer_recommendation.csv')
scholar_data = scholar_data.drop(columns=["Abstract", "Keywords"], axis=1)
scholar_data.rename(columns={'Fields of Study': 'Discipline', 'Category': 'Topic'}, inplace=True)
scholar_data.shape

(10000, 9)

In [219]:
def duplicate_row_check(df):
    duplicate_list = []
    previous_row = 0
    D_count = 0
    for index, row in df.iterrows():
        if index > 0:
            is_match = (previous_row == row).all()
            if is_match:
                duplicate_list.append(index)
            D_count = D_count + is_match
        previous_row = row.copy()
    return D_count, duplicate_list

In [220]:
duplicate_count, duplicate_list = duplicate_row_check(scholar_data)
scholar_data['Authors_list'] = scholar_data['Authors'].str.split(',')
scholar_data = scholar_data.explode('Authors_list').reset_index(drop=True)
scholar_data["Authors"] = scholar_data["Authors_list"]
scholar_data.drop(["Authors_list"], axis=1, inplace=True)

scholar_data['Discipline_list'] = scholar_data['Discipline'].str.split(',')
scholar_data = scholar_data.explode('Discipline_list').reset_index(drop=True)
scholar_data["Discipline"] = scholar_data["Discipline_list"]

duplicate_count, duplicate_list = duplicate_row_check(scholar_data)
scholar_data.drop(duplicate_list, inplace=True)
scholar_data.reset_index(drop=True, inplace=True)
duplicate_count = duplicate_row_check(scholar_data)

scholar_data.dropna(inplace=True)
duplicate_count = duplicate_row_check(scholar_data)

def categorize_venue(venue):
    if 'conference' in venue.lower() or 'symposium' in venue.lower() or 'workshop' in venue.lower():
        return 'Conference'
    elif 'journal' in venue.lower() or 'transactions' in venue.lower() or 'letters' in venue.lower():
        return 'Journal'
    else:
        return 'Other'

scholar_data['Venue Type'] = scholar_data['Venue'].apply(categorize_venue)
scholar_data = scholar_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
duplicate_count = duplicate_row_check(scholar_data)

scholar_data.drop(duplicate_list, inplace=True)
scholar_data.reset_index(drop=True, inplace=True)
duplicate_count = duplicate_row_check(scholar_data)



  scholar_data = scholar_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [221]:
scholar_data.columns

Index(['Title', 'Authors', 'Year', 'Venue', 'URL', 'Open Access', 'Discipline',
       'Citations', 'Topic', 'Discipline_list', 'Venue Type'],
      dtype='object')

In [222]:
scholar_data.head(5)

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic,Discipline_list,Venue Type
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other


In [223]:
max(df_combined.pi_id)

'270123562'

In [224]:
scholar_data.head()

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic,Discipline_list,Venue Type
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other


In [225]:
existing_authors_map = df_combined.drop_duplicates(subset=['pi_full_name'], keep='first') \
                                 .set_index('pi_full_name')['pi_id'] \
                                 .to_dict()

# --- Step 3: Map Existing IDs to scholar_data ---
# At this point, scholar_data['pi_id'] can contain numbers, strings, or NaN
scholar_data['pi_id'] = scholar_data['Authors'].map(existing_authors_map)

print("\n--- scholar_data after mapping existing IDs (raw values) ---")
print(scholar_data)

# --- Step 4: Identify New Authors ---
# These are authors in scholar_data whose pi_id is still NaN
new_authors_mask = scholar_data['pi_id'].isnull()
unique_new_author_names = scholar_data.loc[new_authors_mask, 'Authors'].unique()

# --- Step 5: Determine Starting ID for New Authors (Numeric Calculation) ---
max_existing_id = 0 # Default if no valid numeric IDs are found
if not df_combined.empty and 'pi_id' in df_combined.columns:
    # Attempt to convert pi_id to numeric.
    # errors='coerce' will turn non-numeric values (like 'INVALID_ID_EXAMPLE') into NaN
    numeric_pi_ids = pd.to_numeric(df_combined['pi_id'], errors='coerce')

    if not numeric_pi_ids.dropna().empty:
        # Get the maximum of the valid numeric IDs and ensure it's an integer
        max_existing_id = int(numeric_pi_ids.max())
    # else: max_existing_id remains 0
# else: max_existing_id remains 0

next_new_id = max_existing_id + 1

print(f"\nMax existing numeric ID in df_combined: {max_existing_id}")
print(f"Next available numeric ID for new authors: {next_new_id}")
print(f"Unique new author names: {unique_new_author_names}")

# --- Step 6 & 7: Assign IDs to New Unique Authors and Fill ---
# The IDs assigned here will be integers
if len(unique_new_author_names) > 0:
    new_author_id_map = {
        name: i # i will be an integer
        for i, name in enumerate(unique_new_author_names, start=int(next_new_id))
    }
    print("\nMapping for new authors (numeric IDs):")
    print(new_author_id_map)

    # Apply this map to the rows where pi_id is still NaN
    scholar_data.loc[new_authors_mask, 'pi_id'] = scholar_data.loc[new_authors_mask, 'Authors'].map(new_author_id_map)

print("\n--- scholar_data after assigning new numeric IDs (mixed values) ---")
print(scholar_data)

# --- Step 8: Convert all pi_id in scholar_data to 9-digit strings ---
if 'pi_id' in scholar_data.columns:
    def format_to_9_digits(value):
        if pd.isna(value):
            # For entries that are NaN (e.g., an author was in neither df_combined
            # nor got a new ID, though logic should cover all). Or if an ID became NaN.
            return '' # Or use '000000000' or None or 'MISSINGID'
        try:
            # Convert to float first (to handle "123.0" like strings), then to int
            numeric_value = int(float(str(value)))
            # Format as a 9-digit string with leading zeros if needed
            return f"{numeric_value:09d}"
        except ValueError:
            # If the value cannot be converted to an int (e.g., "INVALID_ID_EXAMPLE" or "Unknown")
            # Return it as an empty string, or a specific placeholder string.
            # Since the requirement is "9 digit long", non-numeric original IDs become empty.
            return '' # Or str(value) to keep original, or 'INVALIDID'

    scholar_data['pi_id'] = scholar_data['pi_id'].apply(format_to_9_digits)

print("\n--- Final scholar_data with 9-digit string pi_id ---")
print(scholar_data)

# Verification
# if 'pi_id' in scholar_data.columns and not scholar_data.empty:
#     print("\nSample of pi_id format and type:")
#     for idx, row in scholar_data.head().iterrows():
#         print(f"Author: {row['Authors']}, PI_ID: '{row['pi_id']}', Type: {type(row['pi_id'])}, Length: {len(row['pi_id'])}")


--- scholar_data after mapping existing IDs (raw values) ---
                                                   Title             Authors  \
0      Fashion-MNIST: a Novel Image Dataset for Bench...            Han Xiao   
1      Fashion-MNIST: a Novel Image Dataset for Bench...            Han Xiao   
2      Fashion-MNIST: a Novel Image Dataset for Bench...        Kashif Rasul   
3      Fashion-MNIST: a Novel Image Dataset for Bench...        Kashif Rasul   
4      Fashion-MNIST: a Novel Image Dataset for Bench...     Roland Vollgraf   
...                                                  ...                 ...   
80037  Competing Memes Propagation on Networks: A Net...  Nicholas C. Valler   
80038  Competing Memes Propagation on Networks: A Net...   B. Aditya Prakash   
80039  Competing Memes Propagation on Networks: A Net...      Iulian Neamtiu   
80040  Competing Memes Propagation on Networks: A Net...        M. Faloutsos   
80041  Competing Memes Propagation on Networks: A Net...  

In [226]:
scholar_data.head(5)

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic,Discipline_list,Venue Type,pi_id
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123563
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123563
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123564
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123564
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123565


In [227]:
min(scholar_data.pi_id), max(scholar_data.pi_id), max(df_combined.pi_id)

('000023570', '270128830', '270123562')

In [170]:
scholar_data.head()

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic,Discipline_list,Venue Type,pi_id
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123563
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123564
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123565
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123566
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123567


In [228]:
scholar_data.to_csv("data/Scholarly.csv", index=False)

In [169]:
# # Ensure 'pi_full_name' is unique by dropping duplicates
# df_unique = df.drop_duplicates(subset='pi_full_name')

# # Add a new column 'pi_id' to scholar_data by checking if Authors exist in df.pi_full_name
# scholar_data['pi_id'] = scholar_data['Authors'].map(df_unique.set_index('pi_full_name')['pi_id'])

# # Check the result
scholar_data.head()

Unnamed: 0,Title,Authors,Year,Venue,URL,Open Access,Discipline,Citations,Topic,Discipline_list,Venue Type,pi_id
0,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123563
1,Fashion-MNIST: a Novel Image Dataset for Bench...,Han Xiao,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123564
2,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123565
3,Fashion-MNIST: a Novel Image Dataset for Bench...,Kashif Rasul,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Mathematics,8026,Machine Learning,Mathematics,Other,270123566
4,Fashion-MNIST: a Novel Image Dataset for Bench...,Roland Vollgraf,2017.0,arXiv.org,https://www.semanticscholar.org/paper/f9c602cc...,No,Computer Science,8026,Machine Learning,Computer Science,Other,270123567


In [140]:
# Filter df to get matched authors based on scholar_data['Authors']
matched_authors = df[df['pi_full_name'].isin(scholar_data['Authors'])]

# Print the matched pi_id and pi_full_name
matched_authors[['pi_id', 'pi_full_name']].head()


Unnamed: 0,pi_id,pi_full_name
1063,269911546,Henry Han
1064,269914395,Yongmin Liu
1572,270016963,Peng Li
2426,270035204,Xi Chen
3212,269975632,Yu Wang


In [152]:
matched_authors.shape

(108, 18)

In [155]:
len(scholar_data.Authors.unique()), scholar_data.Authors.unique().tolist()[:2]

(5337, ['Han Xiao', 'Kashif Rasul'])

In [156]:
# scholar_data['Walsh' in scholar_data.Authors][['Authors', 'pi_id']]

In [141]:
df[df.pi_id == '269911546']

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date,combined_text,leadership,experience_years,text_embedding
1063,Continuing Grant,SHINE: Understanding the Physical Connection o...,Understanding the solar wind is crucial to spa...,Directorate for Geosciences,Division of Atmospheric and Geospace Sciences,Regents of the University of Michigan - Ann Arbor,SOLAR-TERRESTRIAL,COVID-Disproportionate Impcts Inst-Indiv,269911546,Henry Han,Co-Principal Investigator,Computer Science,Henry_Han@baylor.edu,2022-07-25,Continuing Grant SHINE: Understanding the Phys...,1,2.787132,"[-0.031365495, 0.041547813, 0.100492634, 0.065..."


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import xgboost as xgb

# Copy original data
df_XGB = df_combined.copy()

# --- Prepare topics and heuristic labels ---
df_XGB['topic'] = df_XGB['award_title'].apply(lambda x: x.split()[0])
df_XGB['topic_id'] = df_XGB.groupby('topic').ngroup()
df_XGB['heuristic_score'] = (
    df_XGB['leadership'] * 2 +
    df_XGB['experience_years'] +
    df_XGB['award_norm']
)
df_XGB['label'] = df_XGB.groupby('topic_id')['heuristic_score'].transform(
    lambda x: (x == x.max()).astype(int)
)

# --- Flatten embeddings ---
embeddings = pd.DataFrame(df_XGB['text_embedding'].tolist(), index=df_XGB.index)
embeddings.columns = [f'emb_{i}' for i in range(embeddings.shape[1])]
df_XGB = pd.concat([df_XGB, embeddings], axis=1)

# --- Feature and model DataFrame ---
# nfeature_cols = ['leadership', 'experience_years', 'award_norm'] + list(embeddings.columns)
nfeature_cols = ['leadership', 'experience_years', 'award_norm'] + list(embeddings.columns)
model_df = df_XGB[['pi_id', 'topic_id', 'label'] + nfeature_cols].copy()

# --- Train/Test Split ---
train_df, test_df = train_test_split(
    model_df, test_size=0.2, stratify=model_df['label'], random_state=42
)
train_groups = train_df.groupby('topic_id').size().to_numpy()
test_groups = test_df.groupby('topic_id').size().to_numpy()

# --- Prepare for XGBoost ---
X_train = train_df[nfeature_cols].to_numpy()
y_train = train_df['label'].to_numpy()
X_test = test_df[nfeature_cols].to_numpy()
y_test = test_df['label'].to_numpy()

dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

# --- Train XGBoost Ranking Model ---
params = {
    'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0,
    'min_child_weight': 0.1, 'max_depth': 6, 'verbosity': 1,
    'eval_metric': 'ndcg@1'
}
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# --- Evaluate on held-out test split ---
results_df = test_df.copy()
results_df['score'] = model.predict(dtest)
ranked_df = results_df.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)
ranked_df['predicted_label'] = ranked_df.groupby('topic_id').cumcount().apply(lambda x: 1 if x == 0 else 0)
precision, recall, f1, accuracy = (
    precision_score(ranked_df['label'], ranked_df['predicted_label']),
    recall_score(ranked_df['label'], ranked_df['predicted_label']),
    f1_score(ranked_df['label'], ranked_df['predicted_label']),
    accuracy_score(ranked_df['label'], ranked_df['predicted_label'])
)
print("Held-out test split evaluation:")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

# --- Evaluate on custom candidate sets ---
candidate_sets = [
    (['269948909', '000173003', '269886945'], 'STATISTICS'),
    (['269807623', '269794080', '269879497'], 'AI Circuits design'),
    (['269807623', '269794080', '269879497'], 'hardware software co-design'),
    (['269677663', '269988546', '270021884'], 'Trustworthy AI'),
    (['269677663', '269988546', '269814599'], 'Networking safety'),
    (['269811881', '269958535', '270083608'], 'Bioinformatics'),
    (['270082637', '269726900', '269963435'], 'Robotics'),
    (['270082637', '269726900', '270021884'], 'AI in Robotics'),
    (['269934201', '269769382', '269911544'], 'Algorithm'),
    (['269721983', '269928133', '000171581'], 'Data Science')
]

all_true, all_pred = [], []
for ids, topic_str in candidate_sets:
    # Build small group
    group_df = df_XGB[df_XGB['pi_id'].isin(ids)].copy()
    # Use heuristic as true label
    group_df['true_label'] = (group_df['heuristic_score'] == group_df['heuristic_score'].max()).astype(int)
    # Features
    X_group = group_df[nfeature_cols].to_numpy()
    # Predict scores
    scores = model.predict(xgb.DMatrix(X_group))
    group_df['score'] = scores
    # Predicted label: top score = 1
    sorted_idx = np.argsort(-scores)
    preds = np.zeros(len(scores), dtype=int)
    preds[sorted_idx[0]] = 1
    # Collect
    all_true.extend(group_df['true_label'].tolist())
    all_pred.extend(preds.tolist())
    # Print group result
    print(f"\nTopic: {topic_str}")
    print(group_df[['pi_id', 'true_label']].assign(predicted=preds))

# Metrics on candidate sets
p2, r2, f12, acc2 = (
    precision_score(all_true, all_pred),
    recall_score(all_true, all_pred),
    f1_score(all_true, all_pred),
    accuracy_score(all_true, all_pred)
)
print("\nCustom candidate sets evaluation:")
print(f"Precision: {p2:.4f}, Recall: {r2:.4f}, F1: {f12:.4f}, Accuracy: {acc2:.4f}")


[0]	train-ndcg@1:0.89344	test-ndcg@1:0.90506
[1]	train-ndcg@1:0.92088	test-ndcg@1:0.90506
[2]	train-ndcg@1:0.93632	test-ndcg@1:0.90549
[3]	train-ndcg@1:0.94318	test-ndcg@1:0.91181
[4]	train-ndcg@1:0.95047	test-ndcg@1:0.91181
[5]	train-ndcg@1:0.95304	test-ndcg@1:0.91392
[6]	train-ndcg@1:0.95819	test-ndcg@1:0.91730
[7]	train-ndcg@1:0.96162	test-ndcg@1:0.91603
[8]	train-ndcg@1:0.96398	test-ndcg@1:0.91899
[9]	train-ndcg@1:0.96805	test-ndcg@1:0.92068
[10]	train-ndcg@1:0.97063	test-ndcg@1:0.91941
[11]	train-ndcg@1:0.97213	test-ndcg@1:0.92236
[12]	train-ndcg@1:0.97491	test-ndcg@1:0.91941
[13]	train-ndcg@1:0.97620	test-ndcg@1:0.92110
[14]	train-ndcg@1:0.97877	test-ndcg@1:0.92236
[15]	train-ndcg@1:0.98049	test-ndcg@1:0.92447
[16]	train-ndcg@1:0.98070	test-ndcg@1:0.92405
[17]	train-ndcg@1:0.98349	test-ndcg@1:0.92321
[18]	train-ndcg@1:0.98435	test-ndcg@1:0.92152
[19]	train-ndcg@1:0.98521	test-ndcg@1:0.92152
[20]	train-ndcg@1:0.98542	test-ndcg@1:0.92194
[21]	train-ndcg@1:0.98563	test-ndcg@1:0.9215

  ranked_df = results_df.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)


In [134]:
df.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date,combined_text,leadership,experience_years,text_embedding
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,Co-Principal Investigator,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,Principal Investigator,,pratul.agarwal@okstate.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,Co-Principal Investigator,Department of Chemistry,,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,Co-Principal Investigator,Computer Science,bhsu@ksu.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
7,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,Co-Principal Investigator,Engineering,rofleming@AState.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."


In [103]:
# Merge the columns from df_grouped into df based on pi_id
df_combined = df.merge(df_grouped[['pi_id', 'award_count', 'award_norm', 'exp_norm']], on='pi_id', how='left')

In [119]:
df_combined.shape

(83112, 409)

In [63]:
df_combined.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,...,department,email,start_date,combined_text,leadership,experience_years,text_embedding,award_count,award_norm,exp_norm
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,...,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0....",1,0.0,0.099395
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,...,,pratul.agarwal@okstate.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0....",2,0.028571,0.684157
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,...,Department of Chemistry,,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0....",1,0.0,0.099395
3,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,...,Computer Science,bhsu@ksu.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0....",1,0.0,0.481332
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,...,Engineering,rofleming@AState.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0....",3,0.057143,0.368651


In [75]:
# df_combined['program_element'].unique()

In [76]:
# df_combined['program_reference'].unique()

In [64]:
df_combined.to_csv('data/combined_data.csv', index=False)

In [58]:
df_grouped.head()

Unnamed: 0,pi_id,experience_years,leadership,text_embedding,award_count,exp_norm,award_norm
0,84,1.572895,1,"[0.0027017621, 0.026450962, 0.04872246, 0.0352...",2,0.262109,0.028571
1,94,3.008898,1,"[-0.026076434, -0.012992876, -0.023296108, 0.0...",1,0.526741,0.0
2,132,3.780972,1,"[-0.0791296, 0.043611363, -0.075964525, 0.0038...",1,0.669021,0.0
3,390,2.30527,1,"[-0.09878656, -0.029403467, 0.008581905, 0.061...",1,0.397074,0.0
4,453,2.255989,1,"[-0.064013764, -0.05129957, -0.045931555, 0.03...",2,0.387992,0.028571


## Influencers by DEPARTMENT

In [113]:
df.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date,combined_text,leadership,experience_years,text_embedding
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,Co-Principal Investigator,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,Principal Investigator,,pratul.agarwal@okstate.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,Co-Principal Investigator,Department of Chemistry,,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,Co-Principal Investigator,Computer Science,bhsu@ksu.edu,2022-08-03,Standard Grant MRI: Acquisition of a High-Perf...,1,2.762491,"[-0.0059216134, -0.061306752, -0.043238774, 0...."
7,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,Co-Principal Investigator,Engineering,rofleming@AState.edu,2024-08-29,Standard Grant MRI: Acquisition of a High-Perf...,1,0.689938,"[-0.0059216134, -0.061306752, -0.043238774, 0...."


In [68]:
# --- Example Usage ---
try:
    # --- Find Influencers by DEPARTMENT ---
    dept_to_search = "Computer Science"
    dept_ranking = find_influencers_by_criterion(
        df, df_grouped, embedder, model,
        criterion_type="department",
        criterion_value=dept_to_search,
        top_k_candidates=10 # Select top 10 PIs from this department (ranked by awards)
    )
    if dept_ranking:
        print(f"\n--- Final Influencer Ranking for Department '{dept_to_search}' ---")
        print(dept_ranking)

except NameError as e:
     print(f"Error: Required variable not defined (e.g., 'df', 'df_grouped', 'embedder', 'model'). Details: {e}")
except Exception as e:
     print(f"An error occurred during the influencer search process: {e}")


--- Starting Influencer Search by Department: 'Computer Science' ---
Selecting top 10 candidates based on department: 'Computer Science'...
  (Found 2938 PIs, selecting top 10 based on award count)
Selected candidate PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242']

--- Analyzing Selected Candidates for Influence ---

--- Starting Influencer Identification Process for PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242'] ---
Formatting influencer data for PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242']...
  Processing data for Yanfang   Ye (269935164)...
************************************************************
--- Potential Influencer: Yanfang   Ye (ID: 269935164) ---
Total Projects Involved In (as PI/Co-PI): 13
Total U

In [23]:
# df[df['pi_id'].isin(pi_ids_to_analyze)][['pi_full_name', 'pi_id']]

In [24]:
# --- Prerequisite Imports and Setup ---
# Make sure you have these imports and objects loaded from your notebook
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai # Assuming 'model' is configured
# from sentence_transformers import SentenceTransformer # Assuming 'embedder' is loaded
import networkx as nx # For Method 7
from typing import List, Dict, Tuple, Optional
import collections # For Method 4 helper

# Assuming 'df', 'df_grouped', 'embedder', 'model' are loaded and preprocessed
# Assuming 'get_collaborators_for_awards', 'format_influencer_data',
# 'generate_influencer_prompt', 'identify_influencer_llm',
# 'select_candidate_pis', 'find_influencers_by_criterion',
# 'get_gemini_response' functions exist as defined previously.

# --- Method 4: Inter-Institutional Collaboration ---

# Helper function to get collaborator institutions
def get_collaborator_institutions(df: pd.DataFrame, pi_id: str, collaborators_names: List[str]) -> collections.Counter:
    """
    Finds the institutions of a given list of collaborators.

    Args:
        df: The main DataFrame.
        pi_id: The ID of the main PI (to exclude their own institution if needed).
        collaborators_names: A list of full names of the collaborators.

    Returns:
        A Counter object mapping institution names to their frequency.
    """
    if not collaborators_names:
        return collections.Counter()

    # Find entries for the collaborators
    collaborator_df = df[df['pi_full_name'].isin(collaborators_names)]

    # Get institutions, excluding NaNs and potentially the main PI's primary institution
    # For simplicity here, we count all unique institutions associated with collaborators
    institutions = collaborator_df['perf_inst_name'].dropna().unique()

    # We can return a Counter of unique institutions found for collaborators
    # For influence, we mostly care about the *number* of unique institutions
    return collections.Counter(institutions)


# Modify format_influencer_data to include institutional diversity
def format_influencer_data_v2(df: pd.DataFrame, pi_ids: List[str]) -> Tuple[str, Dict[str, str]]:
    """
    V2: Formats data including project/collaborator counts, field diversity,
        AND inter-institutional collaboration breadth.
    """
    print(f"Formatting influencer data (v2) for PI IDs: {pi_ids}...")
    formatted_data = ""
    pi_names = {}
    filtered_df = df[df['pi_id'].isin(pi_ids)].copy()

    if filtered_df.empty:
         # (Same empty handling as before)
         print("Warning: No data found for any specified PI IDs.")
         formatted_data = "No data could be retrieved for the specified potential influencers.\n"
         for pi_id in pi_ids:
             pi_names[pi_id] = f"PI ID {pi_id}"
         return formatted_data, pi_names

    for pi_id in pi_ids:
        pi_specific_data = filtered_df[filtered_df['pi_id'] == pi_id]

        if not pi_specific_data.empty:
            full_name = pi_specific_data['pi_full_name'].iloc[0]
            pi_names[pi_id] = full_name
            print(f"  Processing data for {full_name} ({pi_id})...")
            formatted_data += f"--- Potential Influencer: {full_name} (ID: {pi_id}) ---\n"

            # --- Project & Connection Analysis ---
            unique_award_titles = pi_specific_data['award_title'].unique()
            num_projects = len(unique_award_titles)
            formatted_data += f"Total Projects Involved In: {num_projects}\n"
            collaborators_by_award = get_collaborators_for_awards(df, list(unique_award_titles))
            all_collaborators = set(name for names in collaborators_by_award.values() for name in names if name != full_name)
            num_unique_collaborators = len(all_collaborators)
            formatted_data += f"Total Unique Collaborators: {num_unique_collaborators}\n"

            # --- Field Diversity Analysis ---
            unique_elements = pi_specific_data['program_element'].dropna().unique()
            unique_references = pi_specific_data['program_reference'].dropna().unique()
            all_fields = set(unique_elements) | set(unique_references)
            num_unique_fields = len(all_fields)
            formatted_data += f"Number of Unique Research Fields: {num_unique_fields}\n"

            # --- NEW: Inter-Institutional Collaboration ---
            collaborator_institutions = get_collaborator_institutions(df, pi_id, list(all_collaborators))
            num_unique_collab_institutions = len(collaborator_institutions)
            formatted_data += f"Number of Unique Collaborating Institutions: {num_unique_collab_institutions}\n"
            # Optionally list some institutions
            inst_preview = ", ".join(list(collaborator_institutions.keys())[:3])
            formatted_data += f"  Collaborating Institutions Sample: {inst_preview}{'...' if num_unique_collab_institutions > 3 else ''}\n\n"

        else:
             # (Same handling for missing PI as before)
             formatted_data += f"--- Potential Influencer ID: {pi_id} ---\n"
             formatted_data += "No award data found...\n\n"
             pi_names[pi_id] = f"PI ID {pi_id}"
        
    print("Influencer data formatting (v2) complete.")
    return formatted_data, pi_names

# Modify generate_influencer_prompt to include the new criterion
def generate_influencer_prompt_v2(formatted_data_string: str, pi_names_dict: Dict[str, str]) -> str:
    """
    V2: Generates prompt asking LLM to consider projects, collaborators,
        field diversity, AND institutional diversity.
    """
    print("Generating influencer prompt (v2)...")
    candidate_names_list = ", ".join(pi_names_dict.values())

    prompt = f"""
Context:
You are an AI assistant analyzing research collaboration data to identify 'influencers'. An influencer is defined as a researcher who has significant connections and reach, demonstrated by:
1.  High number of distinct projects/awards involved in.
2.  High number of unique collaborators worked with.
3.  Experience across a diverse range of research fields.
4.  Collaboration with individuals from a wide range of different institutions.

Below is summarized data for potential influencers ({candidate_names_list}):

{formatted_data_string}

Task:
Based *only* on the summarized information provided above, please analyze each researcher's profile according to the 'influencer' criteria (project count, collaborator count, field count, AND collaborating institution count).

Rank these individuals ({candidate_names_list}) from most influential to least influential based on this definition.

Provide a clear ranking and a concise justification for your ranking, referencing the specific metrics provided for each researcher.
"""
    print("Influencer prompt (v2) generated.")
    return prompt

# New Orchestrator using V2 functions
def identify_influencer_llm_v2(df: pd.DataFrame, model: genai.GenerativeModel, pi_ids: List[str]) -> Optional[str]:
    """ V2: Orchestrator using formatting and prompt that include institutional diversity. """
    print(f"\n--- Starting Influencer Identification Process (V2 - Incl. Institutions) for PI IDs: {pi_ids} ---")
    formatted_text, pi_names = format_influencer_data_v2(df, pi_ids) # Use V2 format
    print("----------------------------------------------------------")
    print("Formatted data for influencer analysis (v2):", formatted_text)
    print("----------------------------------------------------------")
    if not pi_names or all(name.startswith("PI ID") for name in pi_names.values()):
         # (Same error handling)
         return "Could not generate influencer ranking due to lack of data."

    prompt_text = generate_influencer_prompt_v2(formatted_text, pi_names) # Use V2 prompt
    print("--- Sending Request to Gemini for Influencer Ranking (V2) ---")
    ranking_result, duration = get_gemini_response(model, prompt_text)

    if ranking_result:
        print(f"--- Influencer Identification (V2) Complete ({duration:.2f}s) ---")
        return ranking_result
    else:
        print("--- Influencer Identification (V2) Failed ---")
        return "Failed to get influencer ranking (V2) from the model."

# --- Method 5: Specific Award Types ---

# Modify select_candidate_pis to handle 'award_type'
def select_candidate_pis_v2(
    df: pd.DataFrame,
    df_grouped: pd.DataFrame,
    embedder,
    criterion_type: str, # topic, department, award_type
    criterion_value: str,
    top_k: int = 10
) -> List[str]:
    """ V2: Selects candidates based on topic, department, OR award_type. """
    print(f"Selecting top {top_k} candidates based on {criterion_type}: '{criterion_value}'...")
    candidate_ids = []

    if criterion_type == "topic":
        # (Same logic as before)
        if 'text_embedding' not in df_grouped.columns or embedder is None: return []
        topic_emb = embedder.encode(criterion_value)
        all_embeddings = np.stack(df_grouped['text_embedding'].values)
        similarities = cosine_similarity([topic_emb], all_embeddings)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]
        candidate_ids = df_grouped.iloc[top_indices]['pi_id'].tolist()

    elif criterion_type == "department":
        # (Same logic as before)
        dept_match_df = df[df['department'].str.contains(criterion_value, case=False, na=False)]
        if dept_match_df.empty: return []
        unique_dept_pi_ids = dept_match_df['pi_id'].unique()
        if len(unique_dept_pi_ids) > top_k:
            candidate_subset = df_grouped[df_grouped['pi_id'].isin(unique_dept_pi_ids)]
            if 'award_count' in candidate_subset.columns:
                 ranked_candidates = candidate_subset.sort_values(by='award_count', ascending=False)
                 candidate_ids = ranked_candidates.head(top_k)['pi_id'].tolist()
            else: candidate_ids = list(unique_dept_pi_ids)[:top_k]
        else: candidate_ids = list(unique_dept_pi_ids)

    # --- NEW: Award Type Logic ---
    elif criterion_type == "award_title":
        # Filter main df by the specific award type (case-insensitive partial match)
        award_match_df = df[df['award_title'].str.contains(criterion_value, case=False, na=False)]
        if award_match_df.empty:
            print(f"No PIs found associated with award type: '{criterion_value}'")
            return []
        unique_award_pi_ids = award_match_df['pi_id'].unique()

        # Rank by award count within this group if needed
        if len(unique_award_pi_ids) > top_k:
            candidate_subset = df_grouped[df_grouped['pi_id'].isin(unique_award_pi_ids)]
            if 'award_count' in candidate_subset.columns:
                 ranked_candidates = candidate_subset.sort_values(by='award_count', ascending=False)
                 candidate_ids = ranked_candidates.head(top_k)['pi_id'].tolist()
            else: candidate_ids = list(unique_award_pi_ids)[:top_k]
            print(f"  (Found {len(unique_award_pi_ids)} PIs, selecting top {top_k} based on award count)")
        else:
            candidate_ids = list(unique_award_pi_ids)
    # --- End New Logic ---

    else:
        print(f"Error: Invalid criterion_type '{criterion_type}'. Use 'topic', 'department', or 'award_title'.")
        return []

    print(f"Selected candidate PI IDs: {candidate_ids}")
    return candidate_ids

# Modify the main orchestrator to use the updated selector
def find_influencers_by_criterion_v2(
    df: pd.DataFrame,
    df_grouped: pd.DataFrame,
    embedder,
    model: genai.GenerativeModel,
    criterion_type: str, # topic, department, award_type
    criterion_value: str,
    top_k_candidates: int = 10
) -> Optional[str]:
    """ V2: Finds influencers based on topic, department, OR award_type criterion. """
    print(f"\n--- Starting Influencer Search (V2 Selector) by {criterion_type.capitalize()}: '{criterion_value}' ---")

    # 1. Select Candidate PIs using the V2 selector
    candidate_pi_ids = select_candidate_pis_v2( # Use V2 selector
        df, df_grouped, embedder, criterion_type, criterion_value, top_k=top_k_candidates
    )

    if not candidate_pi_ids:
        # (Same error handling)
        return f"Could not find candidates matching {criterion_type}: '{criterion_value}'."

    # 2. Proceed with LLM analysis (using V1 or V2 formatting/prompting as desired)
    # Using V2 here to include institutional diversity analysis as well
    print(f"\n--- Analyzing Selected Candidates for Influence (V2 - Incl. Institutions) ---")
    influencer_ranking_result = identify_influencer_llm_v2(df, model, candidate_pi_ids)

    return influencer_ranking_result


# --- Method 6: Hybrid Approach (Example: Topic + Institutions in Prompt) ---

# We can reuse `find_influencers_by_criterion_v2` but need a modified prompt generator
# that explicitly tells the LLM to weigh two factors.

def generate_influencer_prompt_hybrid(
    formatted_data_string: str,
    pi_names_dict: Dict[str, str],
    primary_criterion: str, # e.g., "Topic Relevance to 'AI'"
    secondary_criterion: str # e.g., "Breadth of Institutional Collaboration"
    ) -> str:
    """ Hybrid: Asks LLM to rank based on two weighted criteria. """
    print("Generating influencer prompt (Hybrid)...")
    candidate_names_list = ", ".join(pi_names_dict.values())

    prompt = f"""
Context:
You are an AI assistant identifying 'influencers' based on multiple factors.
Primary Factor: {primary_criterion}
Secondary Factor: {secondary_criterion}

Below is data for potential influencers ({candidate_names_list}), including metrics for projects, collaborators, fields, and collaborating institutions:

{formatted_data_string}

Task:
Based *only* on the summarized information provided, please rank these individuals ({candidate_names_list}) from most influential to least influential.

Your ranking should primarily consider the **Primary Factor ({primary_criterion})**. Then, among those who rank highly on the primary factor, give preference based on the **Secondary Factor ({secondary_criterion})**.

Provide a clear ranking and justify your reasoning by referencing the specific metrics and how they relate to both factors.
"""
    print("Influencer prompt (Hybrid) generated.")
    return prompt

# Orchestrator for Hybrid approach
def find_influencers_hybrid(
    df: pd.DataFrame,
    df_grouped: pd.DataFrame,
    embedder,
    model: genai.GenerativeModel,
    primary_criterion_type: str, # e.g., "topic"
    primary_criterion_value: str, # e.g., "STATISTICS"
    secondary_criterion_desc: str, # e.g., "Breadth of Institutional Collaboration"
    top_k_candidates: int = 10
) -> Optional[str]:
    """ Hybrid: Selects on primary, then asks LLM to rank using primary+secondary factors. """

    primary_criterion_desc = f"{primary_criterion_type.capitalize()} related to '{primary_criterion_value}'"
    print(f"\n--- Starting Hybrid Influencer Search ({primary_criterion_desc} + {secondary_criterion_desc}) ---")

    # 1. Select candidates based on the PRIMARY criterion
    candidate_pi_ids = select_candidate_pis_v2(
        df, df_grouped, embedder, primary_criterion_type, primary_criterion_value, top_k=top_k_candidates
    )
    if not candidate_pi_ids:
        return f"Could not find candidates matching primary criterion: {primary_criterion_desc}."

    # 2. Format data (use V2 to ensure institutional data is included)
    formatted_text, pi_names = format_influencer_data_v2(df, candidate_pi_ids)
    if not pi_names or all(name.startswith("PI ID") for name in pi_names.values()):
         return "Could not generate ranking due to lack of data for selected candidates."

    # 3. Generate the HYBRID prompt
    prompt_text = generate_influencer_prompt_hybrid(
        formatted_text, pi_names, primary_criterion_desc, secondary_criterion_desc
    )

    # 4. Get LLM Response
    print("--- Sending Request to Gemini for Influencer Ranking (Hybrid) ---")
    ranking_result, duration = get_gemini_response(model, prompt_text)

    if ranking_result:
        print(f"--- Influencer Identification (Hybrid) Complete ({duration:.2f}s) ---")
        return ranking_result
    else:
        print("--- Influencer Identification (Hybrid) Failed ---")
        return "Failed to get influencer ranking (Hybrid) from the model."


# --- Method 7: Network Centrality (Requires NetworkX) ---

def calculate_network_centrality(df: pd.DataFrame, top_n: int = 20) -> Optional[pd.DataFrame]:
    """
    Calculates Degree and Betweenness Centrality for PIs based on co-awards.

    Args:
        df: The main DataFrame with 'award_title' and 'pi_id'.
        top_n: Number of top influencers to return based on centrality.

    Returns:
        A DataFrame with PI IDs, names, degree, and betweenness centrality,
        ranked by betweenness, then degree. Returns None if networkx is not installed.
    """
    try:
        import networkx as nx
    except ImportError:
        print("Error: networkx library is required for network centrality analysis. Install using 'pip install networkx'")
        return None

    print("\n--- Calculating Network Centrality ---")
    # Create graph
    G = nx.Graph()

    # Group by award to find collaborators
    awards = df.groupby('award_title')['pi_id'].apply(list)

    # Add edges between collaborators on the same award
    for award_id, collaborators in awards.items():
        # Remove duplicates just in case
        unique_collaborators = list(set(collaborators))
        if len(unique_collaborators) > 1:
            # Add edges between all pairs in this award group
            import itertools
            for pi1, pi2 in itertools.combinations(unique_collaborators, 2):
                if G.has_edge(pi1, pi2):
                    G[pi1][pi2]['weight'] = G[pi1][pi2].get('weight', 0) + 1
                else:
                    G.add_edge(pi1, pi2, weight=1)

    if not G.nodes():
        print("Graph contains no nodes. Cannot calculate centrality.")
        return pd.DataFrame(columns=['pi_id', 'pi_full_name', 'degree_centrality', 'betweenness_centrality'])


    # Calculate centrality measures
    print("Calculating Degree Centrality...")
    degree_centrality = nx.degree_centrality(G)
    print("Calculating Betweenness Centrality (may take time)...")
    betweenness_centrality = nx.betweenness_centrality(G, normalized=True, weight='weight') # Consider edge weights

    # Combine results into a DataFrame
    pi_ids = list(G.nodes())
    centrality_df = pd.DataFrame({
        'pi_id': pi_ids,
        'degree_centrality': [degree_centrality.get(pi, 0) for pi in pi_ids],
        'betweenness_centrality': [betweenness_centrality.get(pi, 0) for pi in pi_ids]
    })

    # Merge with PI names (get the first name found for each PI ID)
    pi_names_map = df[['pi_id', 'pi_full_name']].drop_duplicates(subset='pi_id').set_index('pi_id')
    centrality_df = centrality_df.join(pi_names_map, on='pi_id')

    # Rank: Prioritize betweenness, then degree
    ranked_df = centrality_df.sort_values(
        by=['betweenness_centrality', 'degree_centrality'],
        ascending=[False, False]
    ).reset_index(drop=True)

    print("--- Network Centrality Calculation Complete ---")
    return ranked_df[['pi_id', 'pi_full_name', 'degree_centrality', 'betweenness_centrality']].head(top_n)


##  Influencers BASED ON range of institutional connections.

In [25]:
# Example 1: Find influencers based on institutional breadth (using V2)
print("\n=== EXAMPLE: Influencers by Institutional Breadth ===")
candidate_ids_for_inst = ['000025762', '269811881', '269807623', '270021884'] # Example list
inst_ranking = identify_influencer_llm_v2(df, model, candidate_ids_for_inst)
if inst_ranking: print(inst_ranking)


=== EXAMPLE: Influencers by Institutional Breadth ===

--- Starting Influencer Identification Process (V2 - Incl. Institutions) for PI IDs: ['000025762', '269811881', '269807623', '270021884'] ---
Formatting influencer data (v2) for PI IDs: ['000025762', '269811881', '269807623', '270021884']...
  Processing data for Steven N MacEachern (000025762)...
  Processing data for Jianlin   Cheng (269811881)...
  Processing data for Dirk J Colbry (269807623)...
  Processing data for Guido F Montufar Cuartas (270021884)...
Influencer data formatting (v2) complete.
----------------------------------------------------------
Formatted data for influencer analysis (v2): --- Potential Influencer: Steven N MacEachern (ID: 000025762) ---
Total Projects Involved In: 2
Total Unique Collaborators: 4
Number of Unique Research Fields: 1
Number of Unique Collaborating Institutions: 3
  Collaborating Institutions Sample: Ohio State University, University of California-Santa Cruz, University of California Sa

## Influencers BASED ON Department

In [26]:
# Example 2: Find influencers from 'Computer Science' dept (using V2 selector & V2 analysis)
print("\n=== EXAMPLE: Influencers by Department (Computer Science) ===")
cs_ranking = find_influencers_by_criterion_v2(
    df, df_grouped, embedder, model,
    criterion_type="department",
    criterion_value="Computer Science",
    top_k_candidates=10
)
if cs_ranking: print(cs_ranking)


=== EXAMPLE: Influencers by Department (Computer Science) ===

--- Starting Influencer Search (V2 Selector) by Department: 'Computer Science' ---
Selecting top 10 candidates based on department: 'Computer Science'...
Selected candidate PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242']

--- Analyzing Selected Candidates for Influence (V2 - Incl. Institutions) ---

--- Starting Influencer Identification Process (V2 - Incl. Institutions) for PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242'] ---
Formatting influencer data (v2) for PI IDs: ['269935164', '269779708', '269765937', '000235919', '270031750', '000207040', '270018850', '269779084', '269985475', '269680242']...
  Processing data for Yanfang   Ye (269935164)...
  Processing data for Prasad   Calyam (269779708)...
  Processing data for Tiffany M Barnes (269

## Influencers BASED ON award title

In [27]:
# Example 3: Find influencers associated with 'MRI' award type (using V2 selector & V2 analysis)
print("\n=== EXAMPLE: Influencers by Award Type (MRI) ===")
mri_ranking = find_influencers_by_criterion_v2(
    df, df_grouped, embedder, model,
    criterion_type="award_title",
    criterion_value="NSF", # Major Research Instrumentation
    top_k_candidates=5
)
if mri_ranking: print(mri_ranking)


=== EXAMPLE: Influencers by Award Type (MRI) ===

--- Starting Influencer Search (V2 Selector) by Award_title: 'NSF' ---
Selecting top 5 candidates based on award_title: 'NSF'...
  (Found 4235 PIs, selecting top 5 based on award count)
Selected candidate PI IDs: ['269959504', '269935164', '270041430', '269772812', '269999971']

--- Analyzing Selected Candidates for Influence (V2 - Incl. Institutions) ---

--- Starting Influencer Identification Process (V2 - Incl. Institutions) for PI IDs: ['269959504', '269935164', '270041430', '269772812', '269999971'] ---
Formatting influencer data (v2) for PI IDs: ['269959504', '269935164', '270041430', '269772812', '269999971']...
  Processing data for Jerene   Shaheed (269959504)...
  Processing data for Yanfang   Ye (269935164)...
  Processing data for Jacqueline   El-Sayed (270041430)...
  Processing data for Nicholas G Feamster (269772812)...
  Processing data for Josiah D Hester (269999971)...
Influencer data formatting (v2) complete.
-------

## Topic & Institutional Breadth

In [28]:
# Example 4: Hybrid search - Topic: "Robotics" + Secondary: Institutional Breadth
print("\n=== EXAMPLE: Hybrid Search (Topic: Robotics + Institutions) ===")
hybrid_ranking = find_influencers_hybrid(
    df, df_grouped, embedder, model,
    primary_criterion_type="topic",
    primary_criterion_value="Robotics",
    secondary_criterion_desc="Breadth of Institutional Collaboration",
    top_k_candidates=10
)
if hybrid_ranking: print(hybrid_ranking)


=== EXAMPLE: Hybrid Search (Topic: Robotics + Institutions) ===

--- Starting Hybrid Influencer Search (Topic related to 'Robotics' + Breadth of Institutional Collaboration) ---
Selecting top 10 candidates based on topic: 'Robotics'...
Selected candidate PI IDs: ['269818291', '270031608', '269982422', '269693864', '270019686', '270057330', '270019331', '269999999', '269948584', '000182004']
Formatting influencer data (v2) for PI IDs: ['269818291', '270031608', '269982422', '269693864', '270019686', '270057330', '270019331', '269999999', '269948584', '000182004']...
  Processing data for Jason O'Kane (269818291)...
  Processing data for Naomi T Fitter (270031608)...
  Processing data for Aaron M Johnson (269982422)...
  Processing data for William D Smart (269693864)...
  Processing data for Berk   Calli (270019686)...
  Processing data for Kaiyu Hang (270057330)...
  Processing data for Stefanos   Nikolaidis (270019331)...
  Processing data for Philip M Dames (269999999)...
  Processi

## Network Centrality Analysis

In [29]:
# Example 5: Network Centrality Analysis
print("\n=== EXAMPLE: Network Centrality Analysis ===")
centrality_results = calculate_network_centrality(df, top_n=15)
if centrality_results is not None:
    print(centrality_results.to_string())


=== EXAMPLE: Network Centrality Analysis ===

--- Calculating Network Centrality ---
Calculating Degree Centrality...
Calculating Betweenness Centrality (may take time)...
--- Network Centrality Calculation Complete ---
        pi_id          pi_full_name  degree_centrality  betweenness_centrality
0   269778721      Ilkay   Altintas           0.000706                0.010524
1   269942561    Robert P Guralnick           0.001201                0.009557
2   269870051          Yiran   Chen           0.000424                0.008780
3   269730671         Daniel S Katz           0.000330                0.008575
4   269709711  Ashutosh   Sabharwal           0.000565                0.008439
5   269950450     Stephen E Greiman           0.005133                0.008236
6   000159554         Joseph A Cook           0.001107                0.007703
7   269786973     Rebecca M Willett           0.000659                0.007341
8   269662729            Ian Foster           0.000471              

## PI classification rule based

In [30]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Combine relevant text columns into one (you may adjust columns as needed)
text_columns = [
    "award_type", "award_title", "abstract", 
    "org_name", "org_name2", "perf_inst_name", 
    "program_element", "program_reference"
]
df["combined_text"] = df[text_columns].astype(str).agg(" ".join, axis=1)

# a. Leadership indicator: 1 if role suggests prior leadership (e.g., contains "Principal Investigator")
df["leadership"] = df["role"].apply(lambda x: 1 if "Principal Investigator" in str(x) else 0)

# b. Experience in years: use start_date and a reference date (here we use today)
df["start_date"] = pd.to_datetime(df["start_date"], errors='coerce')
reference_date = datetime.now()  # or use a fixed project date
df["experience_years"] = (reference_date - df["start_date"]).dt.days / 365.25

# Load a pre-trained sentence transformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embedding for each award's combined text
df["text_embedding"] = df["combined_text"].apply(lambda x: embedder.encode(x))

# We assume each row has a researcher ID ("pi_id"). If a researcher has multiple rows, we aggregate.
# For aggregated text, we average the embeddings; for numeric features, we use appropriate aggregation.
award_counts = df.groupby("pi_id").size().reset_index(name="award_count")
df_grouped = df.groupby("pi_id").agg({
    "experience_years": "mean",       # average experience across awards
    "leadership": "max",              # if they have ever been a PI, mark as leadership
    "text_embedding": lambda embs: np.mean(np.stack(embs), axis=0)
}).reset_index()
df_grouped = df_grouped.merge(award_counts, on="pi_id", how="left")

# For later scoring, normalize the numeric features (experience and award_count)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_grouped[["exp_norm", "award_norm"]] = scaler.fit_transform(df_grouped[["experience_years", "award_count"]])

In [31]:
def rank_candidates(research_topic, candidate_ids, base_weight=0.5, topic_weight=0.5):
    # Compute embedding for the research topic
    topic_emb = embedder.encode(research_topic)
    
    candidate_scores = []
    for cid in candidate_ids:
        candidate = df_grouped[df_grouped["pi_id"] == cid].iloc[0]
        
        # Topic relevance score: cosine similarity between candidate's aggregated embedding and the topic
        candidate_emb = candidate["text_embedding"]
        relevance_score = cosine_similarity([candidate_emb], [topic_emb])[0][0]
        
        # Base score: a simple weighted sum of normalized features plus a bonus for leadership
        # Adjust weights as needed. Here, leadership gets a bonus of 1 if present.
        base_score = candidate["exp_norm"] + candidate["award_norm"] + (1 if candidate["leadership"] == 1 else 0)
        
        # Combined score: weighted combination of base score and topic relevance
        combined_score = base_weight * base_score + topic_weight * relevance_score
        candidate_scores.append(combined_score)
    
    candidate_scores = np.array(candidate_scores)
    best_index = np.argmax(candidate_scores)
    pi_candidate = candidate_ids[best_index]
    co_pi_candidates = [cid for i, cid in enumerate(candidate_ids) if i != best_index]
    
    return pi_candidate, co_pi_candidates, candidate_scores

In [32]:
# Test with various topics dynamically.
for test_topic in ["knowledge graph", "AI", "Neuroscience", "STATISTICS"]:
    print("Testing with topic:", test_topic)
    pi_candidate, co_pi_candidates, scores = rank_candidates(test_topic, pi_ids_to_analyze)
    print("Predicted PI:", pi_candidate)
    print("Predicted Co-PIs:", co_pi_candidates)
    print("Candidate Combined Scores:", scores)
    print("-" * 50)

Testing with topic: knowledge graph
Predicted PI: 000025762
Predicted Co-PIs: ['000025017', '000030655']
Candidate Combined Scores: [0.72689373 0.88944828 0.83865565]
--------------------------------------------------
Testing with topic: AI
Predicted PI: 000025762
Predicted Co-PIs: ['000025017', '000030655']
Candidate Combined Scores: [0.74238132 0.88151991 0.83861753]
--------------------------------------------------
Testing with topic: Neuroscience
Predicted PI: 000025762
Predicted Co-PIs: ['000025017', '000030655']
Candidate Combined Scores: [0.68108185 0.8349548  0.78906286]
--------------------------------------------------
Testing with topic: STATISTICS
Predicted PI: 000025762
Predicted Co-PIs: ['000025017', '000030655']
Candidate Combined Scores: [0.73771358 0.92949853 0.86005208]
--------------------------------------------------


In [33]:
# df[df.pi_id.isin(pi_ids_to_analyze)][['pi_id', 'pi_full_name', 'role', 'department', 'leadership', 'experience_years', 'program_element']]

In [34]:
# # Define the list of keywords for departments of interest,
# # including alternative spellings (e.g., math and mathmatics, stat and statistics)
# allowed_keywords = ['computer', 'electrical', 'biomedical', 'bioinformatics', 'math', 'mathmatics', 'stat', 'statistics']

# # Filter the DataFrame to only include rows where the 'department' field contains one of the keywords (case insensitive)
# qualified_df = df[df['department'].fillna('').str.lower().str.contains('|'.join(allowed_keywords))]

# # Get unique PI IDs from the filtered DataFrame
# unique_pi_ids = qualified_df['pi_id'].unique()

# # Generate 15 sets, each containing 3 distinct PI IDs sampled without replacement
# pi_id_sets = [list(np.random.choice(unique_pi_ids, 3, replace=False)) for _ in range(15)]

# # Output the sets with department and program_element information
# for idx, pi_set in enumerate(pi_id_sets, start=1):
#     print(f"Set {idx}:")
#     for pi in pi_set:
#         info = qualified_df[qualified_df['pi_id'] == pi].iloc[0]
#         print(f"  PI: {pi} | Department: {info['department']} | Program Element: {info['program_element']}")

In [35]:
candidate_sets = [
    (['269948909', '000173003', '269886945'], 'STATISTICS'),
    (['269807623', '269794080', '269879497'], 'AI Circuits design'),
    (['269807623', '269794080', '269879497'], 'hardware software co-design'),
    (['269677663', '269988546', '270021884'], 'Trustworthy AI'),
    (['269677663', '269988546', '269814599'], 'Networking safety'),
    (['269811881', '269958535', '270083608'], 'Bioinformatics'),
    (['270082637', '269726900', '269963435'], 'Robotics'),
    (['270082637', '269726900', '270021884'], 'AI in Robotics'),
    (['269934201', '269769382', '269911544'], 'Algorithm'),
    (['269721983', '269928133', '000171581'], 'Data Science')
]

for pi_ids, topic in candidate_sets:
    print(f"Topic: {topic}")
    pi_candidate, co_pi_candidates, scores = rank_candidates(topic, pi_ids)
    print("Predicted PI:", pi_candidate)
    print("Predicted Co-PIs:", co_pi_candidates)
    print("Candidate Combined Scores:", scores)
    # display(df[df.pi_id.isin(pi_ids)][['pi_id', 'pi_full_name', 'role', 'department', 'leadership', 'experience_years', 'program_element']])
    # display(df_grouped[df_grouped.pi_id.isin(pi_ids)][['pi_id', 'award_count', 'experience_years', 'leadership']])
    print("-" * 50)

Topic: STATISTICS
Predicted PI: 000173003
Predicted Co-PIs: ['269948909', '269886945']
Candidate Combined Scores: [0.97993527 1.00498589 0.88427914]
--------------------------------------------------
Topic: AI Circuits design
Predicted PI: 269794080
Predicted Co-PIs: ['269807623', '269879497']
Candidate Combined Scores: [0.89764881 1.03518402 0.91554893]
--------------------------------------------------
Topic: hardware software co-design
Predicted PI: 269794080
Predicted Co-PIs: ['269807623', '269879497']
Candidate Combined Scores: [0.92971756 0.93638837 0.8663779 ]
--------------------------------------------------
Topic: Trustworthy AI
Predicted PI: 270021884
Predicted Co-PIs: ['269677663', '269988546']
Candidate Combined Scores: [0.7858882  0.92252901 0.94673603]
--------------------------------------------------
Topic: Networking safety
Predicted PI: 269814599
Predicted Co-PIs: ['269677663', '269988546']
Candidate Combined Scores: [0.83801364 0.85126384 0.9884872 ]
---------------

In [36]:
# import random

# for i in range(20):
#     pi_ids = candidate_sets[random.randint(1, 10) - 1][0]
#     topic = candidate_sets[random.randint(1, 10) - 1][1]
#     print(f"Topic: {topic}")
#     pi_candidate, co_pi_candidates, scores = rank_candidates(topic, pi_ids)
#     print("Predicted PI:", pi_candidate)
#     print("Predicted Co-PIs:", co_pi_candidates)
#     print("Candidate Combined Scores:", scores)
#     print("-" * 50)

In [37]:

# pi = [269948909, 000173003, 269886945], topic = 'STATISTICS'
# pi = [269807623, 269794080, 269879497], topic = 'AI Circuits design'
# pi = [269807623, 269794080, 269879497], topic = 'hardware software co-design'
# pi = [269677663, 269988546, 270021884], topic = 'Trustworthy AI'
# pi = [269677663, 269988546, 269814599], topic = 'Networking safety'
# pi = [269811881, 269958535, 270083608], topic = 'Bioinformatics'
# pi = [270082637, 269726900, 269963435], topic = 'Robotics'
# pi = [270082637, 269726900, 270021884], topic = 'AI in Robotics'
# pi = [269934201, 269769382, 269911544], topic = 'Algorithm'
# pi = [269721983, 269928133, 000171581], topic = 'Data Science'
# rank_candidates(research_topic, candidate_ids

In [38]:
# df[df['pi_id'] == '000173003'][['pi_full_name', 'pi_id', 'role', 'department', 'leadership', 'experience_years', 'program_element']]
df[df.pi_id == '269769382']['pi_full_name'].to_list()[0]

'Susan D Nickerson'

In [39]:
df[df.pi_full_name == 'Xin Zhang']['pi_id'].to_list()[0]

'269975480'

In [40]:
pi_ids_to_analyze = ['270082637', '269726900', '270021884']
df[df.pi_id.isin(pi_ids_to_analyze)][['pi_id', 'pi_full_name', 'role', 'department', 'leadership', 'experience_years', 'program_element']]

Unnamed: 0,pi_id,pi_full_name,role,department,leadership,experience_years,program_element
99,270021884,Guido F Montufar Cuartas,Principal Investigator,Mathematics and Statistics,1,3.394935,OFFICE OF MULTIDISCIPLINARY AC
10069,270021884,Guido F Montufar Cuartas,Principal Investigator,Mathematics and Statistics,1,2.855578,Comm & Information Foundations
35485,269726900,Shrideep B Pallickara,Co-Principal Investigator,Department of Computer Science,1,1.790554,CPS-Cyber-Physical Systems
86924,270082637,Rodrigo O Spinola,Principal Investigator,Computer Science,1,1.916496,NRI-National Robotics Initiati


In [41]:
p_r = [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]

In [42]:
y_pred = [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]  # predicted
y_true = [1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]  # actual/ground truth
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")


Precision: 0.8333333333333334
Recall: 1.0
F1 Score: 0.9090909090909091
Accuracy: 0.8571428571428571


In [43]:
# Sample raw results (each as a string).
raw_results = [
    "['269948909', '000173003', '269886945'] - STATISTICS - Xin Zhang - 269948909 - correct",
    "['269807623', '269794080', '269879497'] - AI Circuits design - Azadeh Davoodi - 269794080 - correct",
    "['269807623', '269794080', '269879497'] - hardware software co-design - Azadeh Davoodi - 269794080 - correct",
    "['269677663', '269988546', '270021884'] - Trustworthy AI - Benjamin Fuller - 269988546 - wrong",
    "['269677663', '269988546', '269814599'] - Networking safety - Benjamin Fuller - 269988546 - correct",
    "['269811881', '269958535', '270083608'] - Bioinformatics - Jianlin Cheng - 269811881 - correct",
    "['270082637', '269726900', '269963435'] - Robotics - Rodrigo O Spinola - 270082637 - correct",
    "['270082637', '269726900', '270021884'] - AI in Robotics - Guido F Montufar Cuartas - 270021884 - wrong",
    "['269934201', '269769382', '269911544'] - Algorithm - Michael Dinitz - 269934201 - correct",
    "['269721983', '269928133', '000171581'] - Data Science - Sofya Raskhodnikova - 269721983 - correct",
    "",
    "['000025017', '000025762', '000030655'] - knowledge graph - Steven N MacEachern - 000025762 - correct",
    "['000025017', '000025762', '000030655'] - AI - Steven N MacEachern - 000025762 - correct",
    "['000025017', '000025762', '000030655'] - Neuroscience - Steven N MacEachern - 000025762 - correct",
    "['000025017', '000025762', '000030655'] - STATISTICS - Steven N MacEachern - 000025762 - correct"
]

# Initialize counters.
tp = 0  # true positives
fp = 0  # false positives
fn = 0  # false negatives
total = 0  # total evaluated samples

# Loop through each line in the results.
for line in raw_results:
    # Skip empty lines (if any)
    if not line.strip():
        continue

    total += 1
    
    # The label is the last token when splitting by ' - '
    # We assume that the parts are separated by " - " and the last part is the status.
    parts = line.split(" - ")
    status = parts[-1].strip().lower()  # e.g., 'correct', 'wrong', 'waiting'
    
    if status == "correct":
        tp += 1
    elif status == "wrong":
        # A wrong prediction means the algorithm made a prediction but it did not match
        # the true answer. This counts as a false positive and a missed correct answer (FN).
        fp += 1
        fn += 1
    else:
        print(f"Unrecognized status: {status}")

# Calculate precision, recall, F1 and accuracy rate.
if (tp + fp) > 0:
    precision = tp / (tp + fp)
else:
    precision = 0

if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0

if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
else:
    f1 = 0

accuracy = tp / total if total > 0 else 0

# Display the results.
print("Evaluation metrics:")
print("-------------------")
print(f"Total samples: {total}")
print(f"Correct (TP): {tp}")
print(f"Wrong (FP): {fp}")
print(f"Waiting (FN): {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 score:  {f1:.4f}")
print(f"Accuracy:  {accuracy:.4f}")


Evaluation metrics:
-------------------
Total samples: 14
Correct (TP): 12
Wrong (FP): 2
Waiting (FN): 2
Precision: 0.8571
Recall:    0.8571
F1 score:  0.8571
Accuracy:  0.8571


In [44]:
# Updated raw results list.
raw_results = [
    "['269948909', '000173003', '269886945'] - STATISTICS - 000173003 - Peter D Hislop - 2nd most optimal option",
    "['269807623', '269794080', '269879497'] - AI Circuits design - 269794080 - Azadeh Davoodi - correct",
    "['269807623', '269794080', '269879497'] - hardware software co-design - 269794080 - Azadeh Davoodi - correct",
    "['269677663', '269988546', '270021884'] - Trustworthy AI - 270021884 - Guido F Montufar Cuartas - correct",
    "['269677663', '269988546', '269814599'] - Networking safety - 269814599 - Srinivas Shakkottai - wrong",
    "['269811881', '269958535', '270083608'] - Bioinformatics - 269958535 - HaiYing   Wang - wrong",
    "['270082637', '269726900', '269963435'] - Robotics - 270082637 - Rodrigo O Spinola - correct",
    "['270082637', '269726900', '270021884'] - AI in Robotics - 270021884 - Guido F Montufar Cuartas - wrong",
    "['269934201', '269769382', '269911544'] - Algorithm - 269769382 - Susan D Nickerson - wrong",
    "['269721983', '269928133', '000171581'] - Data Science - 269721983 - Sofya Raskhodnikova - correct",
    "['000025017', '000025762', '000030655'] - knowledge graph - 000025762 - Steven N MacEachern - correct",
    "['000025017', '000025762', '000030655'] - AI - 000025762 - Steven N MacEachern - correct",
    "['000025017', '000025762', '000030655'] - Neuroscience - 000025762 - Steven N MacEachern - correct",
    "['000025017', '000025762', '000030655'] - STATISTICS - 000025762 - Steven N MacEachern - correct"
]

# Initialize counters.
tp = 0  # true positives
fp = 0  # false positives
fn = 0  # false negatives
total = 0  # total valid samples

# Process each line in the results.
for line in raw_results:
    if not line.strip():
        continue  # Skip empty lines.
    
    total += 1
    parts = line.split(" - ")
    # The final token is the prediction label.
    outcome = parts[-1].strip().lower()
    
    # Count outcomes; only exactly 'correct' is TP.
    if outcome == "correct":
        tp += 1
    elif outcome in ["wrong", "2nd most optimal option"]:
        fp += 1
        fn += 1
    else:
        print(f"Unrecognized status: {outcome}")

# Compute metrics.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
f1        = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
accuracy  = tp / total if total > 0 else 0

# Output the results.
print("Evaluation metrics:")
print("-------------------")
print(f"Total samples: {total}")
print(f"Correct (TP): {tp}")
print(f"Wrong (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 score:  {f1:.4f}")
print(f"Accuracy:  {accuracy:.4f}")


Evaluation metrics:
-------------------
Total samples: 14
Correct (TP): 9
Wrong (FP): 5
False Negatives (FN): 5
Precision: 0.6429
Recall:    0.6429
F1 score:  0.6429
Accuracy:  0.6429


In [93]:
df_combined.columns

Index(['award_type', 'award_title', 'abstract', 'org_name', 'org_name2',
       'perf_inst_name', 'program_element', 'program_reference', 'pi_id',
       'pi_full_name', 'role', 'department', 'email', 'start_date',
       'combined_text', 'leadership', 'experience_years', 'text_embedding',
       'award_count', 'award_norm', 'exp_norm'],
      dtype='object')

In [104]:
df_combined.columns

Index(['award_type', 'award_title', 'abstract', 'org_name', 'org_name2',
       'perf_inst_name', 'program_element', 'program_reference', 'pi_id',
       'pi_full_name', 'role', 'department', 'email', 'start_date',
       'combined_text', 'leadership', 'experience_years', 'text_embedding',
       'award_count', 'award_norm', 'exp_norm'],
      dtype='object')

## ML XGBOOST

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import xgboost as xgb

# --- Simulate Research Groups for Demonstration ---
# df = df_combined
df_combined['topic'] = df_combined['award_title'].apply(lambda x: x.split()[0])  # crude topic placeholder
df_combined['topic_id'] = df_combined.groupby('topic').ngroup()

# --- Heuristic Labeling: Best = PI ---
df_combined['heuristic_score'] = df_combined['leadership'] * 2 + df_combined['experience_years'] + df_combined['award_norm']
df_combined['label'] = df_combined.groupby('topic_id')['heuristic_score'].transform(
    lambda x: (x == x.max()).astype(int)
)

# --- Flatten text_embedding (vector of 384 dims) ---
embedding_df = pd.DataFrame(df_combined['text_embedding'].to_list(), index=df_combined.index)
embedding_df.columns = [f'emb_{i}' for i in range(embedding_df.shape[1])]
df_combined = pd.concat([df_combined, embedding_df], axis=1)

# --- Final feature selection ---
features = ['leadership', 'experience_years', 'award_norm'] + list(embedding_df.columns)
df_combined = df_combined[['pi_id', 'topic_id', 'label'] + features]

# --- Train/Test Split ---
df_train, df_test = train_test_split(df_combined, test_size=0.2, stratify=df_combined['label'], random_state=42)
train_groups = df_train.groupby('topic_id').size().to_numpy()
test_groups = df_test.groupby('topic_id').size().to_numpy()

X_train = df_train[features]
y_train = df_train['label']
X_test = df_test[features]
y_test = df_test['label']

# --- DMatrix ---
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

# --- XGBoost Config ---
params = {
    'objective': 'rank:ndcg',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'verbosity': 1,
    'eval_metric': 'ndcg@1'
}

# --- Train ---
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# --- Predict & Rank ---
df_test['score'] = model.predict(dtest)
ranked = df_test.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)
ranked['predicted_label'] = ranked.groupby('topic_id').cumcount().apply(lambda x: 1 if x == 0 else 0)

# --- Evaluate ---
precision = precision_score(ranked['label'], ranked['predicted_label'])
recall = recall_score(ranked['label'], ranked['predicted_label'])
f1 = f1_score(ranked['label'], ranked['predicted_label'])
accuracy = accuracy_score(ranked['label'], ranked['predicted_label'])

print("\nEvaluation Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Accuracy:  {accuracy:.4f}")


[0]	train-ndcg@1:0.89344	test-ndcg@1:0.90506
[1]	train-ndcg@1:0.92088	test-ndcg@1:0.90506
[2]	train-ndcg@1:0.93632	test-ndcg@1:0.90549
[3]	train-ndcg@1:0.94318	test-ndcg@1:0.91181
[4]	train-ndcg@1:0.95047	test-ndcg@1:0.91181
[5]	train-ndcg@1:0.95304	test-ndcg@1:0.91392
[6]	train-ndcg@1:0.95819	test-ndcg@1:0.91730
[7]	train-ndcg@1:0.96162	test-ndcg@1:0.91603
[8]	train-ndcg@1:0.96398	test-ndcg@1:0.91899
[9]	train-ndcg@1:0.96805	test-ndcg@1:0.92068
[10]	train-ndcg@1:0.97063	test-ndcg@1:0.91941
[11]	train-ndcg@1:0.97213	test-ndcg@1:0.92236
[12]	train-ndcg@1:0.97491	test-ndcg@1:0.91941
[13]	train-ndcg@1:0.97620	test-ndcg@1:0.92110
[14]	train-ndcg@1:0.97877	test-ndcg@1:0.92236
[15]	train-ndcg@1:0.98049	test-ndcg@1:0.92447
[16]	train-ndcg@1:0.98070	test-ndcg@1:0.92405
[17]	train-ndcg@1:0.98349	test-ndcg@1:0.92321
[18]	train-ndcg@1:0.98435	test-ndcg@1:0.92152
[19]	train-ndcg@1:0.98521	test-ndcg@1:0.92152
[20]	train-ndcg@1:0.98542	test-ndcg@1:0.92194
[21]	train-ndcg@1:0.98563	test-ndcg@1:0.9215

  ranked = df_test.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)


In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import xgboost as xgb

# Copy original data
df_XGB = df_combined.copy()

# --- Prepare topics and heuristic labels ---
df_XGB['topic'] = df_XGB['award_title'].apply(lambda x: x.split()[0])
df_XGB['topic_id'] = df_XGB.groupby('topic').ngroup()
df_XGB['heuristic_score'] = (
    df_XGB['leadership'] * 2 +
    df_XGB['experience_years'] +
    df_XGB['award_norm']
)
df_XGB['label'] = df_XGB.groupby('topic_id')['heuristic_score'].transform(
    lambda x: (x == x.max()).astype(int)
)

# --- Flatten embeddings ---
embeddings = pd.DataFrame(df_XGB['text_embedding'].tolist(), index=df_XGB.index)
embeddings.columns = [f'emb_{i}' for i in range(embeddings.shape[1])]
df_XGB = pd.concat([df_XGB, embeddings], axis=1)

# --- Feature and model DataFrame ---
# nfeature_cols = ['leadership', 'experience_years', 'award_norm'] + list(embeddings.columns)
nfeature_cols = ['leadership', 'experience_years', 'award_norm'] + list(embeddings.columns)
model_df = df_XGB[['pi_id', 'topic_id', 'label'] + nfeature_cols].copy()

# --- Train/Test Split ---
train_df, test_df = train_test_split(
    model_df, test_size=0.2, stratify=model_df['label'], random_state=42
)
train_groups = train_df.groupby('topic_id').size().to_numpy()
test_groups = test_df.groupby('topic_id').size().to_numpy()

# --- Prepare for XGBoost ---
X_train = train_df[nfeature_cols].to_numpy()
y_train = train_df['label'].to_numpy()
X_test = test_df[nfeature_cols].to_numpy()
y_test = test_df['label'].to_numpy()

dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

# --- Train XGBoost Ranking Model ---
params = {
    'objective': 'rank:ndcg', 'eta': 0.1, 'gamma': 1.0,
    'min_child_weight': 0.1, 'max_depth': 6, 'verbosity': 1,
    'eval_metric': 'ndcg@1'
}
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# --- Evaluate on held-out test split ---
results_df = test_df.copy()
results_df['score'] = model.predict(dtest)
ranked_df = results_df.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)
ranked_df['predicted_label'] = ranked_df.groupby('topic_id').cumcount().apply(lambda x: 1 if x == 0 else 0)
precision, recall, f1, accuracy = (
    precision_score(ranked_df['label'], ranked_df['predicted_label']),
    recall_score(ranked_df['label'], ranked_df['predicted_label']),
    f1_score(ranked_df['label'], ranked_df['predicted_label']),
    accuracy_score(ranked_df['label'], ranked_df['predicted_label'])
)
print("Held-out test split evaluation:")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")

# --- Evaluate on custom candidate sets ---
candidate_sets = [
    (['269948909', '000173003', '269886945'], 'STATISTICS'),
    (['269807623', '269794080', '269879497'], 'AI Circuits design'),
    (['269807623', '269794080', '269879497'], 'hardware software co-design'),
    (['269677663', '269988546', '270021884'], 'Trustworthy AI'),
    (['269677663', '269988546', '269814599'], 'Networking safety'),
    (['269811881', '269958535', '270083608'], 'Bioinformatics'),
    (['270082637', '269726900', '269963435'], 'Robotics'),
    (['270082637', '269726900', '270021884'], 'AI in Robotics'),
    (['269934201', '269769382', '269911544'], 'Algorithm'),
    (['269721983', '269928133', '000171581'], 'Data Science')
]

all_true, all_pred = [], []
for ids, topic_str in candidate_sets:
    # Build small group
    group_df = df_XGB[df_XGB['pi_id'].isin(ids)].copy()
    # Use heuristic as true label
    group_df['true_label'] = (group_df['heuristic_score'] == group_df['heuristic_score'].max()).astype(int)
    # Features
    X_group = group_df[nfeature_cols].to_numpy()
    # Predict scores
    scores = model.predict(xgb.DMatrix(X_group))
    group_df['score'] = scores
    # Predicted label: top score = 1
    sorted_idx = np.argsort(-scores)
    preds = np.zeros(len(scores), dtype=int)
    preds[sorted_idx[0]] = 1
    # Collect
    all_true.extend(group_df['true_label'].tolist())
    all_pred.extend(preds.tolist())
    # Print group result
    print(f"\nTopic: {topic_str}")
    print(group_df[['pi_id', 'true_label']].assign(predicted=preds))

# Metrics on candidate sets
p2, r2, f12, acc2 = (
    precision_score(all_true, all_pred),
    recall_score(all_true, all_pred),
    f1_score(all_true, all_pred),
    accuracy_score(all_true, all_pred)
)
print("\nCustom candidate sets evaluation:")
print(f"Precision: {p2:.4f}, Recall: {r2:.4f}, F1: {f12:.4f}, Accuracy: {acc2:.4f}")


[0]	train-ndcg@1:0.89344	test-ndcg@1:0.90506
[1]	train-ndcg@1:0.92088	test-ndcg@1:0.90506
[2]	train-ndcg@1:0.93632	test-ndcg@1:0.90549
[3]	train-ndcg@1:0.94318	test-ndcg@1:0.91181
[4]	train-ndcg@1:0.95047	test-ndcg@1:0.91181
[5]	train-ndcg@1:0.95304	test-ndcg@1:0.91392
[6]	train-ndcg@1:0.95819	test-ndcg@1:0.91730
[7]	train-ndcg@1:0.96162	test-ndcg@1:0.91603
[8]	train-ndcg@1:0.96398	test-ndcg@1:0.91899
[9]	train-ndcg@1:0.96805	test-ndcg@1:0.92068
[10]	train-ndcg@1:0.97063	test-ndcg@1:0.91941
[11]	train-ndcg@1:0.97213	test-ndcg@1:0.92236
[12]	train-ndcg@1:0.97491	test-ndcg@1:0.91941
[13]	train-ndcg@1:0.97620	test-ndcg@1:0.92110
[14]	train-ndcg@1:0.97877	test-ndcg@1:0.92236
[15]	train-ndcg@1:0.98049	test-ndcg@1:0.92447
[16]	train-ndcg@1:0.98070	test-ndcg@1:0.92405
[17]	train-ndcg@1:0.98349	test-ndcg@1:0.92321
[18]	train-ndcg@1:0.98435	test-ndcg@1:0.92152
[19]	train-ndcg@1:0.98521	test-ndcg@1:0.92152
[20]	train-ndcg@1:0.98542	test-ndcg@1:0.92194
[21]	train-ndcg@1:0.98563	test-ndcg@1:0.9215

  ranked_df = results_df.groupby('topic_id').apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)


In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import xgboost as xgb

# Assume df_combined is already loaded and available
# For example:
# df_combined = pd.read_csv('your_data.csv') # Placeholder for how df_combined is loaded

# Copy df_combined to avoid modifying it
df_XGB = df_combined.copy()

# --- Prepare DataFrame with topic grouping ---
df_XGB['topic'] = df_XGB['award_title'].apply(lambda x: x.split()[0])
df_XGB['topic_id'] = df_XGB.groupby('topic').ngroup()

# --- Extract metadata from original df_combined ---
meta_XGB = df_combined[['pi_id', 'award_title', 'pi_full_name', 'department', 'experience_years', 'award_count', 'leadership']].copy()

# --- Heuristic Labeling: Best = PI ---
df_XGB['heuristic_score'] = (
    df_XGB['leadership'] * 2 +
    df_XGB['experience_years'] +
    df_XGB['award_norm']
)
df_XGB['label'] = df_XGB.groupby('topic_id')['heuristic_score'].transform(
    lambda x: (x == x.max()).astype(int)
)

# --- Flatten text_embedding into separate columns ---
# Assuming 'text_embedding' column exists and contains list-like/array-like embeddings
embeddings = pd.DataFrame(df_XGB['text_embedding'].tolist(), index=df_XGB.index)
embeddings.columns = [f'emb_{i}' for i in range(embeddings.shape[1])]
df_XGB = pd.concat([df_XGB, embeddings], axis=1)

# --- Build modeling DataFrame ---
feature_cols = ['leadership', 'experience_years', 'award_norm'] + list(embeddings.columns)
model_df = df_XGB[['pi_id', 'topic_id', 'label'] + feature_cols].copy()

# --- Train/Test Split ---
# Ensure there's enough data for stratification, especially with small groups or imbalanced labels
# If 'label' stratification causes issues due to insufficient samples in some strata,
# you might consider removing stratify or ensuring more data.
# For group-wise splitting, GroupShuffleSplit or similar might be needed if topics shouldn't be split.
# However, the current code splits individual rows and then reconstructs groups for XGBoost.
train_df, test_df = train_test_split(model_df, test_size=0.2, stratify=model_df['label'], random_state=42)

# --- Prepare arrays for XGBoost ---
# Ensure that topic_id is sorted before calling .size().to_numpy() if order matters for XGBoost groups.
# Pandas groupby().size() on a non-categorical 'topic_id' might not preserve a specific order
# unless 'topic_id' is already sorted in train_df and test_df.
# For safety, explicitly sort by topic_id before grouping for group counts if not already done.
train_df_sorted_for_groups = train_df.sort_values('topic_id')
test_df_sorted_for_groups = test_df.sort_values('topic_id')

X_train = train_df_sorted_for_groups[feature_cols].to_numpy()
y_train = train_df_sorted_for_groups['label'].to_numpy()
train_groups = train_df_sorted_for_groups.groupby('topic_id').size().to_numpy()

X_test = test_df_sorted_for_groups[feature_cols].to_numpy()
y_test = test_df_sorted_for_groups['label'].to_numpy()
test_groups = test_df_sorted_for_groups.groupby('topic_id').size().to_numpy()


dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(train_groups)
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(test_groups)

# --- Configure and Train XGBoost Ranking Model ---
params = {
    'objective': 'rank:ndcg',
    'eta': 0.1,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'max_depth': 6,
    'verbosity': 1,
    'eval_metric': 'ndcg@1' # Can also use other ndcg metrics like 'ndcg' or 'ndcg@k'
}
evals = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# --- Predictions and Ranking ---
# Important: Predictions should be made on data sorted in the same way as group information was derived
# test_df_sorted_for_groups was used for X_test, y_test, and test_groups
results_df = test_df_sorted_for_groups.copy() # Use the sorted test_df
preds = model.predict(dtest)
results_df['score'] = preds

# ranked_df is created by grouping the results_df (which is sorted by topic_id)
# and then sorting within each group by the predicted score.
ranked_df = results_df.groupby('topic_id', group_keys=False).apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)
ranked_df['predicted_label'] = ranked_df.groupby('topic_id')['score'].transform(lambda x: (x == x.max()).astype(int))
# A common way to assign predicted label in ranking is to mark the top item in each group:
# ranked_df['predicted_label'] = ranked_df.groupby('topic_id').cumcount().apply(lambda x: 1 if x == 0 else 0)


# --- Evaluation Metrics ---
# Ensure that 'label' (true label) and 'predicted_label' are correctly aligned in ranked_df
precision = precision_score(ranked_df['label'], ranked_df['predicted_label'], zero_division=0)
recall = recall_score(ranked_df['label'], ranked_df['predicted_label'], zero_division=0)
f1 = f1_score(ranked_df['label'], ranked_df['predicted_label'], zero_division=0)
accuracy = accuracy_score(ranked_df['label'], ranked_df['predicted_label']) # Accuracy might be less informative for ranking tasks

print("\nEvaluation Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Accuracy:  {accuracy:.4f}")


# --- Merge with Metadata for Detailed Inspection ---
# ** THIS IS THE CORRECTED SECTION **
# Select only the necessary metadata columns from meta_XGB to avoid suffixing
# 'experience_years' and 'leadership' as they are already in ranked_df.
meta_columns_to_add = ['pi_id', 'award_title', 'pi_full_name', 'department', 'award_count']
# Ensure 'pi_id' is in meta_XGB[meta_columns_to_add] for merging
detailed_results = ranked_df.merge(meta_XGB[meta_columns_to_add], on='pi_id', how='left')

# --- Select and Order Columns ---
display_cols = ['topic_id', 'award_title', 'pi_id', 'pi_full_name', 'department',
                'experience_years', 'award_count', 'leadership', 'score', 'label', 'predicted_label']

# Ensure all display_cols are actually in detailed_results before selection
# For example, check: print(detailed_results.columns)
detailed_results = detailed_results[display_cols].sort_values(['topic_id', 'score'], ascending=[True, False])

# --- Output Detailed Results ---
detailed_results.to_csv('pi_ranking_test_details.csv', index=False)
print("\nDetailed test results saved to 'pi_ranking_test_details.csv'.")
print(detailed_results.head(10))

[0]	train-ndcg@1:0.87350	test-ndcg@1:0.96540
[1]	train-ndcg@1:0.92603	test-ndcg@1:0.97342
[2]	train-ndcg@1:0.94383	test-ndcg@1:0.97553
[3]	train-ndcg@1:0.95412	test-ndcg@1:0.97595
[4]	train-ndcg@1:0.96033	test-ndcg@1:0.97764
[5]	train-ndcg@1:0.96677	test-ndcg@1:0.97975
[6]	train-ndcg@1:0.97384	test-ndcg@1:0.98228
[7]	train-ndcg@1:0.97727	test-ndcg@1:0.98143
[8]	train-ndcg@1:0.98113	test-ndcg@1:0.98397
[9]	train-ndcg@1:0.98328	test-ndcg@1:0.98608
[10]	train-ndcg@1:0.98349	test-ndcg@1:0.98523
[11]	train-ndcg@1:0.98542	test-ndcg@1:0.98776
[12]	train-ndcg@1:0.98628	test-ndcg@1:0.98776
[13]	train-ndcg@1:0.98756	test-ndcg@1:0.98776
[14]	train-ndcg@1:0.98821	test-ndcg@1:0.98734
[15]	train-ndcg@1:0.98928	test-ndcg@1:0.98692
[16]	train-ndcg@1:0.98949	test-ndcg@1:0.98608
[17]	train-ndcg@1:0.99057	test-ndcg@1:0.98565
[18]	train-ndcg@1:0.99121	test-ndcg@1:0.98523
[19]	train-ndcg@1:0.99142	test-ndcg@1:0.98481
[20]	train-ndcg@1:0.99185	test-ndcg@1:0.98565
[21]	train-ndcg@1:0.99228	test-ndcg@1:0.9865

  ranked_df = results_df.groupby('topic_id', group_keys=False).apply(lambda g: g.sort_values('score', ascending=False)).reset_index(drop=True)



Evaluation Results:
Precision: 0.4763
Recall:    0.9745
F1 Score:  0.6398
Accuracy:  0.9197

Detailed test results saved to 'pi_ranking_test_details.csv'.
   topic_id                                        award_title      pi_id  \
0         1  Planning Grant: Collaborative Research: The Wi...  269897386   
1         1  EAGER: Collaborative Research: Enhancing Asian...  269897386   
2         1  Collaborative Research: Wolf RACE (Resource Av...  269897386   
3         1  "CAREER:" Shark Survivor! Interdisciplinary ap...  269897386   
4         4  "IUCRC Phase I: University of Maryland at Balt...  269960017   
5         6  Constructing Valid, Equitable, and Flexible Ki...  269692271   
6         6  "STEM Fluency": Expanding the Effectiveness, R...  269692271   
7         6  NSF Convergence Accelerator- Track C: QuSTEAM:...  269692271   
8         6  NSF Convergence Accelerator- Track C: QuSTEAM:...  269692271   
9         8  "Track 1" for Planning or Conference Grant, CO...  269981447 