In [1]:
from dotenv import load_dotenv
load_dotenv() ## load all the environment variables from .env
import glob
# import streamlit as st
import os
from PIL import Image
import google.generativeai as genai
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import textwrap
from typing import List, Dict, Tuple, Optional # For type hinting
import time
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

## Load Gemini model
model=genai.GenerativeModel('gemini-2.0-flash-thinking-exp-01-21')

In [3]:
def get_gemini_response(input,image,user_prompt):
    response=model.generate_content([input,image[0],user_prompt])
    return response.text

In [4]:
import os
import json
import pandas as pd

data_directory = 'data/ranking_data/'
records = []

def safe_get(data, keys, default=None):
    """
    Safely get a nested key from a dictionary using a list of keys.
    Returns default if any key is missing.
    """
    for key in keys:
        if isinstance(data, dict) and key in data:
            data = data[key]
        else:
            return default
    return data

for sub_dir in os.listdir(data_directory):
    print(f"Reading files in {sub_dir}...")
    sub_directory = os.path.join(data_directory, sub_dir)
    for filename in os.listdir(sub_directory):
        if filename.endswith('.json'):
            filepath = os.path.join(sub_directory, filename)
            try:
                with open(filepath, 'r') as file:
                    data = json.load(file)
            except Exception as e:
                print(f"Error reading {filepath}: {e}")
                continue

            # Extract award-level context information safely
            award_type = data.get("awd_istr_txt")
            award_title = data.get("awd_titl_txt")
            abstract = data.get("abst_narr_txt")
            org_name = data.get("org_long_name")
            org_name2 = data.get("org_long_name2")
            perf_inst_name = safe_get(data, ["perf_inst", "perf_inst_name"])
            
            # Extract program element and reference safely (checking if list exists)
            pgm_ele_list = data.get("pgm_ele")
            if isinstance(pgm_ele_list, list) and len(pgm_ele_list) > 0:
                program_element = pgm_ele_list[0].get("pgm_ele_long_name")
            else:
                program_element = None

            pgm_ref_list = data.get("pgm_ref")
            if isinstance(pgm_ref_list, list) and len(pgm_ref_list) > 0:
                program_reference = pgm_ref_list[0].get("pgm_ref_long_name")
            else:
                program_reference = None

            # Get investigator information, ensuring it's a list
            pi_list = data.get("pi")
            if not isinstance(pi_list, list):
                continue

            # Loop through each investigator in the file
            for pi in pi_list:
                record = {
                    "award_type": award_type,
                    "award_title": award_title,
                    "abstract": abstract,
                    "org_name": org_name,
                    "org_name2": org_name2,
                    "perf_inst_name": perf_inst_name,
                    "program_element": program_element,
                    "program_reference": program_reference,
                    "pi_id": pi.get("pi_id"),
                    "pi_full_name": pi.get("pi_full_name", "").strip() if pi.get("pi_full_name") else None,
                    "role": pi.get("proj_role_code2", "").strip() if pi.get("proj_role_code2") else None,
                    "department": pi.get("pi_dept_name"),
                    "email": pi.get("pi_email_addr"),
                    "start_date": pi.get("start_date")
                }
                records.append(record)

# Create a DataFrame from the records
df = pd.DataFrame(records)


Reading files in 2022...
Reading files in 2024...
Reading files in 2023...
Reading files in 2021...
Reading files in 2020...


In [5]:
df = df[df['role'].isin(['Co-Principal Investigator', 'Principal Investigator'])]
df.head()

Unnamed: 0,award_type,award_title,abstract,org_name,org_name2,perf_inst_name,program_element,program_reference,pi_id,pi_full_name,role,department,email,start_date
0,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269967889,Terrance Figy,Co-Principal Investigator,"Mathematics, Statistics, and Physics",Terrance.Figy@wichita.edu,2024-08-29
1,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269758255,Pratul K Agarwal,Principal Investigator,,pratul.agarwal@okstate.edu,2022-08-03
2,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",224099,Mickey Slimp,Co-Principal Investigator,Department of Chemistry,,2024-08-29
4,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",269666332,William H Hsu,Co-Principal Investigator,Computer Science,bhsu@ksu.edu,2022-08-03
7,Standard Grant,MRI: Acquisition of a High-Performance Computa...,This project will acquire and deploy a high-pe...,Directorate for Computer and Information Scien...,Office of Advanced Cyberinfrastructure (OAC),Oklahoma State University,Major Research Instrumentation,"WOMEN, MINORITY, DISABLED, NEC",270046494,Robert Fleming,Co-Principal Investigator,Engineering,rofleming@AState.edu,2024-08-29


In [None]:
# import pandas as pd
# import google.generativeai as genai # Assuming you use this library
# import time
# import textwrap # Useful for formatting text nicely

# # --- Input Parameters ---
# pi_ids_to_analyze = ['000025017', '000025762', '000030655']
# research_topic = 'STATISTICS'

# # --- 1. Filter DataFrame ---
# # Select rows where 'pi_id' is in our list of interest
# filtered_df = df[df['pi_id'].isin(pi_ids_to_analyze)].copy() # Use .copy() to avoid potential SettingWithCopyWarning

# # --- 2. Format Data for the Prompt ---
# formatted_pi_data = ""
# pi_names = {} # Dictionary to store PI names for easier reference

# if filtered_df.empty:
#     print(f"Warning: No data found in the DataFrame for the provided PI IDs: {pi_ids_to_analyze}")
#     # Decide how to proceed - perhaps exit or send a prompt indicating no data
#     formatted_pi_data = "No data could be retrieved for the specified potential collaborators."
#     # If you still want to query the model, it needs to know data is missing.
#     # Or you might skip the API call entirely.
# else:
#     # Group data by PI to present it coherently
#     for pi_id in pi_ids_to_analyze:
#         pi_specific_data = filtered_df[filtered_df['pi_id'] == pi_id]

#         if not pi_specific_data.empty:
#             # Try to get a consistent name and department
#             # Taking the first entry assuming it's representative for name/dept
#             full_name = pi_specific_data['pi_full_name'].iloc[0]
#             department = pi_specific_data['department'].iloc[0]
#             pi_names[pi_id] = full_name # Store for later use in prompt

#             formatted_pi_data += f"--- Researcher: {full_name} (ID: {pi_id}) ---\n"
#             formatted_pi_data += f"Department: {department}\n"
#             formatted_pi_data += "Relevant Roles & Awards Found:\n"

#             # Iterate through each award/entry for this PI
#             for index, row in pi_specific_data.iterrows():
#                 formatted_pi_data += f"- Role: {row.get('role', 'N/A')}\n" # Use .get for safety if column might be missing
#                 formatted_pi_data += f"  Award Title: {row.get('award_title', 'N/A')}\n"
#                 formatted_pi_data += f"  Start Date: {row.get('start_date', 'N/A')}\n"
#                 # Shorten abstract to keep context concise
#                 abstract_preview = textwrap.shorten(row.get('abstract', 'N/A'), width=200, placeholder="...")
#                 formatted_pi_data += f"  Abstract Snippet: {abstract_preview}\n"
#                 formatted_pi_data += f"  Program Element/Reference: {row.get('program_element', 'N/A')} / {row.get('program_reference', 'N/A')}\n\n" # Add space between awards

#         else:
#             # Handle case where a PI ID from the list was not found in the df
#              formatted_pi_data += f"--- Researcher ID: {pi_id} ---\n"
#              formatted_pi_data += "No award data found in the provided dataset for this PI.\n\n"
#              pi_names[pi_id] = f"PI ID {pi_id}" # Placeholder name

# # --- 3. Construct the Prompt ---
# # Get a comma-separated list of names for the prompt text
# collaborator_names_list = ", ".join(pi_names.values())



In [None]:
# collaborator_names_list
# print(formatted_pi_data)

In [None]:
# full_prompt = f"""
# Context:
# The following researchers ({collaborator_names_list}) are considering collaborating on a new research project focused on the topic '{research_topic}'. Below is information extracted from a database about their previous grants and roles:

# {formatted_pi_data}

# Task:
# Based *only* on the information provided above, please analyze the qualifications, experience, and relevance of past work for each researcher ({collaborator_names_list}). Recommend which of these individuals would be the most suitable Principal Investigator (PI) to lead this new collaborative project on '{research_topic}'.

# Provide a detailed explanation for your recommendation. Consider factors apparent from the data, such as:
# - Direct relevance of their past research (award titles, abstracts, program elements) to the topic '{research_topic}'.
# - Demonstrated experience (e.g., number of awards listed, roles held like 'Principal Investigator').
# - Any indicators of leadership or seniority (e.g., award types like 'Career Award' if present, consistent PI roles).

# Please identify the suggested PI clearly by name and justify your choice thoroughly using specific evidence from the provided context. If the data is insufficient to make a strong recommendation for any particular candidate, please state that clearly as well.
# """

# print("--- Sending Request to Gemini ---")

# start_time = time.time()

# contents = [full_prompt] # Pass the combined prompt as a single text part

# try:
#     # *** Ensure 'model' is your loaded and configured Gemini model object ***
#     responses = model.generate_content(contents, stream=True)

#     # --- 5. Process and Print the Response ---
#     print("\n-------Response--------")
#     full_response_text = ""
#     for response in responses:
#         print(response.text, end="")
#         full_response_text += response.text # Accumulate the full response if needed later
#     print("\n-----------------------")

#     response_time = time.time() - start_time
#     print(f"\nResponse generated in {response_time:.2f} seconds.")

# except AttributeError:
#      print("\nError: 'model' object not found or not configured correctly.")
#      print("Please ensure the 'model' variable holds your loaded Gemini model.")
# except Exception as e:
#     print(f"\nAn error occurred during the API call: {e}")
#     response_time = time.time() - start_time # Measure time even if error occurs
#     print(f"Attempt failed after {response_time:.2f} seconds.")

In [6]:
df.shape

(83112, 14)

In [7]:
def filter_data_by_pi(df: pd.DataFrame, pi_ids: List[str]) -> pd.DataFrame:
    """
    Filters the DataFrame to include only rows matching the provided PI IDs.

    Args:
        df: The input DataFrame.
        pi_ids: A list of PI IDs (strings) to filter by.

    Returns:
        A DataFrame containing only the rows for the specified PI IDs.
    """
    print(f"Filtering DataFrame for PI IDs: {pi_ids}...")
    filtered = df[df['pi_id'].isin(pi_ids)].copy()
    print(f"Found {len(filtered)} relevant entries.")
    return filtered

In [8]:
def format_pi_data_for_prompt(filtered_df: pd.DataFrame, pi_ids_to_format: List[str]) -> Tuple[str, Dict[str, str]]:
    """
    Formats the filtered PI data into a string suitable for the prompt context.

    Args:
        filtered_df: The DataFrame already filtered for relevant PIs.
        pi_ids_to_format: The original list of PI IDs requested, to ensure all are mentioned.


    Returns:
        A tuple containing:
            - formatted_data_string: A string with formatted details for each PI.
            - pi_names_dict: A dictionary mapping PI ID to PI full name.
    """
    print("Formatting data for prompt...")
    formatted_data = ""
    pi_names = {} # Dictionary to store PI names

    if filtered_df.empty:
        print("Warning: Filtered DataFrame is empty. Formatting 'no data' message.")
        formatted_data = "No data could be retrieved for the specified potential collaborators.\n"
        for pi_id in pi_ids_to_format:
             pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name
        return formatted_data, pi_names

    # Iterate through the original list to ensure all requested PIs are accounted for
    for pi_id in pi_ids_to_format:
        pi_specific_data = filtered_df[filtered_df['pi_id'] == pi_id]

        if not pi_specific_data.empty:
            # Get consistent name and department from the first entry
            full_name = pi_specific_data['pi_full_name'].iloc[0]
            department = pi_specific_data['department'].iloc[0]
            pi_names[pi_id] = full_name

            formatted_data += f"--- Researcher: {full_name} (ID: {pi_id}) ---\n"
            formatted_data += f"Department: {department}\n"
            formatted_data += "Relevant Roles & Awards Found:\n"

            for index, row in pi_specific_data.iterrows():
                formatted_data += f"- Role: {row.get('role', 'N/A')}\n"
                formatted_data += f"  Award Title: {row.get('award_title', 'N/A')}\n"
                formatted_data += f"  Start Date: {row.get('start_date', 'N/A')}\n"
                abstract_preview = textwrap.shorten(row.get('abstract', 'N/A'), width=200, placeholder="...")
                formatted_data += f"  Abstract Snippet: {abstract_preview}\n"
                formatted_data += f"  Program Element/Reference: {row.get('program_element', 'N/A')} / {row.get('program_reference', 'N/A')}\n\n"
        else:
            # Handle case where a specific PI ID from the list had no data in the filtered df
            formatted_data += f"--- Researcher ID: {pi_id} ---\n"
            formatted_data += "No award data found in the provided dataset for this PI.\n\n"
            pi_names[pi_id] = f"PI ID {pi_id}" # Use ID as placeholder name

    print("Data formatting complete.")
    return formatted_data, pi_names

In [9]:
def generate_recommendation_prompt(formatted_data_string: str, pi_names_dict: Dict[str, str], research_topic: str) -> str:
    """
    Generates the full prompt string for the Gemini model.

    Args:
        formatted_data_string: The formatted string containing PI details.
        pi_names_dict: A dictionary mapping PI ID to PI name.
        research_topic: The research topic for collaboration.

    Returns:
        The complete prompt string.
    """
    print("Generating prompt...")
    collaborator_names_list = ", ".join(pi_names_dict.values())

    prompt = f"""
        Context:
        The following researchers ({collaborator_names_list}) are considering collaborating on a new research project focused on the topic '{research_topic}'. Below is information extracted from a database about their previous grants and roles:

        {formatted_data_string}

        Task:
        Based *only* on the information provided above, please analyze the qualifications, experience, and relevance of past work for each researcher ({collaborator_names_list}). Recommend which of these individuals would be the most suitable Principal Investigator (PI) to lead this new collaborative project on '{research_topic}'.

        Provide a detailed explanation for your recommendation. Consider factors apparent from the data, such as:
        - Direct relevance of their past research (award titles, abstracts, program elements) to the topic '{research_topic}'.
        - Demonstrated experience (e.g., number of awards listed, roles held like 'Principal Investigator').
        - Any indicators of leadership or seniority (e.g., award types like 'Career Award' if present, consistent PI roles).

        Please identify the suggested PI clearly by name and justify your choice thoroughly using specific evidence from the provided context. If the data is insufficient to make a strong recommendation for any particular candidate, please state that clearly as well.
        """
    print("Prompt generated.")
    return prompt

In [10]:
def get_gemini_response(model: genai.GenerativeModel, prompt: str) -> Tuple[Optional[str], float]:
    """
    Sends the prompt to the Gemini model, streams the response, and measures time.

    Args:
        model: The configured Gemini model object.
        prompt: The prompt string to send to the model.

    Returns:
        A tuple containing:
            - The full response text as a string (or None if an error occurs).
            - The time taken for the API call in seconds.
    """
    print("--- Sending Request to Gemini ---")
    start_time = time.time()
    full_response_text = ""
    contents = [prompt] # Prepare contents for the API

    try:
        responses = model.generate_content(contents, stream=True)

        print("\n-------Response--------")
        for response in responses:
            print(response.text, end="")
            full_response_text += response.text
        print("\n-----------------------")

        response_time = time.time() - start_time
        print(f"\nResponse generated in {response_time:.2f} seconds.")
        return full_response_text, response_time

    except AttributeError:
        response_time = time.time() - start_time
        print("\nError: 'model' object not found or not configured correctly.")
        print("Please ensure the 'model' variable holds your loaded Gemini model.")
        return None, response_time
    except Exception as e:
        response_time = time.time() - start_time
        print(f"\nAn error occurred during the API call: {e}")
        print(f"Attempt failed after {response_time:.2f} seconds.")
        return None, response_time

In [11]:
def recommend_pi(df: pd.DataFrame, model: genai.GenerativeModel, pi_ids: List[str], research_topic: str) -> Optional[str]:
    """
    Orchestrates the process of filtering data, formatting, generating prompt,
    and getting a PI recommendation from the Gemini model.

    Args:
        df: The main DataFrame.
        model: The configured Gemini model object.
        pi_ids: A list of PI IDs to consider.
        research_topic: The topic for collaboration.

    Returns:
        The recommendation text from the model, or None if an error occurred
        or essential steps failed.
    """
    print(f"\n--- Starting PI Recommendation Process for Topic: '{research_topic}' ---")

    # 1. Filter Data
    filtered_data = filter_data_by_pi(df, pi_ids)
    # Optional: Add a check here if you want to stop if no data is found at all
    # if filtered_data.empty:
    #     print("Stopping process as no data was found for any specified PI.")
    #     return None

    # 2. Format Data
    # Pass the original pi_ids list to ensure all are mentioned in formatting
    formatted_text, pi_names = format_pi_data_for_prompt(filtered_data, pi_ids)

    # 3. Generate Prompt
    prompt_text = generate_recommendation_prompt(formatted_text, pi_names, research_topic)

    # 4. Get Response
    recommendation, duration = get_gemini_response(model, prompt_text)

    print(f"--- PI Recommendation Process Complete ({duration:.2f}s) ---")
    return recommendation

In [12]:
if __name__ == "__main__":
    
    # Example model setup (replace with your actual model)
    # Note: You need to handle API key configuration securely
    try:
        # -----------------------------------------------------------------

        # --- Input Parameters ---
        pi_ids_to_analyze = ['000025017', '000025762', '000030655']
        research_topic = 'STATISTICS'

        # --- Run the Recommendation Process ---
        recommendation_result = recommend_pi(df, model, pi_ids_to_analyze, research_topic)

        # Optional: Do something with the result
        if recommendation_result:
            print("\n--- Final Recommendation Text ---")
            # print(recommendation_result) # Already printed during streaming
            pass # Result is already printed by get_gemini_response
        else:
            print("\nRecommendation could not be generated.")

    except ImportError:
        print("Please install required libraries: pip install pandas google-generativeai")
    except Exception as e:
        print(f"An error occurred during setup or execution: {e}")


--- Starting PI Recommendation Process for Topic: 'STATISTICS' ---
Filtering DataFrame for PI IDs: ['000025017', '000025762', '000030655']...
Found 8 relevant entries.
Formatting data for prompt...
Data formatting complete.
Generating prompt...
Prompt generated.
--- Sending Request to Gemini ---

-------Response--------
## Analysis of Researcher Qualifications and Experience for 'STATISTICS' Project:

Here's an analysis of each researcher based on the provided data, focusing on their suitability as PI for a 'STATISTICS' project:

**1. Robert D Palmer (ID: 000025017)**

* **Department:** Meteorology
* **Relevance to 'STATISTICS':**  Low. His research is centered around weather radar technology (RaXPol), atmospheric research, and meteorological education. The keywords in his award titles and abstracts are "Radar," "Weather," "Atmospheric," "Mobile," and "Education."  There is no explicit mention of statistical methodologies, statistical theory, or statistical applications as the primary

In [None]:
df[df['pi_id'].isin(pi_ids_to_analyze)][['pi_full_name', 'pi_id']]

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Combine relevant text columns into one (you may adjust columns as needed)
text_columns = [
    "award_type", "award_title", "abstract", 
    "org_name", "org_name2", "perf_inst_name", 
    "program_element", "program_reference"
]
df["combined_text"] = df[text_columns].astype(str).agg(" ".join, axis=1)

# a. Leadership indicator: 1 if role suggests prior leadership (e.g., contains "Principal Investigator")
df["leadership"] = df["role"].apply(lambda x: 1 if "Principal Investigator" in str(x) else 0)

# b. Experience in years: use start_date and a reference date (here we use today)
df["start_date"] = pd.to_datetime(df["start_date"], errors='coerce')
reference_date = datetime.now()  # or use a fixed project date
df["experience_years"] = (reference_date - df["start_date"]).dt.days / 365.25

# Load a pre-trained sentence transformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embedding for each award's combined text
df["text_embedding"] = df["combined_text"].apply(lambda x: embedder.encode(x))

# We assume each row has a researcher ID ("pi_id"). If a researcher has multiple rows, we aggregate.
# For aggregated text, we average the embeddings; for numeric features, we use appropriate aggregation.
award_counts = df.groupby("pi_id").size().reset_index(name="award_count")
df_grouped = df.groupby("pi_id").agg({
    "experience_years": "mean",       # average experience across awards
    "leadership": "max",              # if they have ever been a PI, mark as leadership
    "text_embedding": lambda embs: np.mean(np.stack(embs), axis=0)
}).reset_index()
df_grouped = df_grouped.merge(award_counts, on="pi_id", how="left")

# For later scoring, normalize the numeric features (experience and award_count)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_grouped[["exp_norm", "award_norm"]] = scaler.fit_transform(df_grouped[["experience_years", "award_count"]])

In [None]:
def rank_candidates(research_topic, candidate_ids, base_weight=0.5, topic_weight=0.5):
    # Compute embedding for the research topic
    topic_emb = embedder.encode(research_topic)
    
    candidate_scores = []
    for cid in candidate_ids:
        candidate = df_grouped[df_grouped["pi_id"] == cid].iloc[0]
        
        # Topic relevance score: cosine similarity between candidate's aggregated embedding and the topic
        candidate_emb = candidate["text_embedding"]
        relevance_score = cosine_similarity([candidate_emb], [topic_emb])[0][0]
        
        # Base score: a simple weighted sum of normalized features plus a bonus for leadership
        # Adjust weights as needed. Here, leadership gets a bonus of 1 if present.
        base_score = candidate["exp_norm"] + candidate["award_norm"] + (1 if candidate["leadership"] == 1 else 0)
        
        # Combined score: weighted combination of base score and topic relevance
        combined_score = base_weight * base_score + topic_weight * relevance_score
        candidate_scores.append(combined_score)
    
    candidate_scores = np.array(candidate_scores)
    best_index = np.argmax(candidate_scores)
    pi_candidate = candidate_ids[best_index]
    co_pi_candidates = [cid for i, cid in enumerate(candidate_ids) if i != best_index]
    
    return pi_candidate, co_pi_candidates, candidate_scores

In [None]:
# Test with various topics dynamically.
for test_topic in ["knowledge graph", "AI", "Neuroscience", "STATISTICS"]:
    print("Testing with topic:", test_topic)
    pi_candidate, co_pi_candidates, scores = rank_candidates(test_topic, pi_ids_to_analyze)
    print("Predicted PI:", pi_candidate)
    print("Predicted Co-PIs:", co_pi_candidates)
    print("Candidate Combined Scores:", scores)
    print("-" * 50)

In [None]:
df[df.pi_id.isin(pi_ids_to_analyze)][['pi_id', 'pi_full_name', 'role', 'department', 'leadership', 'experience_years', 'program_element']]