In [1]:
import os
import json
import pandas as pd
import re
from collections import defaultdict
from typing import List
from dotenv import load_dotenv
from IPython.display import display

In [2]:
load_dotenv()
S2_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

In [3]:
NSF_RAW_DATA_DIR = "../data/ranking_data/" 
if not os.path.exists(NSF_RAW_DATA_DIR):
    print(f"Error: Ranking data directory not found at {NSF_RAW_DATA_DIR}")

In [4]:
def safe_get(dictionary, keys):
    """Safely gets nested dictionary values."""
    for key in keys:
        if isinstance(dictionary, dict):
            dictionary = dictionary.get(key)
        else:
            return None
    return dictionary

def extract_keywords(text: str, top_n: int = 15) -> List[str]:
    """A simple keyword extractor based on word frequency (ignoring common stop words)."""
    if not isinstance(text, str):
        return []
    stop_words = {'the', 'a', 'an', 'and', 'in', 'is', 'of', 'for', 'to', 'with', 'on', 'this', 'grant', 'project', 'research', 'award', 'support', 'program'}
    words = re.findall(r'\b\w{4,}\b', text.lower()) # Find words with 4+ letters
    filtered_words = [word for word in words if word not in stop_words]
    word_counts = defaultdict(int)
    for word in filtered_words:
        word_counts[word] += 1
    return sorted(word_counts, key=word_counts.get, reverse=True)[:top_n]

In [5]:
def create_profiles_from_raw_data() -> pd.DataFrame:
    """
    Reads all raw NSF JSONs, aggregates ALL data for each unique researcher,
    and returns a DataFrame of rich profiles.
    """
    print(f"--- Step 1: Loading and Processing NSF Data from '{NSF_RAW_DATA_DIR}' ---")
    
    records = []
    for sub_dir in os.listdir(NSF_RAW_DATA_DIR):
        # sub_dir = '2024'
        sub_directory = os.path.join(NSF_RAW_DATA_DIR, sub_dir)
        if os.path.isdir(sub_directory):
            print(f"Reading files in {sub_dir}...")
            for filename in os.listdir(sub_directory):
                if filename.endswith('.json'):
                    filepath = os.path.join(sub_directory, filename)
                    try:
                        with open(filepath, 'r') as file:
                            data = json.load(file)
                        
                        # --- THE FIX IS HERE ---
                        # 1. Get the 'pi' list from the JSON data.
                        pi_list = data.get("pi")
                        
                        # 2. Add a safety check. If 'pi' is missing, is null, or is not a list, skip this file.
                        if not isinstance(pi_list, list):
                            print(f"Warning: 'pi' key in {filepath} is missing or not a list. Skipping.")
                            continue

                        # If we get here, we know pi_list is safe to iterate over.
                        award_type = data.get("awd_istr_txt")
                        award_title = data.get("awd_titl_txt")
                        abstract = data.get("abst_narr_txt")
                        org_name = data.get("org_long_name")
                        org_name2 = data.get("org_long_name2")
                        perf_inst_name = safe_get(data, ["perf_inst", "perf_inst_name"])

                        pgm_ele_list = data.get("pgm_ele")
                        program_element = pgm_ele_list[0].get("pgm_ele_long_name") if isinstance(pgm_ele_list, list) and len(pgm_ele_list) > 0 else None

                        pgm_ref_list = data.get("pgm_ref")
                        program_reference = pgm_ref_list[0].get("pgm_ref_long_name") if isinstance(pgm_ref_list, list) and len(pgm_ref_list) > 0 else None
                        
                        for pi in pi_list:
                            records.append({
                                "pi_id": pi.get("nsf_id"),
                                "pi_full_name": pi.get("pi_full_name", "").strip(),
                                "email": pi.get("pi_email_addr"),
                                "role": pi.get("proj_role_code2", "").strip() if pi.get("proj_role_code2") else None,
                                "award_title": award_title, "award_type": award_type,
                                "abstract": abstract, "perf_inst_name": perf_inst_name,
                                "org_name": org_name, "org_name2": org_name2,
                                "program_element": program_element, "program_reference": program_reference,
                            })
                    except Exception as e:
                        print(f"Error reading or processing {filepath}: {e}")
        break
    if not records:
        print("No records were processed. Returning empty DataFrame.")
        return pd.DataFrame()
        
    df = pd.DataFrame(records)
    df.dropna(subset=['pi_id', 'pi_full_name'], inplace=True)

    print(f"\nAggregating data for {df['pi_id'].nunique()} unique researchers...")
    
    text_fields = ['award_title', 'abstract', 'program_element', 'program_reference']
    df['content'] = df[text_fields].fillna('').astype(str).agg(' '.join, axis=1)

    agg_funcs = {
        'pi_full_name': (lambda x: x.mode()[0]), 'email': (lambda x: list(x.dropna().unique())),
        'role': (lambda x: list(x.unique())), 'award_title': list, 'award_type': list,
        'perf_inst_name': (lambda x: list(x.dropna().unique())),
        'org_name': (lambda x: list(x.dropna().unique())),
        'program_element': (lambda x: list(x.dropna().unique())),
        'program_reference': (lambda x: list(x.dropna().unique())), 'content': ' '.join
    }
    profiles_df = df.groupby('pi_id').agg(agg_funcs).reset_index()

    profiles_df['primary_affiliation'] = profiles_df['perf_inst_name'].apply(lambda x: x[0] if x else None)
    profiles_df['keywords'] = profiles_df['content'].apply(extract_keywords)
    
    print("--- Step 1 Complete: NSF Profiles Created ---")
    return profiles_df

In [6]:
# ranking_data = create_profiles_from_raw_data()
# ranking_data.head()

In [7]:
OUTPUT_DIR = "s2_author_data"
from scholar_api_service import SemanticScholarAPI

def run_enrichment_pipeline():
    """ The main orchestrator function. """
    profiles_df = create_profiles_from_raw_data()
    
    if profiles_df.empty:
        print("No profiles to process. Exiting.")
        return

    print("\n--- Step 2: Starting Bulk Data Fetching from Semantic Scholar ---")
    
    if not S2_API_KEY:
        print("FATAL: SEMANTIC_SCHOLAR_API_KEY not found. Exiting.")
        return
        
    s2_api = SemanticScholarAPI(api_key=S2_API_KEY)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Output will be saved to '{OUTPUT_DIR}'.")
    
    profiles_df = profiles_df.iloc[:50,:]
    total_researchers = len(profiles_df)
    print(f"Found {total_researchers} unique researcher profiles to process.")
    
    for i, profile in profiles_df.iterrows():
        pi_id = profile['pi_id']
        name = profile['pi_full_name']
        
        print(f"\n[{i+1}/{total_researchers}] Processing: {name} (NSF ID: {pi_id})")

        output_filename = os.path.join(OUTPUT_DIR, f"{pi_id}.json")
        if os.path.exists(output_filename):
            print(f"  -> Data already exists. Skipping.")
            continue

        # --- KEY CHANGE FOR EFFICIENCY ---
        # Call the new efficient search method
        candidates = s2_api.search_authors_with_content(name)
        if not candidates:
            print("  -> No candidates found on Semantic Scholar.")
            continue
            
        best_match = s2_api.find_best_author_match(candidates, profile)
        if not best_match:
            print("  -> Could not determine a confident match.")
            continue
            
        s2_id = best_match['authorId']
        
        print(f"  -> Fetching full details for S2 ID: {s2_id}...")
        author_details = s2_api.get_author_details(s2_id)
        
        if author_details:
            author_details['nsf_pi_id'] = pi_id
            with open(output_filename, 'w') as f:
                json.dump(author_details, f, indent=4)
            print(f"  -> SUCCESS: Saved detailed data to {output_filename}")
        else:
            print(f"  -> FAILED: Could not fetch details for S2 ID {s2_id}.")

    print("\n--- Full Enrichment Pipeline Complete ---")

if __name__ == "__main__":
    run_enrichment_pipeline()

--- Step 1: Loading and Processing NSF Data from '../data/ranking_data/' ---
Reading files in 2013...

Aggregating data for 17454 unique researchers...
--- Step 1 Complete: NSF Profiles Created ---

--- Step 2: Starting Bulk Data Fetching from Semantic Scholar ---
Semantic Scholar API client initialized with API key.
Output will be saved to 's2_author_data'.
Found 50 unique researcher profiles to process.

[1/50] Processing: Axel K Schmitt (NSF ID: 000002157)
  -> Data already exists. Skipping.

[2/50] Processing: Huan Z Huang (NSF ID: 000003534)
  -> Data already exists. Skipping.

[3/50] Processing: Maryann P Feldman (NSF ID: 000004317)
  -> Data already exists. Skipping.

[4/50] Processing: Ioan Bejenaru (NSF ID: 000004907)
  -> Data already exists. Skipping.

[5/50] Processing: Qing Zhou (NSF ID: 000005427)
  -> Data already exists. Skipping.

[6/50] Processing: edgar a mendoza (NSF ID: 000010704)
  -> Data already exists. Skipping.

[7/50] Processing: Martha H Conklin (NSF ID: 000