# 1. Extracting themes and expertise

In [None]:
import os
import json
import pandas as pd

# New import path for v1:
from openai import OpenAI


# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into this file; the "xxx" placeholder remains the fallback value.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "xxx")  # Add OpenAI key
)

# Configuration
MODEL_NAME = "gpt-3.5-turbo"
MAX_CHARS_PER_CHUNK = 4000  # measured in characters, not tokens

def chunk_text(text, chunk_size=MAX_CHARS_PER_CHUNK):
    """
    Split text into consecutive pieces of at most chunk_size characters.

    Splitting keeps each piece small so a single request does not exceed
    token limits. Pieces are returned in order; the last one may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece (must be > 0).

    Returns:
        A list of substrings; an empty list when text is empty.

    Raises:
        ValueError: If chunk_size is not positive. A non-positive size would
            otherwise never advance through the text and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

def call_openai_chat(messages):
    """
    Send a chat-completion request through the module-level openai v1 client:
    client.chat.completions.create(...)

    Args:
        messages: List of {"role": ..., "content": ...} dicts, passed through
            unchanged to the API.

    Returns:
        The text of the first choice. Returns "" when the request fails or
        when the reply carries no text content, so callers can always treat
        the result as a string.
    """
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.0,   # 0 for deterministic
            max_tokens=1000,   # Adjust if you want longer or shorter responses
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: one failed request is reported and skipped
        # rather than aborting the whole batch run.
        print(f"Error calling OpenAI API: {e}")
        return ""
    # message.content is optional in the API response; never hand None to
    # callers that immediately call .split()/.strip() on the result.
    return content or ""

def _string_items(value):
    """Return the string entries of value, or [] when value is not a list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str)]

def _parse_classification(raw_reply):
    """Parse the classifier reply into {"Themes": [...], "Expertise": [...]}."""
    # The reply may be wrapped in a Markdown code fence or surrounded by a
    # sentence, so only the outermost {...} span is handed to the JSON parser.
    start = raw_reply.find("{")
    end = raw_reply.rfind("}")
    parsed = None
    if start != -1 and end > start:
        try:
            parsed = json.loads(raw_reply[start:end + 1])
        except json.JSONDecodeError:
            parsed = None

    # Valid JSON that is not an object (e.g. a bare array) is unusable too.
    if not isinstance(parsed, dict):
        print("Warning: GPT classification not in JSON format. Returning empty.")
        return {"Themes": [], "Expertise": []}

    return {
        "Themes": _string_items(parsed.get("Themes", [])),
        "Expertise": _string_items(parsed.get("Expertise", [])),
    }

def extract_and_classify(text):
    """
    1) Split the researcher's text into chunks.
    2) For each chunk, request up to 40 keywords/phrases.
    3) Combine them, then classify into "Themes" vs. "Expertise" in JSON.

    Args:
        text: The researcher's bio/publication text.

    Returns:
        A dict {"Themes": [...], "Expertise": [...]} whose values are lists
        of strings. Both lists are empty when no keywords were extracted or
        when the classification reply could not be parsed.
    """
    # A dict is used as an insertion-ordered set: duplicates are dropped while
    # the keyword order (and therefore the classification prompt) stays the
    # same from run to run.
    all_keywords = {}

    # Extract up to 40 keywords from each chunk
    for chunk in chunk_text(text):
        if not chunk.strip():
            continue

        prompt_chunk = f"""
You are a specialized assistant with expertise in linguistic, cognitive, and social science research.
Below is a text chunk from a researcher's bio/publications:

\"\"\"{chunk}\"\"\"

Identify up to 40 specific and actionable keywords or phrases that represent:
1. "Research Themes" - Domains/subdomains (e.g., semantics, cognitive science, hearing loss, primate behavior), Specific theories (e.g., iconicity, motor theory, prosodic prominence)
2. "Core Expertise" - Research methods (EEG, MRI, production/perception experiments), Data analysis methods (Bayesian stats, linear mixed-effects models, etc.)

Avoid overly generic terms like "language," "communication," "research," "education," or "methodology."
Focus on unique and impactful keywords that highlight this researcher’s work.

Respond with a semicolon-separated list of keywords. If no relevant content is found, return an empty list.
"""

        messages_chunk = [
            {"role": "system", "content": "You are a helpful research assistant."},
            {"role": "user", "content": prompt_chunk}
        ]
        # Guard against a None reply so .split() below cannot fail.
        chunk_result = call_openai_chat(messages_chunk) or ""

        # chunk_result might be something like: "Primate behavior; Cognition; EEG; ..."
        stripped = (k.strip() for k in chunk_result.split(";"))
        all_keywords.update(dict.fromkeys(k for k in stripped if k))

    # If no keywords found at all
    if not all_keywords:
        return {"Themes": [], "Expertise": []}

    # Combine them into one string
    combined_keywords = "; ".join(all_keywords)
    classify_prompt = f"""
We have this list of keywords/phrases:
\"\"\"{combined_keywords}\"\"\"

1. Separate them into two categories: "Research Themes" (broad domain areas) 
   and "Core Expertise" (methods, technical skills, or specific tools).
2. Limit each category to between 1 and 30 items, based on relevance.
3. Return the result in JSON format exactly, like:

{{
  "Themes": ["Theme1", "Theme2", ...],
  "Expertise": ["Expertise1", "Expertise2", ...]
}}

If you cannot fill some category, return an empty array for it.
"""
    messages_classify = [
        {"role": "system", "content": "You are a helpful research assistant."},
        {"role": "user", "content": classify_prompt}
    ]
    classify_result = (call_openai_chat(messages_classify) or "").strip()

    return _parse_classification(classify_result)

def main():
    """
    Read the processed researcher CSV, extract themes and expertise for each
    row's "Text", and write the table back out with two extra columns,
    "Themes" and "Expertise", holding semicolon-separated strings.
    """
    source_path = "Data_clean/06. Processed_Researcher_Data.csv"
    target_path = "Data_clean/08.researchers_with_themes_expertise_openai.csv"

    frame = pd.read_csv(source_path)
    # Missing text becomes an empty string so every row can be processed.
    frame["Text"] = frame["Text"].fillna("")

    theme_column = []
    expertise_column = []

    for _, record in frame.iterrows():
        researcher = record["Full Name"]
        bio_text = str(record["Text"])
        print(f"Processing {researcher}...")

        extracted = extract_and_classify(bio_text)
        # Flatten each list into one semicolon-separated cell value
        joined_themes = "; ".join(extracted["Themes"])
        joined_expertise = "; ".join(extracted["Expertise"])

        theme_column.append(joined_themes)
        expertise_column.append(joined_expertise)

    frame["Themes"] = theme_column
    frame["Expertise"] = expertise_column

    frame.to_csv(target_path, index=False)
    print(f"Done! Results saved to {target_path}")

if __name__ == "__main__":
    main()