# 1. Set Up the OpenAlex API Integration


In [None]:
pip install requests pandas

# Step 1: Load and Inspect the Data

In [14]:
import pandas as pd
# Load researcher data
file_path = "Data_clean/01_participants_with_geo.csv"
researchers = pd.read_csv(file_path)

# Inspect the data
print(researchers.head())

            full_name   first_name last_name tittle  \
0      Federica Amici     Federica     Amici    Dr.   
1   Carly A. Anderson     Carly A.  Anderson    NaN   
2  Mailin Ines Antomo  Mailin Ines    Antomo    Dr.   
3         Nadine Bade       Nadine      Bade    Dr.   
4      Kathryn Barnes      Kathryn    Barnes    NaN   

                   Affiliation                                 Email  \
0        University of Leipzig                      amici@eva.mpg.de   
1    University College London              carly.anderson@ucl.ac.uk   
2      University of Göttingen  mailin.antomo@phil.uni-goettingen.de   
3        University of Potsdam            nadine.bade@uni-potsdam.de   
4  Goethe University Frankfurt        barnes@lingua.uni-frankfurt.de   

                                               Photo   Latitude  Longitude  
0  https://i0.wp.com/vicom.info/wp-content/upload...  51.338574  12.378462  
1  https://i0.wp.com/vicom.info/wp-content/upload...  51.524559  -0.134040  
2  ht

# Step 3: Search for ALl Researchers in OpenAlex


In [54]:
import requests
import pandas as pd

# Load researcher data
def load_researcher_data(file_path):
    """Load researchers' data from a CSV file."""
    return pd.read_csv(file_path)

# Search for a researcher in OpenAlex
def search_researcher(name):
    """Search for a researcher by name in OpenAlex."""
    url = "https://api.openalex.org/authors"
    params = {"filter": f"display_name.search:{name}", "per-page": 5}
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get("results", [])
    else:
        print(f"Error {response.status_code}: {response.text}")
        return []

# Fetch works (publications) for a researcher by author ID
def fetch_publications(author_id, start_year=2018):
    """Fetch works for a researcher by author ID."""
    url = "https://api.openalex.org/works"
    params = {
        "filter": f"author.id:{author_id},from_publication_date:{start_year}-01-01",
        "per-page": 50
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get("results", [])
    else:
        print(f"Error {response.status_code}: {response.text}")
        return []


def fetch_all_publications(author_id, start_year=2018):
    """Fetch all publications for a researcher by author ID, handling pagination."""
    url = "https://api.openalex.org/works"
    publications = []
    page = 1  # Start from the first page

    while True:
        params = {
            "filter": f"author.id:{author_id},from_publication_date:{start_year}-01-01",
            "per-page": 50,  # Maximum allowed per page
            "page": page
        }
        response = requests.get(url, params=params)

        if response.status_code == 200:
            results = response.json().get("results", [])
            publications.extend(results)
            
            # Check if there are more pages
            if "next" in response.json() and response.json()["next"]:
                page += 1
            else:
                break
        else:
            print(f"Error fetching publications: {response.status_code} - {response.text}")
            break

    return publications




# Reconstruct the abstract from OpenAlex's inverted index
def reconstruct_abstract(inverted_index):
    """Reconstruct the abstract from OpenAlex's inverted index."""
    if not inverted_index:
        return "Abstract not available"
    abstract = []
    for word, positions in inverted_index.items():
        for position in positions:
            while len(abstract) <= position:
                abstract.append("")
            abstract[position] = word
    reconstructed = " ".join(abstract)
    return reconstructed.lstrip("Abstract").strip()


# Extract publication source with link
def construct_source_with_openalex_id(publication):
    """Construct a source column with a clickable OpenAlex ID link."""
    # Get the OpenAlex ID from the publication metadata
    openalex_id = publication.get("id")
    
    if openalex_id:
        # Return a clickable OpenAlex link
        return f'<a href="{openalex_id}" target="_blank">OpenAlex</a>'
    else:
        # Fallback if the OpenAlex ID is missing
        return "OpenAlex ID not available"



def extract_primary_topic(publication):
    """Extract the primary topic of a publication."""
    primary_topic = publication.get("primary_topic")
    if primary_topic and isinstance(primary_topic, dict):
        return primary_topic.get("display_name", "Topic not available")
    return "Topic not available"


# Add open access information
def get_open_access_status(publication):
    """Get the open access status of a publication."""
    open_access = publication.get("open_access", {})
    return open_access.get("is_oa", False)


def check_affiliation_match(publication, researcher_affiliation):
    """Check if the affiliation in the publication matches the researcher's affiliation."""
    # Extract institutions from the publication
    publication_institutions = [
        inst.get("display_name", "Unknown")
        for authorship in publication.get("authorships", [])
        for inst in authorship.get("institutions", [])
    ]

    # Check if the researcher's affiliation is listed in the publication institutions
    matched_affiliation = "Yes" if researcher_affiliation in publication_institutions else "No"

    # Define review flag logic
    review_flag = "Yes" if matched_affiliation == "No" or not publication_institutions else "No"

    return ", ".join(publication_institutions), matched_affiliation, review_flag




# Process a single researcher
def process_researcher(row):
    """Process a single researcher to retrieve and validate their publications."""
    full_name = row['full_name']
    first_name, last_name = full_name.split()[0], full_name.split()[-1]
    researcher_affiliation = row['Affiliation']
    print(f"Processing researcher: {full_name} with affiliation: {researcher_affiliation}")

    # Search for researcher in OpenAlex using full name
    matches = search_researcher(full_name)

    # If no matches, search using first and last name only
    if not matches:
        short_name = f"{first_name} {last_name}"
        print(f"No matches found for {full_name}, trying {short_name}")
        matches = search_researcher(short_name)

    # If still no matches, add a placeholder row and return
    if not matches:
        print(f"No matches found for {full_name}")
        return [{
            "Researcher": full_name,
            "Year": None,
            "Type": "No publications found",
            "Title": None,
            "Abstract": None,
            "Source": None,
            "Authors": None,
            "Database Affiliation": researcher_affiliation,
            "Publication Institutions": None,
            "Matched Affiliation": "No",
            "Topic": None,
            "Review Flag": "Yes",  # Flag for manual review
        }]

    # Use the first match
    author_id = matches[0]["id"]
    print(f"Selected match: {matches[0]['display_name']} with ID: {author_id}")

    # Fetch all publications (handles pagination)
    publications = fetch_all_publications(author_id)
    if not publications:
        print(f"No publications found for {full_name}")
        # Add a placeholder row for researchers with no publications
        return [{
            "Researcher": full_name,
            "Year": None,
            "Type": "No publications found",
            "Title": None,
            "Abstract": None,
            "Source": None,
            "Authors": None,
            "Database Affiliation": researcher_affiliation,
            "Publication Institutions": None,
            "Matched Affiliation": "No",
            "Topic": None,
            "Review Flag": "Yes",  # Flag for manual review
        }]

    # Collect valid publications
    valid_publications = []
    for pub in publications:
        # Extract title
        title = pub.get("title", "Title not available")

        # Reconstruct abstract
        abstract = reconstruct_abstract(pub.get("abstract_inverted_index", {}))

        # Construct source with OpenAlex link
        source = construct_source_with_openalex_id(pub)

        # Check for affiliation match and flag if needed
        publication_institutions, matched_affiliation, review_flag = check_affiliation_match(pub, researcher_affiliation)

        # Extract topics
        topics = extract_primary_topic(pub)

        valid_publications.append({
            "Researcher": full_name,
            "Year": pub.get("publication_year"),
            "Type": pub.get("type", "N/A"),
            "Title": title,
            "Abstract": abstract,
            "Source": source,
            "Authors": ", ".join([auth["author"]["display_name"] for auth in pub.get("authorships", [])]),
            "Database Affiliation": researcher_affiliation,
            "Publication Institutions": publication_institutions,
            "Matched Affiliation": matched_affiliation,
            "Topic": topics,
            "Review Flag": review_flag,
        })

    print(f"Returning {len(valid_publications)} valid publications for {full_name}")
    return valid_publications



# Process a subset of researchers
def process_all(input_file, output_file):
    """Process all researchers and save results to a CSV file."""
    researchers = load_researcher_data(input_file)
    all_publications = []

    for _, row in researchers.iterrows():
        researcher_publications = process_researcher(row)
        all_publications.extend(researcher_publications)

    # Save results to CSV
    if all_publications:
        pd.DataFrame(all_publications).to_csv(output_file, index=False)
        print(f"Saved results to {output_file}")
    else:
        print("No publications to save.")


# Main function to run the subset pipeline
if __name__ == "__main__":
    input_file = "Data_clean/01_participants_with_geo.csv"  # Path to your input file
    output_file = "ViCom_Publications_Full.csv"  # Path to save results
    process_all(input_file, output_file)

Processing researcher: Federica Amici with affiliation: University of Leipzig
Selected match: Federica Amici with ID: https://openalex.org/A5005613346
Returning 50 valid publications for Federica Amici
Processing researcher: Carly A. Anderson with affiliation: University College London
Selected match: Carly A. Anderson with ID: https://openalex.org/A5000467902
Returning 7 valid publications for Carly A. Anderson
Processing researcher: Mailin Ines Antomo with affiliation: University of Göttingen
No matches found for Mailin Ines Antomo, trying Mailin Antomo
Selected match: Mailin Antomo with ID: https://openalex.org/A5084666745
Returning 8 valid publications for Mailin Ines Antomo
Processing researcher: Nadine Bade with affiliation: University of Potsdam
Selected match: Nadine Bade with ID: https://openalex.org/A5008433566
Returning 32 valid publications for Nadine Bade
Processing researcher: Kathryn Barnes with affiliation: Goethe University Frankfurt
Selected match: Kathryn Barnes-Burr