In [None]:
import requests
import json
import urllib.parse
import pandas as pd
import os

# Google Custom Search credentials, read from the environment so that no secret
# is stored in the notebook. A missing variable raises KeyError right here.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
SEARCH_ENGINE_ID = os.environ["SEARCH_ENGINE_ID"]

def linkedin_profile_search(query: str, university_names: list = None, course_names: list = None, index: int = 1, top_n: int = None, timeout: float = 10.0):
    """
    Find the URL(s) of a Linkedin profile for a person using Google Custom Search,
    including options for university and course names, and returning the top N results.

    Credentials are taken from the GOOGLE_API_KEY and SEARCH_ENGINE_ID environment
    variables; they are never embedded in the code.

    Args:
        query: The person's full name whose profile you want to find on Linkedin.
        university_names: An optional list of university names the person might have studied at.
        course_names: An optional list of course or degree names the person might have completed.
        index: Get a specific result index (1-based). Defaults to 1.
               If top_n is specified, index is ignored.
               If index is 0, returns all available filtered results (up to API's num limit),
               each formatted as "<position> <url>".
        top_n: An optional integer specifying the number of top filtered results to return.
               If specified, the 'index' parameter is ignored.
        timeout: Seconds to wait for the API before giving up. Defaults to 10.

    Returns:
        A single URL string if index is used (and found),
        A list of URLs if index is 0 or top_n is used,
        or None/empty list if no relevant results are found.

    Raises:
        requests.exceptions.RequestException: If there's an issue with the API request
            (including a timeout).
        ValueError: If the API response cannot be parsed as JSON or if the credentials
            are missing from the environment.
        KeyError: If the expected keys are not found in the API response.
        Exception: For other unexpected errors during processing.
    """
    # Read the credentials at call time so a secret never lives in the source.
    api_key = os.environ.get("GOOGLE_API_KEY", "")
    search_engine_id = os.environ.get("SEARCH_ENGINE_ID", "")

    # Reject both unset variables and the template placeholders.
    if api_key in ("", "yourApiKey") or search_engine_id in ("", "yourSearchEngineId"):
        raise ValueError(
            "Missing credentials: set the GOOGLE_API_KEY and SEARCH_ENGINE_ID environment variables."
        )

    url_query = "https://www.googleapis.com/customsearch/v1"

    # --- Constructing the search query ---
    # Start with the person's full name as an exact phrase.
    search_terms = [f'"{query}"']

    # Alternative universities are OR-ed together, each as an exact phrase.
    if university_names:
        university_clauses = [f'"{name}"' for name in university_names]
        search_terms.append(f'({" OR ".join(university_clauses)})')

    # Alternative courses are OR-ed together, each as an exact phrase.
    if course_names:
        course_clauses = [f'"{name}"' for name in course_names]
        search_terms.append(f'({" OR ".join(course_clauses)})')

    # Combine all terms with AND and restrict the search to LinkedIn profiles only.
    q = " AND ".join(search_terms)
    q += " site:linkedin.com/in/"
    # --- End of query construction ---

    params = {
        "key": api_key,
        "cx": search_engine_id,
        "q": q,
        "num": 10  # Request up to 10 results
    }

    # top_n takes precedence over index whenever it is a non-negative integer.
    wants_top_n = isinstance(top_n, int) and top_n >= 0

    try:
        # Without a timeout a single stalled connection would block a batch run forever.
        response = requests.get(url_query, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        res = response.json()

        # No 'items' key means the search produced no hits.
        if 'items' not in res:
            # Return based on the requested output format
            return [] if (index == 0 or wants_top_n) else None

        # Keep only person-profile URLs (/in/). The site: operator should already
        # guarantee this, but the extra check is cheap.
        candidate_urls = (item.get("formattedUrl") for item in res.get("items", []))
        filtered_output = [url for url in candidate_urls if url and "/in/" in url]

        if wants_top_n:
            return filtered_output[:top_n]
        return _get_results_indexed(filtered_output, index)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from Google Custom Search API: {e}")
        raise
    except json.JSONDecodeError as e:
        print("Error decoding JSON response from API.")
        raise ValueError("Invalid JSON response from API.") from e
    except (KeyError, TypeError) as e:
        print(f"Error parsing API response structure: {e}")
        raise KeyError(f"Unexpected API response structure: {e}") from e
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise


def _get_results_indexed(data: list, index: int):
    """
    Helper function to get results based on the index.
    Used when top_n is NOT specified.

    Args:
        data: A list of result URLs.
        index: The index of the desired result (1-based) or 0 for all results.

    Returns:
        A single URL string, a list of indexed URLs, or None/empty list.
    """
    if index == 0:
        # Return all results with 1-based indexing prefix
        if not data:
            return []
        # Note: The original JS returned a flat list of strings like "1 url", "2 url"
        # This Python version returns a list of strings, matching that format.
        return [f"{i+1} {url}" for i, url in enumerate(data)]
    elif 1 <= index <= len(data):
        # Return the specific result at the given index
        return data[index - 1]
    else:
        # Index is out of bounds or data is empty for a specific index request
        return None



In [2]:

# Search filters: university name variants and course names.
universities = [
    "University of Amsterdam",
    "Universiteit van Amsterdam",
]
courses = ["Accountancy"]

In [3]:
# Load the cleaned alumni list from the working directory.
alumn = pd.read_csv(filepath_or_buffer="alumni_cleaned.csv")

In [4]:
# Build the search name as "<first_name> <last_name>".
alumn["name"] = alumn["first_name"].add(" ").add(alumn["last_name"])

In [5]:
# Display the alumni frame, which now includes the combined `name` column.
alumn

Unnamed: 0,full_name,program,student_id,first_name,last_name,name
0,"Christiaanse, C.",EMFC,EMFC_001,C.,Christiaanse,C. Christiaanse
1,"Daals, H.M.",EMFC,EMFC_002,H.M.,Daals,H.M. Daals
2,"Haan, B.H. de",EMFC,EMFC_003,B.H. de,Haan,B.H. de Haan
3,"Schomakers, M.",EMFC,EMFC_004,M.,Schomakers,M. Schomakers
4,"Slot, G.",EMFC,EMFC_005,G.,Slot,G. Slot
...,...,...,...,...,...,...
4928,"Duijst,Mariëlle",PMA,PMA_4929,Mariëlle,Duijst,Mariëlle Duijst
4929,"Cok,Quinty",PMA,PMA_4930,Quinty,Cok,Quinty Cok
4930,"Mel,Floor",PMA,PMA_4931,Floor,Mel,Floor Mel
4931,"Norder,Bram",PMA,PMA_4932,Bram,Norder,Bram Norder


In [6]:
# Batch for this run: the 100 rows at positions 4533 (inclusive) to 4633 (exclusive).
test = alumn.iloc[4533:4633]

In [7]:
# Inspect the batch selected for this run.
test

Unnamed: 0,full_name,program,student_id,first_name,last_name,name
4533,"Bakker,Kevin",PMA,PMA_4534,Kevin,Bakker,Kevin Bakker
4534,"Burgt,Erik van de",PMA,PMA_4535,Erik van de,Burgt,Erik van de Burgt
4535,"Kuijt,Marco",PMA,PMA_4536,Marco,Kuijt,Marco Kuijt
4536,"Smeets,Sheila",PMA,PMA_4537,Sheila,Smeets,Sheila Smeets
4537,"Dienst,Mike van",PMA,PMA_4538,Mike van,Dienst,Mike van Dienst
...,...,...,...,...,...,...
4628,"Lemaire,Suzanne",PMA,PMA_4629,Suzanne,Lemaire,Suzanne Lemaire
4629,"Meij,Paul de",PMA,PMA_4630,Paul de,Meij,Paul de Meij
4630,"Langius,Rutger",PMA,PMA_4631,Rutger,Langius,Rutger Langius
4631,"Akker,Mello van den",PMA,PMA_4632,Mello van den,Akker,Mello van den Akker


In [8]:
# Look up the top LinkedIn hit for every name in the batch.
# Results are gathered in a plain list and turned into a DataFrame once:
# calling pd.concat inside the loop would re-copy all earlier rows on every
# iteration.
records = []
for name in test['name']:
    # Use the filter lists defined earlier rather than repeating their values here.
    urls = linkedin_profile_search(name, universities, course_names=courses, top_n=1)
    records.append({"name": name, "urls": urls})

df = pd.DataFrame(records, columns=["name", "urls"])


In [9]:
# Number of names for which at least one profile URL was returned.
int(df["urls"].map(len).gt(0).sum())

57

In [10]:
# Give each result the row label of the alumnus it was searched for.
# `df` holds one row per row of `test`, in the same order, so the labels line
# up one-to-one; hard-coded numbers here can silently disagree with the slice.
df.index = test.index

In [11]:
# Name the file after the label range actually stored in the frame
# (first label inclusive, last label + 1 exclusive) so the file name cannot
# drift away from its contents.
df.to_csv(f"saved_alumni_{df.index[0]}_{df.index[-1] + 1}.csv")

In [12]:
# Show the collected names together with their search results.
df

Unnamed: 0,name,urls
4733,Zhina Chaichi,[]
4734,Don Schulkes,[]
4735,Ivar Willemstein,[]
4736,Bart de Vries,[https://nl.linkedin.com/in/joost-winkelhorst-...
4737,Kathrina Ty,[https://nl.linkedin.com/in/kathrina-ty-ra-888...
...,...,...
4828,Adriaan de Wit,[https://au.linkedin.com/in/adriaandewit]
4829,Jordi Conijn,[]
4830,Karim R'Gui,[]
4831,SIDNEY David,[https://nl.linkedin.com/in/bas-werkhoven-msc-...
