In [2]:
import requests
from bs4 import BeautifulSoup

def fetch_alma_mater(name):
    url = f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return {"name": name, "alma_mater": None}

    for row in infobox.find_all("tr"):
        th = row.find("th")
        td = row.find("td")
        if th and td:
            label = th.get_text(strip=True)
            if label in ["Alma mater", "Education"]:
                return {
                    "name": name,
                    "alma_mater": td.get_text(" | ", strip=True)
                }

    return {"name": name, "alma_mater": None}

# Example run
print(fetch_alma_mater("Bernard Arnault"))


{'name': 'Bernard Arnault', 'alma_mater': None}


In [54]:
import numpy as np

names_file = "../data/interim/forbes/names_for_wiki_2025-09-01.csv"

# load the first column (skip header row)
names_array = np.loadtxt(names_file, delimiter=",", dtype=str, skiprows=1, usecols=0)
print(names_array[2])         # → array([...])


Mark_Zuckerberg


In [17]:
import requests
from bs4 import BeautifulSoup

def scrape_and_print_inbox_data(name):
    url = f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    infobox = soup.select_one("table.infobox")
    if not infobox:
        print(f"{name}: no infobox")
        return

    for row in infobox.select("tr"):
        header = row.find("th")
        if header and header.get_text(strip=True) in ["Education", "Alma mater"]:
            data = row.find("td")
            if data:
                print(f"---- {name} ----")
                print(data.get_text(" | ", strip=True))  # prints all contents nicely
            return

    print(f"{name}: no Education/Alma mater field")

# Example
scrape_and_print_inbox_data("Elon Musk")
scrape_and_print_inbox_data("Jeff Bezos")


---- Elon Musk ----
University of Pennsylvania | ( | BA | , | BS | )
---- Jeff Bezos ----
Princeton University | ( | BSE | )


In [16]:
import requests
from bs4 import BeautifulSoup

names = "../data/interim/forbes/names_for_wiki_2025-09-01.csv"

def scrape_education(name):
    url = f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    education = []
    for row in soup.select("table.infobox tr"):
        header = row.find("th", class_="infobox-label")
        if header and header.get_text(strip=True) in ["Education", "Alma mater"]:
            data = row.find("td", class_="infobox-data")
            if data:
                for link in data.find_all("a"):
                    education.append(link.get_text(strip=True))
            break

    university = education[0] if len(education) > 0 else None
    degree = education[1] if len(education) > 1 else None

    return {"name": name, "university": university, "degree": degree}

# safer load
df = pd.read_csv(names)
names_array = df.iloc[:, 0].values

for name in names_array[:50]:
    print(scrape_education(name))

{'name': 'Elon_Musk', 'university': 'University of Pennsylvania', 'degree': 'BA'}
{'name': 'Larry_Ellison', 'university': 'University of Illinois, Urbana-Champaign', 'degree': 'University of Chicago'}
{'name': 'Mark_Zuckerberg', 'university': 'Harvard University', 'degree': None}
{'name': 'Jeff_Bezos', 'university': 'Princeton University', 'degree': 'BSE'}
{'name': 'Larry_Page', 'university': 'University of Michigan', 'degree': 'BSE'}
{'name': 'Sergey_Brin', 'university': 'University of Maryland, College Park', 'degree': 'BS'}
{'name': 'Bernard_Arnault', 'university': None, 'degree': None}
{'name': 'Steve_Ballmer', 'university': 'Harvard University', 'degree': 'BA'}
{'name': 'Jensen_Huang', 'university': 'Oregon State University', 'degree': 'BS'}
{'name': 'Warren_Buffett', 'university': 'University of Pennsylvania', 'degree': 'University of Nebraska'}
{'name': 'Michael_Dell', 'university': None, 'degree': None}
{'name': 'Rob_Walton', 'university': 'University of Arkansas', 'degree': 'B

In [15]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

names = "../data/interim/forbes/names_for_wiki_2025-09-01.csv"

def scrape_education(name):
    url = f"https://en.wikipedia.org/wiki/{name.replace(' ', '_')}"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    education = []
    for row in soup.select("table.infobox tr"):
        header = row.find("th", class_="infobox-label")
        if header and header.get_text(strip=True) in ["Education", "Alma mater"]:
            data = row.find("td", class_="infobox-data")
            if data:
                for link in data.find_all("a"):
                    education.append(link.get_text(strip=True))
            break
    
    university = education[0] if len(education) > 0 else None
    degree = education[1] if len(education) > 1 else None
    
    return {"name": name, "university": university, "degree": degree}

# safer load
df = pd.read_csv(names)
names_array = df.iloc[:, 0].values

for name in names_array[:50]:
    print(scrape_education(name))


{'name': 'Elon_Musk', 'university': 'University of Pennsylvania', 'degree': 'BA'}
{'name': 'Larry_Ellison', 'university': 'University of Illinois, Urbana-Champaign', 'degree': 'University of Chicago'}
{'name': 'Mark_Zuckerberg', 'university': 'Harvard University', 'degree': None}
{'name': 'Jeff_Bezos', 'university': 'Princeton University', 'degree': 'BSE'}
{'name': 'Larry_Page', 'university': 'University of Michigan', 'degree': 'BSE'}
{'name': 'Sergey_Brin', 'university': 'University of Maryland, College Park', 'degree': 'BS'}
{'name': 'Bernard_Arnault', 'university': None, 'degree': None}
{'name': 'Steve_Ballmer', 'university': 'Harvard University', 'degree': 'BA'}
{'name': 'Jensen_Huang', 'university': 'Oregon State University', 'degree': 'BS'}
{'name': 'Warren_Buffett', 'university': 'University of Pennsylvania', 'degree': 'University of Nebraska'}
{'name': 'Michael_Dell', 'university': None, 'degree': None}
{'name': 'Rob_Walton', 'university': 'University of Arkansas', 'degree': 'B

In [5]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Bernard_Arnault"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/126.0 Safari/537.36"
}

resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, "html.parser")

# soup


In [28]:
import requests
from bs4 import BeautifulSoup

def scrape_education(name):
    url = f"https://en.wikipedia.org/wiki/{name}"
    headers = {"User-Agent": "Mozilla/5.0"}  # always add UA
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    education = []
    for row in soup.select("table.infobox tr"):
        header = row.find("th", class_="infobox-label")
        if header and "Education" in header.get_text():
            data = row.find("td", class_="infobox-data")
            if data:
                # grab only <a> tag text (universities + degrees)
                for link in data.find_all("a"):
                    education.append(link.get_text(strip=True))
            break
    
    return name, education[0], education[1]

scrape_education("Jeff_Bezos")    


('Jeff_Bezos', 'Princeton University', 'BSE')