In [117]:
import re
import pandas as pd
import glob

# Load CSV
splited_path = glob.glob("../data/external/web_scraped/wiki_university_degree_*.csv")[0]
df = pd.read_csv(splited_path)

pattern = re.compile(r"([^()]+?)\s*\(([^()]+)\)")

def extract_pairs(edu):
    if pd.isna(edu):
        return [(None, None)] * 3
    s = str(edu).replace("\xa0", " ").strip()
    if not s:
        return [(None, None)] * 3

    matches = pattern.findall(s)
    if matches:
        pairs = [(u.strip(" ,;"), d.strip()) for u, d in matches[:3]]
    else:
        # No parentheses: treat entire string as university_1
        uni = s.strip(" ,;") or None
        pairs = [(uni, None)]

    while len(pairs) < 3:
        pairs.append((None, None))
    return pairs

# Apply and expand
parsed = df["education"].apply(extract_pairs)
for i in range(3):
    df[f"university_{i+1}"] = parsed.apply(lambda lst: lst[i][0])
    df[f"degree_{i+1}"]     = parsed.apply(lambda lst: lst[i][1])

# Check the problematic row (index 6 in your preview)
# df.loc[:35, ["forbes_name", "education","university_1","degree_1","university_2","degree_2","university_3","degree_3"]]
df.head(35)

Unnamed: 0,forbes_name,clean_name,education,university_1,degree_1,university_2,degree_2,university_3,degree_3
0,Elon Musk,Elon Musk,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS",,,,
1,Larry Ellison,Larry Ellison,"University of Illinois, Urbana-Champaign (no d...","University of Illinois, Urbana-Champaign",no degree,University of Chicago,no degree,,
2,Mark Zuckerberg,Mark Zuckerberg,Harvard University (dropped out),Harvard University,dropped out,,,,
3,Jeff Bezos,Jeff Bezos,Princeton University ( BSE ),Princeton University,BSE,,,,
4,Larry Page,Larry Page,University of Michigan ( BSE ) Stanford Univer...,University of Michigan,BSE,Stanford University,MS,,
5,Sergey Brin,Sergey Brin,"University of Maryland, College Park ( BS ) St...","University of Maryland, College Park",BS,Stanford University,MS,,
6,Bernard Arnault & family,Bernard Arnault,École polytechnique,École polytechnique,,,,,
7,Steve Ballmer,Steve Ballmer,Harvard University ( BA ) Stanford University ...,Harvard University,BA,Stanford University,dropped out,,
8,Warren Buffett,Warren Buffett,University of Pennsylvania University of Nebra...,University of Pennsylvania University of Nebraska,BS,Columbia University,MS,,
9,Jensen Huang,Jensen Huang,Oregon State University ( BS ) Stanford Univer...,Oregon State University,BS,Stanford University,MS,,


In [68]:
import re
import pandas as pd

text = "Shanghai Jiaotong University ( BEng ) South China University of Technology ( MSc ) Institute of Physics, Chinese Academy of Sciences ( PhD )"

# Regex: everything before () as university, inside () as degree
pattern = re.compile(r"([^()]+?)\s*\(([^()]+)\)")
matches = pattern.findall(text)

# Build dict with up to 3 pairs
data = {}
for i, (uni, deg) in enumerate(matches, 1):
    data[f"university_{i}"] = [uni.strip()]
    data[f"degree_{i}"]     = [deg.strip()]

# Fill missing pairs with None
for i in range(len(matches)+1, 4):
    data[f"university_{i}"] = [None]
    data[f"degree_{i}"]     = [None]

df = pd.DataFrame(data)
df


Unnamed: 0,university_1,degree_1,university_2,degree_2,university_3,degree_3
0,Shanghai Jiaotong University,BEng,South China University of Technology,MSc,"Institute of Physics, Chinese Academy of Sciences",PhD


In [87]:
import glob
import re
import pandas as pd

# Load CSV file
splited_path = glob.glob("../data/interim/splited_edu_*.csv")[0]
df = pd.read_csv(splited_path)

# Regex pattern
pattern = re.compile(r"([^()]+?)\s*\(([^()]+)\)")

def extract_pairs(edu):
    if pd.isna(edu) or not str(edu).strip():
        return [(None, None)] * 3
    matches = pattern.findall(str(edu))
    pairs = [(u.strip(), d.strip()) for u, d in matches[:3]]
    # Pad to 3 pairs
    while len(pairs) < 3:
        pairs.append((None, None))
    return pairs

# Apply function
parsed = df["education"].apply(extract_pairs)

for i in range(3):
    df[f"university_{i+1}"] = parsed.apply(lambda lst: lst[i][0])
    df[f"degree_{i+1}"]     = parsed.apply(lambda lst: lst[i][1])

# Show first 10 rows with only relevant columns
df_preview = df.loc[:9, ["education",
                         "university_1", "degree_1",
                         "university_2", "degree_2",
                         "university_3", "degree_3"]]

df_preview


Unnamed: 0,education,university_1,degree_1,university_2,degree_2,university_3,degree_3
0,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS",,,,
1,"University of Illinois, Urbana-Champaign (no d...","University of Illinois, Urbana-Champaign",no degree,University of Chicago,no degree,,
2,Harvard University (dropped out),Harvard University,dropped out,,,,
3,Princeton University ( BSE ),Princeton University,BSE,,,,
4,University of Michigan ( BSE ) Stanford Univer...,University of Michigan,BSE,Stanford University,MS,,
5,"University of Maryland, College Park ( BS ) St...","University of Maryland, College Park",BS,Stanford University,MS,,
6,École polytechnique,,,,,,
7,Harvard University ( BA ) Stanford University ...,Harvard University,BA,Stanford University,dropped out,,
8,University of Pennsylvania University of Nebra...,University of Pennsylvania University of Nebraska,BS,Columbia University,MS,,
9,Oregon State University ( BS ) Stanford Univer...,Oregon State University,BS,Stanford University,MS,,


In [111]:
import glob

splited_path = glob.glob("../data/interim/splited_edu_*.csv")[0]

splited_path = pd.read_csv(splited_path)

# splited_path = splited_path.drop(columns=["highest_university", "clean_name", "forbes_name", "education"])

splited_path.head(35)

Unnamed: 0,forbes_name,clean_name,education,university_1,degree_1,university_2,degree_2,university_3,degree_3
0,Elon Musk,Elon Musk,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS",,,,
1,Larry Ellison,Larry Ellison,"University of Illinois, Urbana-Champaign (no d...","University of Illinois, Urbana-Champaign",no degree,University of Chicago,no degree,,
2,Mark Zuckerberg,Mark Zuckerberg,Harvard University (dropped out),Harvard University,dropped out,,,,
3,Jeff Bezos,Jeff Bezos,Princeton University ( BSE ),Princeton University,BSE,,,,
4,Larry Page,Larry Page,University of Michigan ( BSE ) Stanford Univer...,University of Michigan,BSE,Stanford University,MS,,
5,Sergey Brin,Sergey Brin,"University of Maryland, College Park ( BS ) St...","University of Maryland, College Park",BS,Stanford University,MS,,
6,Bernard Arnault & family,Bernard Arnault,École polytechnique,École polytechnique,,,,,
7,Steve Ballmer,Steve Ballmer,Harvard University ( BA ) Stanford University ...,Harvard University,BA,Stanford University,dropped out,,
8,Warren Buffett,Warren Buffett,University of Pennsylvania University of Nebra...,University of Pennsylvania University of Nebraska,BS,Columbia University,MS,,
9,Jensen Huang,Jensen Huang,Oregon State University ( BS ) Stanford Univer...,Oregon State University,BS,Stanford University,MS,,


In [109]:
import re
import pandas as pd
import numpy as np
import glob

def parse_education(edu: str):
    """
    Parse education string into up to 3 (university, degree) pairs.
    Always match 'text before ( ... )' as university, inside as degree.
    """
    if pd.isna(edu) or not str(edu).strip():
        return [(None, None)] * 3
    s = str(edu).replace("\xa0", " ")
    pairs = []
    # regex: capture (university name)(degree inside)
    # non-greedy before, then parentheses
    pattern = re.compile(r"\s*([^()]+?)\s*\(([^()]*)\)")
    for m in pattern.finditer(s):
        uni = m.group(1).strip(" ,;")
        deg = m.group(2).strip()
        pairs.append((uni, deg))
        if len(pairs) == 3:
            break
    # If nothing matched (no parentheses at all) → treat as one university, no degree
    if not pairs:
        pairs.append((s.strip(), None))
    # Pad to 3
    while len(pairs) < 3:
        pairs.append((None, None))
    return pairs

# Load data
splited_path = glob.glob("../data/interim/splited_edu_*.csv")[0]
df = pd.read_csv(splited_path)

# Parse education column
parsed = df["education"].apply(parse_education)

# Create new columns
for i in range(3):
    df[f"university_{i+1}"] = parsed.apply(lambda lst: lst[i][0])
    df[f"degree_{i+1}"] = parsed.apply(lambda lst: lst[i][1])


# df = df.drop(columns=["highest_university", "clean_name", "forbes_name", "education"])
# Output the DataFrame
df.head(35)

Unnamed: 0,forbes_name,clean_name,education,highest_university,highest_degree,university_1,degree_1,university_2,degree_2,university_3,degree_3
0,Elon Musk,Elon Musk,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS",University of Pennsylvania,"BA , BS",,,,
1,Larry Ellison,Larry Ellison,"University of Illinois, Urbana-Champaign (no d...","University of Illinois, Urbana-Champaign",no degree,"University of Illinois, Urbana-Champaign",no degree,University of Chicago,no degree,,
2,Mark Zuckerberg,Mark Zuckerberg,Harvard University (dropped out),Harvard University,dropped out,Harvard University,dropped out,,,,
3,Jeff Bezos,Jeff Bezos,Princeton University ( BSE ),Princeton University,BSE,Princeton University,BSE,,,,
4,Larry Page,Larry Page,University of Michigan ( BSE ) Stanford Univer...,Stanford University,MS,University of Michigan,BSE,Stanford University,MS,,
5,Sergey Brin,Sergey Brin,"University of Maryland, College Park ( BS ) St...",Stanford University,MS,"University of Maryland, College Park",BS,Stanford University,MS,,
6,Bernard Arnault & family,Bernard Arnault,École polytechnique,,,École polytechnique,,,,,
7,Steve Ballmer,Steve Ballmer,Harvard University ( BA ) Stanford University ...,Harvard University,BA,Harvard University,BA,Stanford University,dropped out,,
8,Warren Buffett,Warren Buffett,University of Pennsylvania University of Nebra...,Columbia University,MS,University of Pennsylvania University of Nebraska,BS,Columbia University,MS,,
9,Jensen Huang,Jensen Huang,Oregon State University ( BS ) Stanford Univer...,Stanford University,MS,Oregon State University,BS,Stanford University,MS,,


In [3]:
import pandas as pd

df_edu = pd.read_csv("../data/processed/merged_dataset.csv")

df_edu.head(10)

Unnamed: 0,rank,name,source,country,gender,age,current_worth,year,month,day,highest_university,highest_degree
0,1,Elon Musk,"Tesla, SpaceX",United States,M,54,413.1,1971.0,6.0,28.0,University of Pennsylvania,"BA , BS"
1,2,Larry Ellison,Oracle,United States,M,81,271.6,1944.0,8.0,17.0,"University of Illinois, Urbana-Champaign",no degree
2,3,Mark Zuckerberg,Facebook,United States,M,41,251.8,1984.0,5.0,14.0,Harvard University,dropped out
3,4,Jeff Bezos,Amazon,United States,M,61,237.6,1964.0,1.0,12.0,Princeton University,BSE
4,5,Larry Page,Google,United States,M,52,177.1,1973.0,3.0,26.0,Stanford University,MS
5,6,Sergey Brin,Google,United States,M,52,164.8,1973.0,8.0,21.0,Stanford University,MS
6,7,Bernard Arnault & family,LVMH,France,M,76,156.9,1949.0,3.0,5.0,,
7,8,Steve Ballmer,Microsoft,United States,M,69,152.7,1956.0,3.0,24.0,Harvard University,BA
8,9,Warren Buffett,Berkshire Hathaway,United States,M,95,149.6,1930.0,8.0,30.0,Columbia University,MS
9,10,Jensen Huang,Semiconductors,United States,M,62,148.5,1963.0,2.0,17.0,Stanford University,MS


In [39]:
import glob

edu_path = glob.glob("../data/external/web_scraped/wiki_university_degree_*.csv")[0]

df_edu = pd.read_csv(edu_path)
df_edu.head()

Unnamed: 0,forbes_name,clean_name,education
0,Elon Musk,Elon Musk,"University of Pennsylvania ( BA , BS )"
1,Larry Ellison,Larry Ellison,"University of Illinois, Urbana-Champaign (no d..."
2,Mark Zuckerberg,Mark Zuckerberg,Harvard University (dropped out)
3,Jeff Bezos,Jeff Bezos,Princeton University ( BSE )
4,Larry Page,Larry Page,University of Michigan ( BSE ) Stanford Univer...


In [21]:
import glob

# Check if the file paths exist
print("Checking file paths...")

paths = [
    "../data/interim/trimmed_*.csv",
    "../data/external/web_scraped/wiki_date_of_birth_*.csv", 
    "../data/interim/splited_edu_*.csv"
]

for path in paths:
    files = glob.glob(path)
    if files:
        print(f"✅ {path} -> Found: {files}")
    else:
        print(f"❌ {path} -> No files found")

# Also check with ../ prefix
print("\nChecking with ../ prefix:")

paths_alt = [
    "../data/interim/trimmed_*.csv",
    "../data/external/web_scraped/wiki_date_of_birth_*.csv", 
    "../data/interim/splited_edu_*.csv"
]

for path in paths_alt:
    files = glob.glob(path)
    if files:
        print(f"✅ {path} -> Found: {files}")
    else:
        print(f"❌ {path} -> No files found")

Checking file paths...
✅ ../data/interim/trimmed_*.csv -> Found: ['../data/interim/trimmed_2025-09-03.csv']
✅ ../data/external/web_scraped/wiki_date_of_birth_*.csv -> Found: ['../data/external/web_scraped/wiki_date_of_birth_2025-09-03.csv']
✅ ../data/interim/splited_edu_*.csv -> Found: ['../data/interim/splited_edu_2025-09-03.csv']

Checking with ../ prefix:
✅ ../data/interim/trimmed_*.csv -> Found: ['../data/interim/trimmed_2025-09-03.csv']
✅ ../data/external/web_scraped/wiki_date_of_birth_*.csv -> Found: ['../data/external/web_scraped/wiki_date_of_birth_2025-09-03.csv']
✅ ../data/interim/splited_edu_*.csv -> Found: ['../data/interim/splited_edu_2025-09-03.csv']


In [18]:
import pandas as pd

df_trimmed = pd.read_csv("../data/interim/forbes/trimmed_2025-09-03.csv")

df_trimmed.head()

Unnamed: 0,rank,name,source,country,gender,age,current_worth
0,1,Elon Musk,"Tesla, SpaceX",United States,M,54,413.1
1,2,Larry Ellison,Oracle,United States,M,81,271.6
2,3,Mark Zuckerberg,Facebook,United States,M,41,251.8
3,4,Jeff Bezos,Amazon,United States,M,61,237.6
4,5,Larry Page,Google,United States,M,52,177.1


In [20]:
import pandas as pd

df_birth = pd.read_csv("../data/external/web_scraped/wiki_date_of_birth_2025-09-03.csv")

# convert columns to integer
for col in ["year", "month", "day"]:
    df_birth[col] = pd.to_numeric(df_birth[col], errors="coerce").astype("Int64")

df_birth = df_birth.drop(columns=["wiki_name"])

df_birth.head()

Unnamed: 0,clean_name,year,month,day
0,Elon Musk,1971,6,28
1,Larry Ellison,1944,8,17
2,Mark Zuckerberg,1984,5,14
3,Jeff Bezos,1964,1,12
4,Larry Page,1973,3,26


In [40]:
import pandas as pd

df_edu = pd.read_csv("../data/external/web_scraped/wiki_university_degree_2025-09-02.csv")

df_edu.head()

Unnamed: 0,forbes_name,clean_name,education
0,Elon Musk,Elon Musk,"University of Pennsylvania ( BA , BS )"
1,Larry Ellison,Larry Ellison,"University of Illinois, Urbana-Champaign (no degree) University of Chicago (no degree)"
2,Mark Zuckerberg,Mark Zuckerberg,Harvard University (dropped out)
3,Jeff Bezos,Jeff Bezos,Princeton University ( BSE )
4,Larry Page,Larry Page,University of Michigan ( BSE ) Stanford University ( MS )


In [73]:
# # show full string for all columns
# pd.set_option("display.max_colwidth", None)

# df_edu["education"].head(50)

In [56]:
import pandas as pd
import re
import numpy as np

df_edu = pd.read_csv("../data/external/web_scraped/wiki_university_degree_2025-09-02.csv")

def parse_education(education_string):
    """
    Parse education string to extract universities and degrees.
    Returns lists of universities and corresponding degrees.
    """
    if pd.isna(education_string) or education_string == '':
        return [], []
    
    # Pattern to match university name followed by optional degree in parentheses
    # This handles various formats including multiple degrees and "dropped out"
    pattern = r'([^()]+?)(?:\s*\(\s*([^)]+)\s*\))?(?=\s+[A-Z]|\s*$)'
    
    matches = re.findall(pattern, education_string)
    
    universities = []
    degrees = []
    
    for match in matches:
        university = match[0].strip()
        degree = match[1].strip() if match[1] else 'No degree specified'
        
        # Skip empty universities
        if university:
            universities.append(university)
            degrees.append(degree)
    
    return universities, degrees

def create_separate_columns(df_edu):
    """
    Create separate columns for universities and degrees.
    Each person can have multiple entries.
    """
    all_data = []
    
    for idx, education in df_edu['education'].items():
        universities, degrees = parse_education(education)
        
        if not universities:  # Handle NaN or empty cases
            all_data.append({
                'original_index': idx,
                'university': None,
                'degree': None
            })
        else:
            for uni, deg in zip(universities, degrees):
                all_data.append({
                    'original_index': idx,
                    'university': uni,
                    'degree': deg
                })
    
    return pd.DataFrame(all_data)

def create_structured_columns(df_edu):
    """
    Alternative approach: Create multiple university/degree columns for the same person.
    """
    max_institutions = 0
    parsed_data = []
    
    # First pass: parse all data and find max number of institutions
    for education in df_edu['education']:
        universities, degrees = parse_education(education)
        max_institutions = max(max_institutions, len(universities))
        parsed_data.append((universities, degrees))
    
    # Create the result dataframe
    result_df = df_edu.copy()
    
    # Add columns for each university and degree
    for i in range(max_institutions):
        result_df[f'university_{i+1}'] = None
        result_df[f'degree_{i+1}'] = None
    
    # Fill the columns
    for idx, (universities, degrees) in enumerate(parsed_data):
        for i, (uni, deg) in enumerate(zip(universities, degrees)):
            result_df.loc[idx, f'university_{i+1}'] = uni
            result_df.loc[idx, f'degree_{i+1}'] = deg
    
    return result_df

def simple_parse(education_string):
    """Simple parsing for straightforward cases"""
    if pd.isna(education_string):
        return None, None
    
    # Find pattern: text followed by parentheses
    match = re.search(r'^([^(]+?)\s*\(([^)]+)\)', education_string)
    if match:
        university = match.group(1).strip()
        degree = match.group(2).strip()
        return university, degree
    else:
        # No degree in parentheses found
        return education_string.strip(), 'No degree specified'

df['university'] = df['education'].apply(lambda x: simple_parse(x)[0])
df['degree'] = df['education'].apply(lambda x: simple_parse(x)[1])

df[['education', 'university', 'degree']].head(5)

Unnamed: 0,education,university,degree
0,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS"
1,"University of Illinois, Urbana-Champaign (no degree) University of Chicago (no degree)","University of Illinois, Urbana-Champaign",no degree
2,Harvard University (dropped out),Harvard University,dropped out
3,Princeton University ( BSE ),Princeton University,BSE
4,University of Michigan ( BSE ) Stanford University ( MS ),University of Michigan,BSE


In [57]:
def split_education(df, education_col='education'):
    """
    Simple function to split education column into university and degree columns.
    Takes the first university and degree found in the string.
    """
    def parse_single(education_string):
        if pd.isna(education_string):
            return None, None
        
        # Find first university and degree pattern: "University Name (Degree)"
        match = re.search(r'^([^(]+?)\s*\(([^)]+)\)', education_string)
        if match:
            university = match.group(1).strip()
            degree = match.group(2).strip()
            return university, degree
        else:
            # No degree in parentheses, assume whole string is university
            return education_string.strip(), 'No degree specified'
    
    # Apply the parsing
    df['university'] = df[education_col].apply(lambda x: parse_single(x)[0])
    df['degree'] = df[education_col].apply(lambda x: parse_single(x)[1])
    
    return df

# Usage:
df_edu = split_education(df_edu)
df_edu[['education', 'university', 'degree']].head(5)

Unnamed: 0,education,university,degree
0,"University of Pennsylvania ( BA , BS )",University of Pennsylvania,"BA , BS"
1,"University of Illinois, Urbana-Champaign (no degree) University of Chicago (no degree)","University of Illinois, Urbana-Champaign",no degree
2,Harvard University (dropped out),Harvard University,dropped out
3,Princeton University ( BSE ),Princeton University,BSE
4,University of Michigan ( BSE ) Stanford University ( MS ),University of Michigan,BSE


In [70]:
import pandas as pd
import re

def get_highest_education(df, education_col='education'):
    """
    Extract the highest education level and corresponding university.
    Returns the university and degree with the highest academic level.
    """
    
    def parse_all_education(education_string):
        """Parse all universities and degrees from education string"""
        if pd.isna(education_string) or education_string == '':
            return []
        
        # Find all university-degree pairs
        pattern = r'([^()]+?)\s*\(\s*([^)]+)\s*\)'
        matches = re.findall(pattern, education_string)
        
        result = []
        for match in matches:
            university = match[0].strip()
            degree = match[1].strip()
            if university and degree:
                result.append((university, degree))
        
        return result
    
    def get_degree_level(degree):
        """Assign numerical level to degree (higher number = higher level)"""
        degree_lower = degree.lower()
        
        # Handle dropped out and no degree cases
        if 'dropped out' in degree_lower or 'no degree' in degree_lower:
            return 0
        
        # Doctorate level (highest)
        if any(x in degree_lower for x in ['phd', 'ph.d', 'doctorate', 'dsc', 'd.sc']):
            return 4
        
        # Professional degrees (JD, MD)
        if any(x in degree_lower for x in ['jd', 'md', 'dvm', 'pharmd']):
            return 3.5
        
        # Master's level
        if any(x in degree_lower for x in ['ms', 'ma', 'mba', 'msc', 'meng', 'm.s', 'm.a', 'master']):
            return 3
        
        # Bachelor's level
        if any(x in degree_lower for x in ['bs', 'ba', 'bsc', 'bse', 'beng', 'b.s', 'b.a', 'b.e', 'bachelor']):
            return 2
        
        # Associate or other lower degrees
        if any(x in degree_lower for x in ['as', 'aa', 'associate']):
            return 1
        
        # Unknown degree - assume some education
        return 1.5
    
    def find_highest_education(education_string):
        """Find the highest education level and return university, degree"""
        if pd.isna(education_string):
            return None, None
        
        # Parse all education entries
        edu_pairs = parse_all_education(education_string)
        
        if not edu_pairs:
            return None, None
        
        # Find the highest level
        highest_level = -1
        best_university = None
        best_degree = None
        
        for university, degree in edu_pairs:
            level = get_degree_level(degree)
            if level > highest_level:
                highest_level = level
                best_university = university
                best_degree = degree
        
        return best_university, best_degree
    
    # Apply to dataframe
    results = df[education_col].apply(find_highest_education)
    df['highest_university'] = results.apply(lambda x: x[0])
    df['highest_degree'] = results.apply(lambda x: x[1])
    
    return df

# Load your actual data
df_edu = pd.read_csv("../data/external/web_scraped/wiki_university_degree_2025-09-02.csv")

# Apply the function to your data
df_edu = get_highest_education(df_edu)

# # Display results
# print("Sample of your data with highest education extracted:")
# print(df_edu[['education', 'highest_university', 'highest_degree']].head(20))

# print(f"\nDataset shape: {df_edu.shape}")
# print(f"Records with education data: {df_edu['education'].notna().sum()}")
# print(f"Records with highest degree found: {df_edu['highest_degree'].notna().sum()}")

# # Show distribution of highest degrees
# print("\nDistribution of highest degrees:")
# print(df_edu['highest_degree'].value_counts().head(15))

# # Show some examples of complex cases
# print("\nExamples of people with multiple degrees:")
# multi_degree_mask = df_edu['education'].str.contains(r'\([^)]+\).*\([^)]+\)', na=False)
# complex_cases = df_edu[multi_degree_mask][['education', 'highest_university', 'highest_degree']].head(10)
complex_cases

Unnamed: 0,education,highest_university,highest_degree
1,"University of Illinois, Urbana-Champaign (no degree) University of Chicago (no degree)","University of Illinois, Urbana-Champaign",no degree
4,University of Michigan ( BSE ) Stanford University ( MS ),Stanford University,MS
5,"University of Maryland, College Park ( BS ) Stanford University ( MS )",Stanford University,MS
7,Harvard University ( BA ) Stanford University (dropped out),Harvard University,BA
8,University of Pennsylvania University of Nebraska ( BS ) Columbia University ( MS ),Columbia University,MS
9,Oregon State University ( BS ) Stanford University ( MS ),Stanford University,MS
11,University of Arkansas ( BS ) Columbia University ( JD ),Columbia University,JD
14,Johns Hopkins University ( BS ) Harvard University ( MBA ),Harvard University,MBA
30,Yale University ( BA ) Harvard University ( MBA ),Harvard University,MBA
34,"Shanghai Jiaotong University ( BEng ) South China University of Technology ( MSc ) Institute of Physics, Chinese Academy of Sciences ( PhD )","Institute of Physics, Chinese Academy of Sciences",PhD
