In [217]:
import pandas as pd
import numpy as np
import regex as re


In [218]:
df = pd.read_csv('gubernatorial_bios.csv')

In [219]:
import pandas as pd
import numpy as np

# Ensure birth_date is datetime
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

# Only process rows where age_at_start is not NA
mask = df['age_at_start'].notna() & df['birth_date'].notna() & df['first_year'].notna()

# Extract month, day, last two digits of year
df['month'] = df['birth_date'].dt.month
df['day'] = df['birth_date'].dt.day
df['yy'] = df['birth_date'].dt.year % 100

# Compute target year
df['target'] = df['first_year'] - df['age_at_start']
df['base_century'] = (df['target'] // 100) * 100

# Candidate years
df['cand1'] = df['base_century'] - 100 + df['yy']
df['cand2'] = df['base_century'] + df['yy']
df['cand3'] = df['base_century'] + 100 + df['yy']

# Absolute differences
df['diff1'] = abs(df['cand1'] - df['target'])
df['diff2'] = abs(df['cand2'] - df['target'])
df['diff3'] = abs(df['cand3'] - df['target'])

# Choose closest candidate safely using np.select
conditions = [
    (df['diff1'] <= df[['diff1','diff2','diff3']].min(axis=1)),
    (df['diff2'] <= df[['diff1','diff2','diff3']].min(axis=1)),
    (df['diff3'] <= df[['diff1','diff2','diff3']].min(axis=1))
]
choices = [df['cand1'], df['cand2'], df['cand3']]
df['chosen_year'] = np.select(conditions, choices, default=np.nan)

# Rebuild birth_date only for rows with age_at_start
df['birth_date'] = np.where(
    mask,
    pd.to_datetime(dict(year=df['chosen_year'], month=df['month'], day=df['day']), errors='coerce'),
    df['birth_date']
)

# Drop helper columns
df.drop(columns=['month','day','yy','target','base_century','cand1','cand2','cand3','diff1','diff2','diff3','chosen_year'], inplace=True)


In [220]:
df[df['birth_date'].isna()]

Unnamed: 0.1,Unnamed: 0,state_territory,governor,party,first_year,years_in_office,school,birth_state_territory,spouse,birth_date,bio_text,college_attendance,ivy_attendance,lawyer,military_service,age_at_start,gender,born_in_state_territory,intl_born,intl_born_details
2260,2261,Virginia,Andrew Jackson Montague,Democratic,1902,1902 - 1906,"Richmond College, University of Virginia Law S...",Virginia,,NaT,"ANDREW JACKSON MONTAGUE was born in Lynchburg,...",1,0,1,0,,male,1,0,


In [221]:
# mapping = {'Luis Muñoz Marín': '1898-02-18',
#            'Kenneth Mapp' : '1955-11-02',
#            'John de Jongh, Jr.' : '1957-11-13',
#            'Charles W. Turnbull' : '1935-02-05',
#            'Roy Lester Schneider': '1939-05-13',
#            'Alexander A. Farrelly': '1923-12-29',
#            'Juan Francisco Luis': '1940-07-10',
#            'Cyril Emmanuel King': '1921-04-07',
#            'Melvin Herbert Evans': '1917-08-07',
#            'Ralph M. Paiewonsky': '1907-11-09'}

# df['birth_date'] = df['birth_date'].fillna(df['governor'].map(mapping))

In [222]:
df[df['age_at_start'].isna()]

Unnamed: 0.1,Unnamed: 0,state_territory,governor,party,first_year,years_in_office,school,birth_state_territory,spouse,birth_date,bio_text,college_attendance,ivy_attendance,lawyer,military_service,age_at_start,gender,born_in_state_territory,intl_born,intl_born_details
78,79,Alabama,William James Samford,Democratic,1900,1900 - 1901,East Alabama Male College (now Auburn Universi...,Georgia,,1944-09-16,"WILLIAM JAMES SAMFORD was born in Greenville, ...",1,0,1,1,,male,0,0,
436,437,Florida,Henry Laurens Mitchell,Democratic,1893,1893 - 1897,"system, moved to Tampa and studied law, and in...",Alabama,,1903-09-03,"HENRY LAURENS MITCHELL, Florida’s 16th governo...",1,0,1,1,,male,0,0,
794,795,Kentucky,John Adair,Democratic-Republican,1820,1820 - 1824,"system of Charlotte, North Carolina. Adair had...",South Carolina National Office(s) Served Repre...,,1840-01-09,"JOHN ADAIR was born in Chester District, Chest...",1,0,0,1,,male,0,0,
812,813,Louisiana,Sam Houston Jones,Democratic,1940,1940 - 1944,Louisiana State University,Louisiana,,1987-07-15,"SAM H. JONES was born in Merryville, Louisiana...",1,0,0,1,,male,1,0,
1067,1068,Michigan,Luren Dudley Dickinson,Republican,1939,1939 - 1941,in Michigan. Dickinson established an extensiv...,New York,,1959-04-15,"LUREN D. DICKINSON, the thirty-seventh governo...",1,0,0,0,,male,0,0,
1073,1074,Michigan,Alexander Joseph Groesbeck,Republican,1921,1921 - 1927,University of Michigan About ALEXANDER J. GROE...,Michigan,,1973-11-07,"ALEXANDER J. GROESBECK, the thirtieth governor...",1,0,1,0,,male,1,0,
1355,1356,New Hampshire,Walter Peterson,Republican,1969,1969 - 1973,Dartmouth College; College of William and Mary...,New Hampshire,,2011-06-01,"WALTER PETERSON was born in Nashua, New Hampsh...",1,1,0,0,,male,1,0,
1544,1545,New York,David Bennett Hill,Democratic,1885,1885 - 1892,,New York,,1943-08-29,"DAVID B. HILL, the thirty-second governor of N...",0,0,1,0,,male,1,0,
1877,1878,Rhode Island,Henry Smith,,1806,1905 - 1806,,Rhode Island,,1766-02-10,"HENRY SMITH was born in Providence, Rhode Isla...",0,0,0,0,,male,1,0,
1888,1889,Rhode Island,George Peabody Wetmore,Republican,1885,1885 - 1887,Yale University; Columbia University Law School,Other,,1946-08-02,"GEORGE PEABODY WETMORE was born in London, Eng...",1,1,1,0,,male,0,1,"London, England"


In [223]:
mapping_2 = {'Henry Laurens Mitchell': '1831-09-03',
             'John Adair' : '1757-01-09',
             'Sam Houston Jones' : '1897-07-15',
             'Luren Dudley Dickinson' : '1859-04-15',
             'Alexander Joseph Groesbeck' : '1873-11-07',
             'Walter Peterson' : '1922-09-19',
             'David Bennett Hill' : '1843-08-29',
             'Henry Smith' : '1766-02-10',
             'George Peabody Wetmore': '1846-08-02',
             'William James Samford' : '1844-09-16',
             'James Jr. Hamilton' : '1786-05-08',
             'William Trousdale' : '1790-09-23',
             'Charles W. Gates': '1856-01-12',
             'John Staniford Robinson' : '1804-11-10',
             'Lucius Fairchild' : '1831-12-27',
             'Lester Calloway Hunt' : '1892-07-08'}

df.loc[df['age_at_start'].isna(), 'birth_date'] = (
    df.loc[df['age_at_start'].isna(), 'governor'].map(mapping_2)
)

# Ensure consistent datetime dtype
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

In [224]:
df[df['governor'] == 'Henry Smith']
df['years_in_office'] = np.where(df['governor'] == 'Henry Smith', '1805 - 1806', df['years_in_office'])
df['first_year'] = np.where(df['governor'] == 'Henry Smith', '1805', df['first_year'])

df[df['governor'] == 'George Dewey Clyde']
df['years_in_office'] = np.where(df['governor'] == 'George Dewey Clyde', '1957 - 1965', df['years_in_office'])
df['first_year'] = np.where(df['governor'] == 'George Dewey Clyde', '1957', df['first_year'])

In [225]:
df['first_year'] = pd.to_numeric(df['first_year'], errors='coerce')
df['new_age_at_start'] = df['first_year'] - df['birth_date'].dt.year

In [226]:
df[abs(df['new_age_at_start'] - df['age_at_start']) > 1]

Unnamed: 0.1,Unnamed: 0,state_territory,governor,party,first_year,years_in_office,school,birth_state_territory,spouse,birth_date,...,college_attendance,ivy_attendance,lawyer,military_service,age_at_start,gender,born_in_state_territory,intl_born,intl_born_details,new_age_at_start
656,657,Iowa,Terry E. Branstad,Republican,1983,1983 - 1999 2011 - 2017,University of Iowa; Drake University Law School,Iowa,,1946-11-17,...,1,0,1,1,64.0,male,1,0,,37.0
1141,1142,Mississippi,Phil Bryant,Republican,2012,2012 - 2016 2016 - 2020,"Hinds Community College, University of Souther...",Mississippi,Deborah Bryant Deborah Bryant About Phil Bryan...,1954-12-09,...,1,0,0,0,61.0,male,1,0,,58.0
1710,1711,Ohio,Edward Follansbee Noyes,Republican,1832,1872 - 1832,"Dartmouth College, Cincinnati Law School",Massachusetts,,1832-10-03,...,1,1,1,1,39.0,male,0,0,,0.0
1763,1764,Oregon,John A. Kitzhaber,Democratic,1995,1995 - 2003 2011 - 2015,"Dartmouth College, University of Oregon Medica...",Washington,,1947-03-05,...,1,1,0,0,63.0,male,0,0,,48.0
2143,2144,Utah,George Dewey Clyde,Republican,1957,1957 - 1965,"Utah State Agricultural College, University of...",Utah,,1798-07-21,...,1,0,0,0,58.0,male,1,0,,159.0
2359,2360,West Virginia,Henry Mason Mathews,Democratic,1881,1887 - 1881,University of Virginia,West Virginia,,1834-03-29,...,1,0,0,1,52.0,male,1,0,,47.0


In [227]:
df.loc[df['governor'] == 'George Dewey Clyde', 'birth_date'] = pd.to_datetime('1898-07-21')
df.loc[df['governor'] == 'Isaac Wilbour', 'birth_date'] = pd.to_datetime('1763-04-25')
df.loc[df['governor'] == 'Stevens Thomson Mason', 'birth_date'] = pd.to_datetime('1760-12-29')
df.loc[df['governor'] == 'Amos Walker Barber', 'birth_date'] = pd.to_datetime('1860-07-25')
df.loc[df['governor'] == 'Johnathan Jennings', 'birth_date'] = pd.to_datetime('1784-03-27')
df.loc[df['governor'] == 'Andrew Jackson Montague', 'birth_date'] = pd.to_datetime('1862-10-03')





df['years_in_office'] = np.where(df['governor'] == 'Edward Follansbee Noyes', '1877 - 1881', df['years_in_office'])
df['first_year'] = np.where(df['governor'] == 'Edward Follansbee Noyes', '1877', df['first_year'])
df['first_year'] = pd.to_numeric(df['first_year'], errors='coerce')
df['new_age_at_start'] = df['first_year'] - df['birth_date'].dt.year
df['age_at_start'] = df['new_age_at_start']
df.drop(columns = ['new_age_at_start'], inplace = True)

In [228]:
df.sort_values('age_at_start')

Unnamed: 0.1,Unnamed: 0,state_territory,governor,party,first_year,years_in_office,school,birth_state_territory,spouse,birth_date,bio_text,college_attendance,ivy_attendance,lawyer,military_service,age_at_start,gender,born_in_state_territory,intl_born,intl_born_details
833,834,Louisiana,Henry Clay Warmoth,Republican,1868,1868 - 1872,"system of his native state. He studied law, wa...",Illinois,,1842-05-09,"HENRY C. WARMOTH was born in Mc Leansboro, Ill...",1,0,1,1,26,male,0,0,
522,523,Georgia,George Walton,"Whig, Democratic-Republican",1779,1779 - 1780,,Virginia,,1749-01-01,"GEORGE WALTON, the youngest signer of the Decl...",0,0,1,1,30,male,0,0,
976,977,Maryland,Edward Lloyd,Democratic-Republican,1809,1809 - 1811,,Maryland,,1779-07-22,"EDWARD LLOYD was born in Talbot County, Maryla...",0,0,0,1,30,male,1,0,
1453,1454,New Jersey,Leon R. Taylor,Democratic,1913,1913 - 1914,Denison University,New Jersey,,1883-10-26,"Leon R. Taylor was born in Asbury Park, New Je...",1,0,0,0,30,male,1,0,
2439,2440,Wyoming,Amos Walker Barber,Republican,1890,1890 - 1893,University of Pennsylvania,Pennsylvania,,1860-07-25,"AMOS WALKER BARBER was born in Doylestown, Pen...",1,1,0,1,30,male,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1101,1102,Michigan,Stevens Thomson Mason,Democratic,1837,1837 - 1840,Transylvania University,Virginia,,1760-12-29,"STEVENS T. MASON, the first governor of Michig...",1,0,0,0,77,male,0,0,
1387,1388,New Hampshire,Moody Currier,Republican,1885,1885 - 1887,Dartmouth College,New Hampshire,,1806-04-22,"MOODY CURRIER, the forty-ninth governor of New...",1,1,1,0,79,male,1,0,
1846,1847,Pennsylvania,Benjamin Franklin,,1785,1785 - 1788,,Massachusetts,,1706-01-17,Like others among this nation’s Founding Fathe...,0,0,0,0,79,male,0,0,
1067,1068,Michigan,Luren Dudley Dickinson,Republican,1939,1939 - 1941,in Michigan. Dickinson established an extensiv...,New York,,1859-04-15,"LUREN D. DICKINSON, the thirty-seventh governo...",1,0,0,0,80,male,0,0,


In [229]:
df[df['intl_born_details'] == 'Washington, D.C.']


df.loc[df['governor'] == 'John Davis Lodge', 'birth_state_territory'] = 'Washington, D.C.'
df.loc[df['governor'] == 'John Davis Lodge', 'intl_born'] = 0
df.loc[df['governor'] == 'John Davis Lodge', 'intl_born_details'] = np.nan



In [230]:
df.sort_values(by="school", key=lambda x: x.str.len(), ascending=False)[:100]


Unnamed: 0.1,Unnamed: 0,state_territory,governor,party,first_year,years_in_office,school,birth_state_territory,spouse,birth_date,bio_text,college_attendance,ivy_attendance,lawyer,military_service,age_at_start,gender,born_in_state_territory,intl_born,intl_born_details
458,459,Georgia,Lester Garfield Maddox,Democratic,1967,1967 - 1971,to pursue odd jobs. Maddox engaged in industri...,Georgia,,1915-09-30,"LESTER GARFIELD MADDOX was born in Atlanta, Ge...",1,0,1,0,52,male,1,0,
2270,2271,Virginia,William Smith,Democratic,1846,1864 - 1865 1846 - 1849,"near home, studied privately in Fredericksburg...",Virginia,,1797-09-06,WILLIAM SMITH was born in Marengo in King Geor...,1,0,1,1,49,male,1,0,
2077,2078,Tennessee,Andrew Johnson,Democratic,1853,1862 - 1865 1853 - 1857,and was eventually taught to read and write by...,North Carolina,,1808-12-29,"Born in Raleigh, North Carolina, ANDREW JOHNSO...",1,0,1,0,45,male,0,0,
497,498,Georgia,George Rockingham Gilmer,Democratic-Whig,1829,1837 - 1839 1829 - 1831,and at the famous Academy of Moses Waddell in ...,Georgia,,1790-04-11,GEORGE ROCKINGHAM GILMER was born near Lexingt...,1,0,1,1,39,male,1,0,
1353,1354,New Hampshire,Hugh J. Gallen,Democratic,1979,1979 - 1982,". After graduation, Gallen had a successful tr...",Oregon,,1924-07-30,"HUGH J. GALLEN was born July 30, 1924, in Port...",1,0,0,0,55,male,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1445,1446,New Jersey,Harold Giles Hoffman,Republican,1935,1935 - 1938,", where he graduated in 1913. During World War...",New Jersey,,1896-02-07,"Harold G. Hoffman was born in South Amboy, New...",1,0,0,1,39,male,1,0,
2210,2211,Vermont,Stephen Royce,Whig; Republican,1854,1854 - 1856,Middlebury College About STEPHEN ROYCE was bor...,Vermont,,1787-08-12,"STEPHEN ROYCE was born in Tinmouth, Vermont. A...",1,0,1,0,67,male,1,0,
1164,1165,Mississippi,Edmond Favor Noel,Democratic,1908,1908 - 1912,"in Louisville, Kentucky. He later studied law,...",Mississippi,,1856-03-04,"EDMOND F. NOEL, the thirty-seventh governor of...",1,0,1,0,52,male,1,0,
975,976,Maryland,Robert Bowie,Democratic-Republican,1803,1811 - 1812 1803 - 1806,in Prince George’s County and at Reverend Crad...,Maryland,,1750-03-01,ROBERT BOWIE was born at “Mattaponi” in Prince...,1,0,0,1,53,male,1,0,


In [231]:
def capital_word_ratio(s):
    if not isinstance(s, str) or len(s.strip()) == 0:
        return np.nan
    words = s.split()
    caps = sum(w[0].isupper() for w in words if w)
    return caps / len(words)

df['cap_word_ratio'] = df['school'].apply(capital_word_ratio)

In [238]:
df.sort_values(by="cap_word_ratio")[:350].to_clipboard()


In [214]:
df[df['school'].str.contains('and', na=False)].to_clipboard()

In [239]:
pattern = r"(About [A-Z]+|National Offices? Served?|National Office)"

# Keep only the part before the match
df["school_cleaned"] = df["school"].str.split(pattern, regex=True).str[0].str.strip()



In [240]:
df['cap_word_ratio'] = df['school_cleaned'].apply(capital_word_ratio)

In [241]:
df.sort_values(by="cap_word_ratio")[:350].to_clipboard()


In [244]:
df['school_cleaned'] = np.where(df['cap_word_ratio'] < 0.6, np.nan, df['school_cleaned'])

In [245]:
school_mapping = {
    'William Paca' : 'College of Philadelphia (now the University of Pennsylvania)',
    'Frank W. Benson' : 'University of the Pacific',
    'Thomas Mifflin' : 'College of Philadelphia (University of Pennsylvania)',
    'Thomas Mann Randolph' : 'College of William and Mary and University of Edinburgh',
    'Matt Meyer' : 'Brown University (BA); University of Michigan (JD)',
    'Thomas Bahnson Stanley': np.nan
}

df['school_cleaned'] = df.apply(
    lambda row: school_mapping.get(row['governor'], row['school_cleaned']), axis=1
)

In [248]:
df['school'] = df['school_cleaned']
df.drop(columns = ['school_cleaned'], inplace = True)

In [249]:
df['college_attendance'] = np.where(df['school'].isna(), 0, 1)


In [251]:
ivy_keywords = [
    "harvard",
    "yale",
    "princeton",
    "columbia",
    "university of pennsylvania",
    "upenn",
    "dartmouth",
    "cornell",
    "brown university"
]

# Make a single regex pattern joining all keywords, ignoring case
ivy_pattern = '|'.join(ivy_keywords)

# Check if 'school_cleaned' contains any of the Ivy keywords
df['ivy_attendance'] = np.where(
    df['school'].str.lower().str.contains(ivy_pattern, regex=True, na=False), 
    1, 
    0
)

In [253]:
df.columns

Index(['Unnamed: 0', 'state_territory', 'governor', 'party', 'first_year',
       'years_in_office', 'school', 'birth_state_territory', 'spouse',
       'birth_date', 'bio_text', 'college_attendance', 'ivy_attendance',
       'lawyer', 'military_service', 'age_at_start', 'gender',
       'born_in_state_territory', 'intl_born', 'intl_born_details',
       'cap_word_ratio'],
      dtype='object')

In [254]:
df = df[['state_territory', 'governor', 'party', 'first_year',
       'years_in_office', 'school', 'birth_state_territory',
       'birth_date', 'bio_text', 'college_attendance', 'ivy_attendance',
       'lawyer', 'military_service', 'age_at_start', 'gender',
       'born_in_state_territory', 'intl_born', 'intl_born_details']]

In [255]:
df.to_csv('gubernatorial_bios_final.csv')