Attorney General Webscrape Module for group data. <br>
[x] Get all current AG by state and political affiliation in a dataframe. <br>
[x] Get all former AG by state <br>
[ ] Look up each former AG's political affiliation via a Wikipedia search on their name.

In [1]:
#imports
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import wikipedia as wk

# State Attorneys General
## Current Attorneys General by state and party

In [2]:
#URL and Response object instance
# Fetch the Wikipedia "State attorney general" article; the response body is
# parsed with BeautifulSoup in a later cell.
url = 'https://en.wikipedia.org/wiki/State_attorney_general' 
res = requests.get(url)
# Last expression: display the type of the response object as the cell output.
type(res)

requests.models.Response

In [3]:
res.status_code

200

In [4]:
#soup object on response data
# Parse the fetched page and collect the <tr> rows of the first table whose
# class attribute includes "wikitable".
soup = BeautifulSoup(res.content, 'html.parser')
# NOTE(review): table_class is assigned but not referenced anywhere in the
# visible notebook; the lookup below matches on 'wikitable' only.
table_class= 'wikitable sortable jquery-tablesorter'
table = soup.find('table', attrs={'class':'wikitable'})
trs  = table.find_all('tr')
# Last expression: display the type of the row collection as the cell output.
type(trs)

bs4.element.ResultSet

In [5]:
# Flatten each table row into a list of cell strings: header (<th>) cells
# first, then data (<td>) cells, with newline characters removed.
inp = [
    [
        str(cell.text).replace("\n", "")
        for cell in row.find_all('th') + row.find_all('td')
    ]
    for row in trs
]

In [6]:
sag = pd.DataFrame(inp[1:], columns = inp[0])

In [7]:
sag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Officeholder    56 non-null     object
 1   State           56 non-null     object
 2   Party           56 non-null     object
 3   Assumed office  56 non-null     object
 4   Term expires    56 non-null     object
 5   Law school      56 non-null     object
 6   Term limits     56 non-null     object
dtypes: object(7)
memory usage: 3.2+ KB


In [8]:
#drop non_state rows from set. Should only have 51 rows. 
# Territories to exclude; the 50 states plus the District of Columbia remain.
non_states = [
    'American Samoa',
    'Guam',
    'Puerto Rico',
    'U.S. Virgin Islands',
    'Northern Mariana Islands',
]
# The scraped names can carry citation markers (e.g. "Puerto Rico[7]") whose
# number changes whenever the article is edited, so compare on a copy of the
# name with any bracketed marker removed. The 'State' column is left untouched.
state_names = sag['State'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
sag = sag[~state_names.isin(non_states)]
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
sag = sag.drop(columns = ['Law school', 'Term limits'], errors = 'ignore')

In [9]:
sag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 55
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Officeholder    51 non-null     object
 1   State           51 non-null     object
 2   Party           51 non-null     object
 3   Assumed office  51 non-null     object
 4   Term expires    51 non-null     object
dtypes: object(5)
memory usage: 2.4+ KB


In [10]:
# list of states
lst_states = list(sag['State'])
lst_states

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

## Former Attorneys General

In [11]:
def get_ag(state):
    '''
    Scrape the NAAG "former attorneys general" table for one state.
    Meant to be run in a loop over the list of states.

    Parameters
    ----------
    state : str
        State name, interpolated into the NAAG URL. Louisiana and Connecticut
        use hard-coded URLs because their page slugs differ from the pattern.

    Returns
    -------
    pandas.DataFrame
        Columns ['State', 'Name', 'Term'], one row per <tr> of the page's
        first <tbody>. If the page cannot be parsed into that shape, a message
        is printed and an empty frame with the same columns is returned, so the
        caller can still concatenate the results.
    '''
    if state == 'Louisiana':
        url_state = 'https://www.naag.org/attorneys-general/past-attorneys-general/louisiana-former-attorney-general/'

    elif state == 'Connecticut':
        url_state = 'https://www.naag.org/attorneys-general/past-attorneys-general/connecticutformer-attorneys-general/'

    else:
        url_state = f'https://www.naag.org/attorneys-general/past-attorneys-general/{state}-former-attorneys-general/'

    print(f'Working on {state} to import to list')

    # Pause between requests so the loop does not hammer the site.
    time.sleep(1)
    res2 = requests.get(url_state, headers={'User-Agent': 'Mozilla/5.0'})
    # Report the status of THIS request (res2), not the earlier Wikipedia response.
    print('Code', res2.status_code)

    columns = ['State', 'Name', 'Term']
    try:
        soup2 = BeautifulSoup(res2.content, 'html.parser')
        table2 = soup2.find('tbody')
        # Each output row is the state name followed by the text of every <td>.
        rows = [
            [f'{state}'] + [td.text for td in tr.find_all('td')]
            for tr in table2.find_all('tr')
        ]
        states_df = pd.DataFrame(rows, columns = columns)
    except Exception as err:
        # Typical causes: no <tbody> on the page (find returns None) or rows
        # that do not have exactly two cells. Exception (not a bare except)
        # lets KeyboardInterrupt stop a long scrape.
        print(f'**************** There was an error with {state}, please correct ({err!r}) ***************')
        states_df = pd.DataFrame(columns = columns)
    return states_df

In [12]:
# Scrape every state in lst_states and collect the per-state frames in a list;
# they are concatenated into one DataFrame in a later cell.
main = []
for state in lst_states:
    out = get_ag(state)
    main.append(out)
    print(f'{state} to import completed')

Working on Alabama to import to list
Code 200
Alabama to import completed
Working on Alaska to import to list
Code 200
Alaska to import completed
Working on Arizona to import to list
Code 200
Arizona to import completed
Working on Arkansas to import to list
Code 200
Arkansas to import completed
Working on California to import to list
Code 200
California to import completed
Working on Colorado to import to list
Code 200
Colorado to import completed
Working on Connecticut to import to list
Code 200
Connecticut to import completed
Working on Delaware to import to list
Code 200
Delaware to import completed
Working on District of Columbia to import to list
Code 200
District of Columbia to import completed
Working on Florida to import to list
Code 200
Florida to import completed
Working on Georgia to import to list
Code 200
Georgia to import completed
Working on Hawaii to import to list
Code 200
Hawaii to import completed
Working on Idaho to import to list
Code 200
Idaho to import completed


In [13]:
former_ag = pd.concat(main, ignore_index = True)

In [14]:
# Split "start – end" on the en dash followed by a space. Terms written with a
# plain hyphen or without that space stay whole in 'start of term' (with a null
# 'end of term') and are handled individually in the cells below.
# A previous split on '–' alone was immediately overwritten by this one, so it
# has been removed.
former_ag[['start of term', 'end of term']] = former_ag['Term'].str.split('– ', expand = True)

In [15]:
former_ag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2068 entries, 0 to 2067
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          2068 non-null   object
 1   Name           2068 non-null   object
 2   Term           2068 non-null   object
 3   start of term  2068 non-null   object
 4   end of term    1938 non-null   object
dtypes: object(5)
memory usage: 80.9+ KB


The output needs cleaning. Some states appoint or otherwise seat an Attorney General, so a few term strings contain extra text. All term strings need to be turned into numbers; I will do this in a function with a try/except block to handle the odd strings here and there. A number of people have no end date, so that will need a look too.

In [16]:
def to_num(string):
    '''
    Convert a value to an int if possible.

    Parameters
    ----------
    string : object
        Usually a year string such as "2010" (surrounding whitespace is fine).

    Returns
    -------
    int or None
        The converted integer. If the value cannot be converted, a message
        naming the offending value is printed and None is returned (pandas
        shows this as a missing value when the function is used with .apply).
    '''
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError: text such
        # as "1991 -1999" or NaN; OverflowError: infinite floats.
        print(f'the string {string} could not be converted')
        return None

In [17]:
former_ag['start of term'].apply(to_num)

the string 1991 -1999 could not be converted
the string 1892 -1895 could not be converted
the string 1815-1817 could not be converted
the string  could not be converted
the string 1981- 1984 could not be converted
the string October 2006 could not be converted
the string Appointed: 1926 could not be converted
the string Appointed: 2013, Elected: 2014  could not be converted


0       2010.0
1       2004.0
2       1997.0
3       1995.0
4       1991.0
         ...  
2063    1905.0
2064    1898.0
2065    1895.0
2066    1891.0
2067    1886.0
Name: start of term, Length: 2068, dtype: float64

Investigate each of the strange ones: correcting the ones that are wrong, based on Wikipedia, and dropping the ones older than 1979

In [18]:
former_ag[former_ag['start of term'] == '1991 -1999']

Unnamed: 0,State,Name,Term,start of term,end of term
130,Arkansas,Winston Bryant,1991 -1999,1991 -1999,


In [19]:
# Row label 130 is the Arkansas entry whose term "1991 -1999" was not split.
# Write through a single .loc call: chained .iloc[...][...] assigns to a
# temporary row object and is not guaranteed to update the frame.
former_ag.loc[130, 'start of term'] = 1991
former_ag.loc[130, 'end of term'] = 1999

In [20]:
former_ag[former_ag['start of term'] == '1981- 1984']

Unnamed: 0,State,Name,Term,start of term,end of term
1229,New Hampshire,Stephen E Merrill,1981- 1984,1981- 1984,


In [21]:
# Row label 1229 is the New Hampshire entry scraped as "1981- 1984". The years
# written here intentionally differ from the scraped text (the notebook states
# wrong entries are corrected based on Wikipedia).
# Single .loc call instead of chained .iloc[...][...], which assigns to a
# temporary row object and is not guaranteed to update the frame.
former_ag.loc[1229, 'start of term'] = 1985
former_ag.loc[1229, 'end of term'] = 1989

In [22]:
former_ag[former_ag['start of term'] == 'October 2006']

Unnamed: 0,State,Name,Term,start of term,end of term
1829,Tennessee,Michael E. Moore,October 2006,October 2006,


In [23]:
# Row label 1829 is the Tennessee entry scraped with the term "October 2006";
# the name and term are replaced here as a manual correction.
# Single .loc call instead of chained .iloc[...][...], which assigns to a
# temporary row object and is not guaranteed to update the frame.
former_ag.loc[1829, 'Name'] = 'Robert E. Cooper, Jr.'
former_ag.loc[1829, 'start of term'] = 2006
former_ag.loc[1829, 'end of term'] = 2014

In [24]:
former_ag[former_ag['start of term'] == 'Appointed: 2013, Elected: 2014 ']

Unnamed: 0,State,Name,Term,start of term,end of term
1865,Utah,Sean Reyes,"Appointed: 2013, Elected: 2014 – Present","Appointed: 2013, Elected: 2014",Present


In [25]:
sag[sag['State'] == 'Utah']

Unnamed: 0,Officeholder,State,Party,Assumed office,Term expires
49,Sean Reyes,Utah,Republican,"December 30, 2013",2025


In [26]:
#reyes is a current AG
former_ag = former_ag.drop(1865)

In [27]:
former_ag[former_ag['start of term'] == '1892 -1895']

Unnamed: 0,State,Name,Term,start of term,end of term
306,Delaware,John R. Nicholson,1892 -1895,1892 -1895,


In [28]:
former_ag = former_ag.drop(306)

In [29]:
former_ag[former_ag['start of term'] == '1815-1817']

Unnamed: 0,State,Name,Term,start of term,end of term
808,Louisiana,Etienne Mazareu,1815-1817,1815-1817,


In [30]:
former_ag = former_ag.drop(808)

In [31]:
former_ag[former_ag['start of term'] == 'Appointed: 1926']

Unnamed: 0,State,Name,Term,start of term,end of term
1843,Tennessee,Charles L. Cornelius,Appointed: 1926,Appointed: 1926,


In [32]:
former_ag = former_ag.drop(1843)

In [33]:
former_ag[former_ag['start of term'] == '']

Unnamed: 0,State,Name,Term,start of term,end of term
946,Massachusetts,Office Abolished 1843 – 1849,,,


In [34]:
former_ag = former_ag.drop(946)

In [35]:
former_ag['start of term'] = former_ag['start of term'].apply(to_num)

In [36]:
former_ag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2063 entries, 0 to 2067
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          2063 non-null   object
 1   Name           2063 non-null   object
 2   Term           2063 non-null   object
 3   start of term  2063 non-null   int64 
 4   end of term    1940 non-null   object
dtypes: int64(1), object(4)
memory usage: 96.7+ KB


---

Now to adjust the end of term series

In [37]:
former_ag['end of term'].apply(to_num)

the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not

0       2017.0
1       2010.0
2       2004.0
3       1997.0
4       1995.0
         ...  
2063    1911.0
2064    1905.0
2065    1898.0
2066    1895.0
2067    1891.0
Name: end of term, Length: 2063, dtype: float64

In [38]:
former_ag[former_ag['end of term'] == 'Present']

Unnamed: 0,State,Name,Term,start of term,end of term
525,Idaho,Lawrence Wasden,2003 – Present,2003,Present
967,Michigan,Dana Nessel,2019 – Present,2019,Present
1251,New Jersey,Gurbir S. Grewal,2018 – Present,2018,Present
1314,New Mexico,Hector Balderas,2015 – Present,2015,Present
1345,New York,Letitia James,2019 – Present,2019,Present
1487,Ohio,Dave Yost,2019 – Present,2019,Present
1538,Oklahoma,Mike Hunter,2017 – Present,2017,Present
1556,Oregon,Ellen Rosenblum,2012 – Present,2012,Present
1573,Pennsylvania,Josh Shapiro,2017 – Present,2017,Present
1662,Rhode Island,Peter F. Neronha,2019 – Present,2019,Present


In [39]:
sag[sag['State'] == 'New Jersey']

Unnamed: 0,Officeholder,State,Party,Assumed office,Term expires
32,Andrew BruckActing,New Jersey,Democratic,"July 19, 2021",Appointed


In [40]:
#all entries are current AG and exist on our SAG dataset
# Drop every row whose term is still open ("Present") by value rather than by
# a hard-coded list of index labels: labels shift whenever the scraped pages
# change, and drop() raises KeyError on a label that no longer exists.
# Rows with a missing 'end of term' are kept here; they are handled by the
# dropna() step that follows.
# NOTE(review): New Jersey's "Present" row did not match the current
# officeholder in sag when this was checked -- re-verify after a fresh scrape.
former_ag = former_ag[former_ag['end of term'] != 'Present']

In [41]:
former_ag['end of term'].apply(to_num)

the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not be converted
the string None could not

0       2017.0
1       2010.0
2       2004.0
3       1997.0
4       1995.0
         ...  
2063    1911.0
2064    1905.0
2065    1898.0
2066    1895.0
2067    1891.0
Name: end of term, Length: 2043, dtype: float64

In [42]:
former_ag = former_ag.dropna()

In [43]:
former_ag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1920 entries, 0 to 2067
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          1920 non-null   object
 1   Name           1920 non-null   object
 2   Term           1920 non-null   object
 3   start of term  1920 non-null   int64 
 4   end of term    1920 non-null   object
dtypes: int64(1), object(4)
memory usage: 90.0+ KB


In [44]:
former_ag['start of term'] = former_ag['start of term'].apply(to_num)
former_ag['end of term'] = former_ag['end of term'].apply(to_num)

In [45]:
former_ag.head()

Unnamed: 0,State,Name,Term,start of term,end of term
0,Alabama,Luther Strange,2010 – 2017,2010,2017
1,Alabama,Troy King,2004 – 2010,2004,2010
2,Alabama,Bill Pryor,1997 – 2004,1997,2004
3,Alabama,Jeff Sessions,1995 – 1997,1995,1997
4,Alabama,Jimmy Evans,1991 – 1995,1991,1995


In [46]:
former_ag['start of term']

0       2010
1       2004
2       1997
3       1995
4       1991
        ... 
2063    1905
2064    1898
2065    1895
2066    1891
2067    1886
Name: start of term, Length: 1920, dtype: int64

In [47]:
#create large table for each year
#start with a slice
test = former_ag[:30].copy()

In [48]:
test['years in office'] = test['end of term'] - test['start of term']

In [49]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 30
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   State            30 non-null     object
 1   Name             30 non-null     object
 2   Term             30 non-null     object
 3   start of term    30 non-null     int64 
 4   end of term      30 non-null     int64 
 5   years in office  30 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 1.6+ KB


In [50]:
test.head()

Unnamed: 0,State,Name,Term,start of term,end of term,years in office
0,Alabama,Luther Strange,2010 – 2017,2010,2017,7
1,Alabama,Troy King,2004 – 2010,2004,2010,6
2,Alabama,Bill Pryor,1997 – 2004,1997,2004,7
3,Alabama,Jeff Sessions,1995 – 1997,1995,1997,2
4,Alabama,Jimmy Evans,1991 – 1995,1991,1995,4


In [51]:
# Keep former AGs whose term started in 1965 or later.
# NOTE(review): the variable name says 1979 but the threshold used is 1965 --
# confirm which cutoff is intended.
ag_since_1979 = former_ag[former_ag['start of term'] >= 1965]

In [52]:
ag_since_1979.head(30)

Unnamed: 0,State,Name,Term,start of term,end of term
0,Alabama,Luther Strange,2010 – 2017,2010,2017
1,Alabama,Troy King,2004 – 2010,2004,2010
2,Alabama,Bill Pryor,1997 – 2004,1997,2004
3,Alabama,Jeff Sessions,1995 – 1997,1995,1997
4,Alabama,Jimmy Evans,1991 – 1995,1991,1995
5,Alabama,Don Siegelman,1987 – 1991,1987,1991
6,Alabama,Charles A. Graddick,1979 – 1987,1979,1987
7,Alabama,William J. Baxley,1971 – 1979,1971,1979
8,Alabama,MacDonald Gallion,1967 – 1971,1967,1971
47,Alaska,Kevin G. Clarkson,2018 – 2020,2018,2020


In [53]:
ag_since_1979.tail(30)

Unnamed: 0,State,Name,Term,start of term,end of term
1922,Virginia,Andrew P. Miller,1970 – 1977,1970,1977
1954,Washington,Rob McKenna,2005 – 2013,2005,2013
1955,Washington,Christine Gregoire,1993 – 2004,1993,2004
1956,Washington,Ken Eikenberry,1981 – 1992,1981,1992
1957,Washington,Slade Gorton,1969 – 1980,1969,1980
1972,West Virginia,"Darrell V. McGraw, Jr.",1993 – 2013,1993,2013
1973,West Virginia,Mario J. Palumbo,1991 – 1993,1991,1993
1974,West Virginia,Roger Tompkins,1989 – 1991,1989,1991
1975,West Virginia,Charles G. Brown,1985 – 1989,1985,1989
1976,West Virginia,Chauncey H. Browning,1969 – 1985,1969,1985


In [54]:
sag.sort_values(by='State')['Party'].value_counts()

Republican    26
Democratic    25
Name: Party, dtype: int64

In [55]:
#setup wiki utility to call for each row. 
def get_party(name):
    '''
    Look up a person's political party from their Wikipedia page categories.
    Intended to be used as df['Name'].apply(get_party) to fill a party column.

    Parameters
    ----------
    name : str
        The person's name; "<name> attorney general" is used as the page title
        to look up.

    Returns
    -------
    str or float
        'Rep' if a page category contains "Republicans", 'Dem' if one contains
        "Democrats" (the first matching category wins), otherwise np.nan. NaN
        is returned both when the lookup fails and when no category matches, so
        missing values are represented consistently.
    '''
    try:
        ag = wk.page(f'{name} attorney general', redirect = True)
        for cat in ag.categories:
            if 'Republicans' in cat:
                return 'Rep'
            if 'Democrats' in cat:
                return 'Dem'
    except Exception:
        # Any lookup failure counts as "party unknown". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt stop a long
        # .apply() run.
        return np.nan
    # The page was found but none of its categories names a party.
    return np.nan


In [56]:
print(test.iloc[1]['Name'], test.iloc[1]['State'])

Troy King Alabama


In [57]:
get_party(test.iloc[1]['Name'])

'Rep'

In [58]:
test['party'] = test['Name'].apply(get_party)



  lis = BeautifulSoup(html).find_all('li')


In [59]:
test['party'].value_counts()

Dem    10
Rep     5
Name: party, dtype: int64

In [60]:
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 0 to 30
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   State            30 non-null     object
 1   Name             30 non-null     object
 2   Term             30 non-null     object
 3   start of term    30 non-null     int64 
 4   end of term      30 non-null     int64 
 5   years in office  30 non-null     int64 
 6   party            15 non-null     object
dtypes: int64(3), object(4)
memory usage: 1.9+ KB


In [64]:
ag_since_1960 = former_ag[former_ag['start of term']>1960].copy()

In [65]:
ag_since_1960.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 2048
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          474 non-null    object
 1   Name           474 non-null    object
 2   Term           474 non-null    object
 3   start of term  474 non-null    int64 
 4   end of term    474 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 22.2+ KB


In [66]:
ag_since_1960['party'] = np.nan

In [67]:
ag_since_1960.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 2048
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State          474 non-null    object 
 1   Name           474 non-null    object 
 2   Term           474 non-null    object 
 3   start of term  474 non-null    int64  
 4   end of term    474 non-null    int64  
 5   party          0 non-null      float64
dtypes: float64(1), int64(2), object(3)
memory usage: 25.9+ KB


In [68]:
ag_since_1960['party'] = ag_since_1960['Name'].apply(get_party)



  lis = BeautifulSoup(html).find_all('li')


In [69]:
ag_since_1960.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 2048
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          474 non-null    object
 1   Name           474 non-null    object
 2   Term           474 non-null    object
 3   start of term  474 non-null    int64 
 4   end of term    474 non-null    int64 
 5   party          275 non-null    object
dtypes: int64(2), object(4)
memory usage: 25.9+ KB


In [70]:
ag_since_1960['party'].value_counts()

Dem    152
Rep    123
Name: party, dtype: int64

In [83]:
ag_since_1960[ag_since_1960['party'].isnull()]['State'].value_counts()

District of Columbia    20
Alaska                  17
Pennsylvania            11
New Jersey              10
Hawaii                  10
New Hampshire            9
Tennessee                9
Wyoming                  8
Nevada                   6
Maine                    6
Indiana                  5
Utah                     5
Oregon                   4
Nebraska                 4
Montana                  4
Kentucky                 4
Ohio                     4
Arkansas                 4
Arizona                  4
Illinois                 3
Idaho                    3
Vermont                  3
Florida                  3
California               3
New Mexico               3
South Dakota             2
South Carolina           2
Rhode Island             2
Virginia                 2
West Virginia            2
Wisconsin                2
Texas                    2
Missouri                 2
North Carolina           2
Maryland                 2
Kansas                   2
Iowa                     2
G

In [None]:
ag_since_1960[ag_since_1960['party'].isnull()]['State'].value_counts()

In [79]:
ag_since_1960[ag_since_1960['State']== 'Massachusetts']

Unnamed: 0,State,Name,Term,start of term,end of term,party
908,Massachusetts,Martha Coakley,2007 – 2015,2007,2015,Dem
909,Massachusetts,Thomas Reilly,1999 – 2007,1999,2007,Dem
910,Massachusetts,L. Scott Harshbarger,1991 – 1999,1991,1999,
911,Massachusetts,James M. Shannon,1987 – 1991,1987,1991,Dem
912,Massachusetts,Francis X. Bellotti,1975 – 1987,1975,1987,Dem
913,Massachusetts,Robert H. Quinn,1969 – 1975,1969,1975,
914,Massachusetts,Elliot L. Richardson,1967 – 1969,1967,1969,Rep
916,Massachusetts,Edward W. Brooke,1963 – 1967,1963,1967,Rep


In [78]:
ag_since_1960[ag_since_1960['State']== 'Massachusetts']['party'].mode().iloc[0]

'Dem'

In [84]:
# Fill Massachusetts' missing party labels with the state's most common known
# party. Only the 'party' column is written, so a null in any other column is
# never overwritten with a party label.
ma_rows = ag_since_1960['State'] == 'Massachusetts'
ma_party_mode = ag_since_1960.loc[ma_rows, 'party'].mode().iloc[0]
ag_since_1960.loc[ma_rows & ag_since_1960['party'].isna(), 'party'] = ma_party_mode

In [85]:
ag_since_1960[ag_since_1960['State']== 'Massachusetts']

Unnamed: 0,State,Name,Term,start of term,end of term,party
908,Massachusetts,Martha Coakley,2007 – 2015,2007,2015,Dem
909,Massachusetts,Thomas Reilly,1999 – 2007,1999,2007,Dem
910,Massachusetts,L. Scott Harshbarger,1991 – 1999,1991,1999,Dem
911,Massachusetts,James M. Shannon,1987 – 1991,1987,1991,Dem
912,Massachusetts,Francis X. Bellotti,1975 – 1987,1975,1987,Dem
913,Massachusetts,Robert H. Quinn,1969 – 1975,1969,1975,Dem
914,Massachusetts,Elliot L. Richardson,1967 – 1969,1967,1969,Rep
916,Massachusetts,Edward W. Brooke,1963 – 1967,1963,1967,Rep


In [86]:
ag_since_1960.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 2048
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          474 non-null    object
 1   Name           474 non-null    object
 2   Term           474 non-null    object
 3   start of term  474 non-null    int64 
 4   end of term    474 non-null    int64 
 5   party          277 non-null    object
dtypes: int64(2), object(4)
memory usage: 25.9+ KB


In [90]:
def impute_party(states, df):
    '''
    Impute missing political party by the mode of each state's known AG parties.

    Parameters
    ----------
    states : iterable of str
        State names to process.
    df : pandas.DataFrame
        Must have 'State' and 'party' columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same df object, with missing 'party' values filled per state.
        States that have no rows, or no known party to take a mode from, are
        left unchanged. On a tie the first mode in sorted order is used.
    '''
    for state in states:
        in_state = df['State'] == f'{state}'
        known = df.loc[in_state, 'party'].dropna()
        if known.empty:
            # Nothing to take a mode from; .mode().iloc[0] would raise IndexError.
            continue
        most_common = known.mode().iloc[0]
        # Write only the 'party' column so nulls elsewhere are not filled with a label.
        df.loc[in_state & df['party'].isna(), 'party'] = most_common
    return df
    
    

In [91]:
clean_ag = impute_party(lst_states, ag_since_1960)

In [92]:
clean_ag.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 474 entries, 0 to 2048
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   State          474 non-null    object
 1   Name           474 non-null    object
 2   Term           474 non-null    object
 3   start of term  474 non-null    int64 
 4   end of term    474 non-null    int64 
 5   party          474 non-null    object
dtypes: int64(2), object(4)
memory usage: 25.9+ KB


In [93]:
clean_ag['party'].value_counts()

Dem    255
Rep    219
Name: party, dtype: int64