In [1]:
url = "https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election"

In [2]:
# Get the webpage and find all tables.

import requests
from bs4 import BeautifulSoup

website_url = requests.get(url).text
soup = BeautifulSoup(website_url,'html.parser')
tables = soup.find_all('table')

In [3]:
# Find the table with members returned, extract rows as a list of lists and load into dataframe

import pandas as pd

for table in tables:
    if 'Member returned' in table.text:
        headers = [header.text.strip() for header in table.find_all('th')]
        # To fix - headers returns 7 but only 5 columns 
        headers = headers[:5]
        rows = []
        table_rows = table.find_all('tr')    
        for row in table_rows:
            td = row.find_all('td')
            row = [row.text for row in td if row.text!='\n']
            rows.append(row)

In [4]:
df = pd.DataFrame(rows, columns=headers) 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Constituency                       650 non-null    object
 1   Party of incumbentbefore election  650 non-null    object
 2   Member returned                    650 non-null    object
 3   Party of incumbentafter election   650 non-null    object
 4   Notes                              650 non-null    object
dtypes: object(5)
memory usage: 25.6+ KB


In [6]:
df.head(n=5)

Unnamed: 0,Constituency,Party of incumbentbefore election,Member returned,Party of incumbentafter election,Notes
0,,,,,
1,Aberavon\n,Labour\n,Stephen Kinnock\n,Labour\n,Seat held\n
2,Aberconwy\n,Conservative\n,Robin Millar\n,Conservative\n,"Previous incumbent, Guto Bebb, did not stand\n"
3,Aberdeen North\n,Scottish National\n,Kirsty Blackman\n,Scottish National\n,Seat held\n
4,Aberdeen South\n,Conservative\n,Stephen Flynn\n,Scottish National\n,"Previous incumbent, Ross Thomson, did not stand\n"


In [7]:
df.tail(n=5)

Unnamed: 0,Constituency,Party of incumbentbefore election,Member returned,Party of incumbentafter election,Notes
647,Yeovil\n,Conservative\n,Marcus Fysh\n,Conservative\n,Seat held\n
648,Ynys Môn\n,Labour\n,Virginia Crosbie\n,Conservative\n,"Previous incumbent, Albert Owen, did not stand\n"
649,York Central\n,Labour Co-operative\n,Rachael Maskell\n,Labour Co-operative\n,Seat held\n
650,York Outer\n,Conservative\n,Julian Sturdy\n,Conservative\n,Seat held\n
651,,,,,


In [8]:
# Remove null rows at start and end and strip tailing '\n'

df = df.dropna()
df['Constituency'] = df['Constituency'].str.rstrip("\n")
df['Fullname'] = df['Member returned'].str.rstrip("\n")

In [9]:
# Check \n not elsewhere in Fullname

df[df['Fullname'].astype(str).str.contains('\n')]

Unnamed: 0,Constituency,Party of incumbentbefore election,Member returned,Party of incumbentafter election,Notes,Fullname
228,Finchley and Golders Green,Conservative\n,\nMike Freer\n\n,Conservative\n,Seat held\n,\nMike Freer
373,Mole Valley,Conservative\n,\nPaul Beresford\n\n,Conservative\n,Seat held\n,\nPaul Beresford
545,Stockport,Change UK[i]\n,\nNavendu Mishra\n\n,Labour\n,"Previous incumbent, Ann Coffey, did not stand\n",\nNavendu Mishra
546,Stockton North,Labour\n,\nAlex Cunningham\n\n,Labour\n,Seat held\n,\nAlex Cunningham
547,Stockton South,Labour\n,\nMatt Vickers\n\n,Conservative\n,"Defeated incumbent, Paul Williams\n",\nMatt Vickers
...,...,...,...,...,...,...
613,West Bromwich East,Labour\n,\nNicola Richards\n\n,Conservative\n,"Previous incumbent, Tom Watson, did not stand\n",\nNicola Richards
614,West Bromwich West,Labour Co-operative\n,\nShaun Bailey\n\n,Conservative\n,"Previous incumbent, Adrian Bailey, did not sta...",\nShaun Bailey
616,West Dunbartonshire,Scottish National\n,\nMartin Docherty-Hughes\n\n,Scottish National\n,Seat held\n,\nMartin Docherty-Hughes
619,West Suffolk,Conservative\n,\nMatthew Hancock\n\n,Conservative\n,Seat held\n,\nMatthew Hancock


In [10]:
# Check \n not elsewhere in Fullname

df[df['Constituency'].astype(str).str.contains('\n')]

Unnamed: 0,Constituency,Party of incumbentbefore election,Member returned,Party of incumbentafter election,Notes,Fullname


In [11]:
# Strip leading '\n' 

df['Fullname'] = df['Fullname'].str.lstrip("\n")

In [12]:
# Split into Firstname and compound Lastname

df['Firstname'] = df['Fullname'].str.split().str[0]
df['Lastname'] = df['Fullname'].astype(str).apply(lambda x: ' '.join(x.split()[1:]))

In [13]:
# Check for compound lastnames

df[df['Lastname'].astype(str).str.contains(' ')]['Lastname']

31       de Cordova
134    Duncan Smith
393    Marie Morris
592      Ahmad Khan
Name: Lastname, dtype: object

In [14]:
# Remove unwanted columns and check length

df = df[['Constituency','Firstname','Lastname']]
len(df)

650

In [15]:
# Download all current Members of Parliament

url = "https://www.theyworkforyou.com/mps/?f=csv"
df_mp=pd.read_csv(url, header=0)

In [16]:
# Save file to local storage
# Post next UK election when theyworkforyou update their website you may wish to 
# comment out this next line and use the raw file (captured in 2023) provided.
df_mp.to_csv('mps_raw.csv')

df_mp = pd.read_csv('mps_raw.csv')

In [17]:
df_mp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    649 non-null    int64 
 1   Person ID     649 non-null    int64 
 2   First name    649 non-null    object
 3   Last name     649 non-null    object
 4   Party         649 non-null    object
 5   Constituency  649 non-null    object
 6   URI           649 non-null    object
dtypes: int64(2), object(5)
memory usage: 35.6+ KB


In [18]:
df_mp.head(n=5)

Unnamed: 0.1,Unnamed: 0,Person ID,First name,Last name,Party,Constituency,URI
0,0,10001,Diane,Abbott,Labour,Hackney North and Stoke Newington,https://www.theyworkforyou.com/mp/10001/diane_...
1,1,25034,Debbie,Abrahams,Labour,Oldham East and Saddleworth,https://www.theyworkforyou.com/mp/25034/debbie...
2,2,24878,Nigel,Adams,Conservative,Selby and Ainsty,https://www.theyworkforyou.com/mp/24878/nigel_...
3,3,25661,Bim,Afolami,Conservative,Hitchin and Harpenden,https://www.theyworkforyou.com/mp/25661/bim_af...
4,4,11929,Adam,Afriyie,Conservative,Windsor,https://www.theyworkforyou.com/mp/11929/adam_a...


In [19]:
df_mp.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Person ID,First name,Last name,Party,Constituency,URI
644,644,11791,Jeremy,Wright,Conservative,Kenilworth and Southam,https://www.theyworkforyou.com/mp/11791/jeremy...
645,645,25649,Mohammad,Yasin,Labour,Bedford,https://www.theyworkforyou.com/mp/25649/mohamm...
646,646,25806,Jacob,Young,Conservative,Redcar,https://www.theyworkforyou.com/mp/25806/jacob_...
647,647,24822,Nadhim,Zahawi,Conservative,Stratford-on-Avon,https://www.theyworkforyou.com/mp/24822/nadhim...
648,648,25386,Daniel,Zeichner,Labour,Cambridge,https://www.theyworkforyou.com/mp/25386/daniel...


In [20]:
df_mp['Lastname']=df_mp['Last name']
df_mp['Firstname']=df_mp['First name']
df_mp = df_mp[['Constituency','Firstname','Lastname']]
len(df_mp)

649

In [21]:
# All matching columns

len(df.merge(df_mp))

599

In [22]:
# Match on First name and Last name

len(df.merge(df_mp, on=['Firstname','Lastname'] ))

624

In [23]:
# Match on Consistency and Last name

len(df.merge(df_mp, on=['Constituency','Lastname'] ))

607

In [24]:
# Match on Consistency and First name

len(df.merge(df_mp, on=['Constituency','Firstname'] ))

602

In [25]:
# Match on Lastname

len(df.merge(df_mp, on=['Lastname'] ))

982

In [26]:
# Match on Firstname

len(df.merge(df_mp, on=['Firstname'] ))

2663

In [27]:
# Match on Consistency

len(df.merge(df_mp, on=['Constituency'] ))

622

In [28]:
df_outer = df.merge(df_mp, on=['Constituency'],how="outer",indicator=True)
df_outer[df_outer['_merge']=='right_only']['Constituency'].head(n=5)

650    Birmingham, Hall Green
651      Liverpool, Wavertree
652         Sheffield, Hallam
653     Liverpool, West Derby
654    Birmingham, Hodge Hill
Name: Constituency, dtype: object

In [29]:
df_outer[df_outer['_merge']=='left_only']['Constituency'].head(n=5)

46     Birmingham Edgbaston
47     Birmingham Erdington
48    Birmingham Hall Green
49    Birmingham Hodge Hill
50      Birmingham Ladywood
Name: Constituency, dtype: object

In [30]:
# Remove commas from both dataframes

df_mp['Constituency'] = df_mp['Constituency'].str.replace(',', '')
df['Constituency'] = df['Constituency'].str.replace(',', '')

In [31]:
# Match on Consistency

len(df.merge(df_mp, on=['Constituency'] ))

649

In [32]:
# Repeat perfect match count

len(df.merge(df_mp))

624

In [33]:
# Find consitutencies where neither Firstname nor Lastname match
# These are candidates by-elections.

df_inner = df.merge(df_mp, on=['Constituency'])
df_inner[(df_inner['Firstname_x'] != df_inner['Firstname_y']) & (df_inner['Lastname_x'] != df_inner['Lastname_y'])]

Unnamed: 0,Constituency,Firstname_x,Lastname_x,Firstname_y,Lastname_y
4,Airdrie and Shotts,Neil,Gray,Anum,Qaisar
29,Batley and Spen,Tracy,Brabin,Kim,Leadbeater
47,Birmingham Erdington,Jack,Dromey,Paulette,Hamilton
130,Chesham and Amersham,Cheryl,Gillan,Sarah,Green
139,City of Chester,Chris,Matheson,Samantha,Dixon
268,Hartlepool,Mike,Hill,Jill,Mortimer
392,Newton Abbot,Anne,Marie Morris,Anne Marie,Morris
410,North Shropshire,Owen,Paterson,Helen,Morgan
432,Old Bexley and Sidcup,James,Brokenshire,Louie,French
531,Southend West,David,Amess,Anna,Firth


In [34]:
# Find constituencies where either only the Firstname matches or only the Lastname
# These are matches we should have found

df_inner = df.merge(df_mp, on=['Constituency'])
df_inner[(df_inner['Firstname_x'] == df_inner['Firstname_y']) & (df_inner['Lastname_x'] != df_inner['Lastname_y']) |
         (df_inner['Firstname_x'] != df_inner['Firstname_y']) & (df_inner['Lastname_x'] == df_inner['Lastname_y'])]

Unnamed: 0,Constituency,Firstname_x,Lastname_x,Firstname_y,Lastname_y
46,Birmingham Edgbaston,Preet,Gill,Preet Kaur,Gill
99,Burton,Kate,Griffiths,Kate,Kniveton
122,Central Suffolk and North Ipswich,Dan,Poulter,Daniel,Poulter
272,Hayes and Harlington,John,McDonnell,John Martin,McDonnell
311,Kingston upon Hull North,Diana,Johnson,Diana R.,Johnson
316,Lagan Valley,Jeffrey,Donaldson,Jeffrey M.,Donaldson
394,North Antrim,Ian,Paisley,Ian,Paisley Jnr
502,Slough,Tanmanjeet,Dhesi,Tan,Dhesi
510,South Down,Chris,Hazzard,Christopher,Hazzard
526,South West Norfolk,Liz,Truss,Elizabeth,Truss


In [35]:
# Remove extra suffixes in TheyWorkForYou Firstnames

df_mp['Firstname'] = df_mp['Firstname'].str.split().str[0]

In [36]:
# Repeat perfect match count

len(df.merge(df_mp))

628

In [37]:
# Find constituencies where either only the Firstname matches or only the Lastname
# These are matches we should have found

df_inner = df.merge(df_mp, on=['Constituency'])
df_unmatched = df_inner[(df_inner['Firstname_x'] == df_inner['Firstname_y']) & (df_inner['Lastname_x'] != df_inner['Lastname_y']) |
         (df_inner['Firstname_x'] != df_inner['Firstname_y']) & (df_inner['Lastname_x'] == df_inner['Lastname_y'])]
df_unmatched

Unnamed: 0,Constituency,Firstname_x,Lastname_x,Firstname_y,Lastname_y
99,Burton,Kate,Griffiths,Kate,Kniveton
122,Central Suffolk and North Ipswich,Dan,Poulter,Daniel,Poulter
392,Newton Abbot,Anne,Marie Morris,Anne,Morris
394,North Antrim,Ian,Paisley,Ian,Paisley Jnr
502,Slough,Tanmanjeet,Dhesi,Tan,Dhesi
510,South Down,Chris,Hazzard,Christopher,Hazzard
526,South West Norfolk,Liz,Truss,Elizabeth,Truss
605,Wealden,Nus,Ghani,Nusrat,Ghani
615,West Dunbartonshire,Martin,Docherty-Hughes,Martin,Docherty


In [38]:
# Save unmatched to pick up in Chapter 3

df_unmatched.to_csv('mps_unmatched.csv', index=False)

In [39]:
df.to_csv('mps_wiki_clean.csv', index=False)

In [40]:
df_mp.to_csv('mps_they_clean.csv', index=False)