# Chapter 2 - Data Standardization

## Step 1 - Acquire Data

### Wikipedia

In [1]:
# Get the Wikipedia webpage and find all tables.

url = "https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election"

import requests
from bs4 import BeautifulSoup

website_url = requests.get(url).text
soup = BeautifulSoup(website_url,'html.parser')
tables = soup.find_all('table')

In [2]:
# Find the table with members returned, extract rows as a list of lists and load into dataframe

import pandas as pd

for table in tables:
    if 'Member returned' in table.text:
        headers = [header.text.strip() for header in table.find_all('th')]
        # To fix - headers returns 7 but only 5 columns 
        headers = headers[:5]
        dfrows = []
        table_rows = table.find_all('tr')    
        for row in table_rows:
            td = row.find_all('td')
            dfrow = [row.text for row in td if row.text!='\n']
            dfrows.append(dfrow)

df_w = pd.DataFrame(dfrows, columns=headers)            

In [3]:
df_w.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 5 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Constituency                       650 non-null    object
 1   Party of incumbentbefore election  650 non-null    object
 2   Member returned                    650 non-null    object
 3   Party of incumbentafter election   650 non-null    object
 4   Notes                              650 non-null    object
dtypes: object(5)
memory usage: 25.6+ KB


In [4]:
# Remove unwanted columns and check length

df_w = df_w[['Constituency','Member returned','Notes']]
len(df_w)

652

### Saving to Local Storage

In [17]:
# Save file to local storage
#df_w.to_csv('mps_wiki_raw.csv')

df_w = pd.read_csv('mps_wiki_raw.csv')

### They Work For You

In [18]:
# Download all current Members of Parliament

url = "https://www.theyworkforyou.com/mps/?f=csv"
df_t = pd.read_csv(url, header=0)

In [19]:
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Person ID     650 non-null    int64 
 1   First name    650 non-null    object
 2   Last name     650 non-null    object
 3   Party         650 non-null    object
 4   Constituency  650 non-null    object
 5   URI           650 non-null    object
dtypes: int64(1), object(5)
memory usage: 30.6+ KB


In [20]:
df_t.head(n=5)

Unnamed: 0,Person ID,First name,Last name,Party,Constituency,URI
0,10001,Diane,Abbott,Labour,Hackney North and Stoke Newington,https://www.theyworkforyou.com/mp/10001/diane_...
1,25034,Debbie,Abrahams,Labour,Oldham East and Saddleworth,https://www.theyworkforyou.com/mp/25034/debbie...
2,24878,Nigel,Adams,Conservative,Selby and Ainsty,https://www.theyworkforyou.com/mp/24878/nigel_...
3,25661,Bim,Afolami,Conservative,Hitchin and Harpenden,https://www.theyworkforyou.com/mp/25661/bim_af...
4,11929,Adam,Afriyie,Conservative,Windsor,https://www.theyworkforyou.com/mp/11929/adam_a...


### Add facebook links

In [21]:
def facelink(url):
    website_url = requests.get(url).text
    soup = BeautifulSoup(website_url,'html.parser')
    flinks = [f"{item['href']}" for item in soup.select(
            "a[href*='facebook.com']")]
    if flinks[0]!="https://www.facebook.com/TheyWorkForYou":
        return(flinks[0])
    else:
        return("")

In [22]:
df_t['Flink'] = df_t.apply(lambda x: facelink(x.URI), axis=1)

In [23]:
df_t.head(n=5)

Unnamed: 0,Person ID,First name,Last name,Party,Constituency,URI,Flink
0,10001,Diane,Abbott,Labour,Hackney North and Stoke Newington,https://www.theyworkforyou.com/mp/10001/diane_...,https://facebook.com/Dianeabbott
1,25034,Debbie,Abrahams,Labour,Oldham East and Saddleworth,https://www.theyworkforyou.com/mp/25034/debbie...,
2,24878,Nigel,Adams,Conservative,Selby and Ainsty,https://www.theyworkforyou.com/mp/24878/nigel_...,https://facebook.com/nigel.adamsmp
3,25661,Bim,Afolami,Conservative,Hitchin and Harpenden,https://www.theyworkforyou.com/mp/25661/bim_af...,
4,11929,Adam,Afriyie,Conservative,Windsor,https://www.theyworkforyou.com/mp/11929/adam_a...,https://facebook.com/adamafriyieofficial


In [24]:
# Remove unwanted columns and check length

df_t = df_t[['Constituency','First name','Last name','Flink']]
len(df_t)

650

### Saving to Local Storage

In [25]:
# Save file to local storage
# After next UK election when theyworkforyou update their website you may wish to 
# comment out this next line and use the raw file (captured in 2023) provided.

#df_t.to_csv('mps_they_raw.csv')

df_t = pd.read_csv('mps_they_raw.csv')

## Step 2 - Data Cleansing

#### Wikipedia Data

In [37]:
df_w.head(n=5)

Unnamed: 0.1,Unnamed: 0,Constituency,Fullname,Notes,Firstname,Lastname
1,1,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock
2,2,Aberconwy,Robin Millar,"Previous incumbent, Guto Bebb, did not stand\n",Robin,Millar
3,3,Aberdeen North,Kirsty Blackman,Seat held\n,Kirsty,Blackman
4,4,Aberdeen South,Stephen Flynn,"Previous incumbent, Ross Thomson, did not stand\n",Stephen,Flynn
5,5,Airdrie and Shotts,Neil Gray,Seat held\n,Neil,Gray


In [38]:
df_w.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Constituency,Fullname,Notes,Firstname,Lastname
646,646,Wythenshawe and Sale East,Mike Kane,Seat held\n,Mike,Kane
647,647,Yeovil,Marcus Fysh,Seat held\n,Marcus,Fysh
648,648,Ynys Môn,Virginia Crosbie,"Previous incumbent, Albert Owen, did not stand\n",Virginia,Crosbie
649,649,York Central,Rachael Maskell,Seat held\n,Rachael,Maskell
650,650,York Outer,Julian Sturdy,Seat held\n,Julian,Sturdy


In [28]:
# Rename columns for consistency

df_w = df_w.rename(columns={ 'Member returned' : 'Fullname'})

# Remove null rows at start and end and strip tailing '\n'

df_w = df_w.dropna()
df_w['Constituency'] = df_w['Constituency'].str.rstrip("\n")
df_w['Fullname'] = df_w['Fullname'].str.rstrip("\n")

In [29]:
# Check \n not elsewhere in Fullname

df_w[df_w['Fullname'].astype(str).str.contains('\n')]

Unnamed: 0.1,Unnamed: 0,Constituency,Fullname,Notes
228,228,Finchley and Golders Green,\nMike Freer,Seat held\n
373,373,Mole Valley,\nPaul Beresford,Seat held\n
545,545,Stockport,\nNavendu Mishra,"Previous incumbent, Ann Coffey, did not stand\n"
546,546,Stockton North,\nAlex Cunningham,Seat held\n
547,547,Stockton South,\nMatt Vickers,"Defeated incumbent, Paul Williams\n"
...,...,...,...,...
613,613,West Bromwich East,\nNicola Richards,"Previous incumbent, Tom Watson, did not stand\n"
614,614,West Bromwich West,\nShaun Bailey,"Previous incumbent, Adrian Bailey, did not sta..."
616,616,West Dunbartonshire,\nMartin Docherty-Hughes,Seat held\n
619,619,West Suffolk,\nMatthew Hancock,Seat held\n


In [30]:
# Strip leading '\n' 

df_w['Fullname'] = df_w['Fullname'].str.lstrip("\n")

In [31]:
# Check \n not elsewhere in Constituency

df_w[df_w['Constituency'].astype(str).str.contains('\n')]

Unnamed: 0.1,Unnamed: 0,Constituency,Fullname,Notes


In [32]:
# Split into Firstname and compound Lastname

df_w['Firstname'] = df_w['Fullname'].str.split().str[0]
df_w['Lastname'] = df_w['Fullname'].astype(str).apply(lambda x: ' '.join(x.split()[1:]))

In [33]:
# Check for compound lastnames

df_w[df_w['Lastname'].astype(str).str.contains(' ')]['Lastname']

31       de Cordova
134    Duncan Smith
393    Marie Morris
592      Ahmad Khan
Name: Lastname, dtype: object

#### They Work for You Data

In [34]:
df_t = df_t.rename(columns={'Last name' : 'Lastname', 'First name' : 'Firstname'})

### Calculate Exact Match Counts 

In [52]:
# All matching columns

len(df_w.merge(df_t, on=['Constituency','Firstname','Lastname']))

599

In [53]:
# Match on First name and Last name

len(df_w.merge(df_t, on=['Firstname','Lastname'] ))

624

In [54]:
# Match on Consistency and Last name

len(df_w.merge(df_t, on=['Constituency','Lastname'] ))

607

In [55]:
# Match on Consistency and First name

len(df_w.merge(df_t, on=['Constituency','Firstname'] ))

602

In [56]:
# Match on Lastname

len(df_w.merge(df_t, on=['Lastname'] ))

982

In [57]:
# Match on Firstname

len(df_w.merge(df_t, on=['Firstname'] ))

2663

In [58]:
# Match on Consistency

len(df_w.merge(df_t, on=['Constituency'] ))

623

## Step 3 - Further Cleansing

### Consitutency

In [59]:
df_w_outer = df_w.merge(df_t, on=['Constituency'],how="outer",indicator=True)
df_w_outer[df_w_outer['_merge']=='right_only']['Constituency'].head(n=5)

650    Birmingham, Hall Green
651      Liverpool, Wavertree
652         Sheffield, Hallam
653     Liverpool, West Derby
654    Birmingham, Hodge Hill
Name: Constituency, dtype: object

In [60]:
df_w_outer[df_w_outer['_merge']=='left_only']['Constituency'].head(n=5)

46     Birmingham Edgbaston
47     Birmingham Erdington
48    Birmingham Hall Green
49    Birmingham Hodge Hill
50      Birmingham Ladywood
Name: Constituency, dtype: object

In [61]:
# Remove commas from both dataframes

df_t['Constituency'] = df_t['Constituency'].str.replace(',', '')
df_w['Constituency'] = df_w['Constituency'].str.replace(',', '')

In [62]:
# Match on Consistency

len(df_w.merge(df_t, on=['Constituency'] ))

650

In [64]:
# Repeat perfect match count

len(df_w.merge(df_t, on=['Constituency','Firstname','Lastname']))

624

### Firstname

In [65]:
df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))
df_w_inner[(df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t'])]

Unnamed: 0,Unnamed: 0_w,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Firstname_t,Lastname_t,Flink
4,5,Airdrie and Shotts,Neil Gray,Seat held\n,Neil,Gray,500,Anum,Qaisar,
29,30,Batley and Spen,Tracy Brabin,Seat held\n,Tracy,Brabin,361,Kim,Leadbeater,
47,48,Birmingham Erdington,Jack Dromey,Seat held\n,Jack,Dromey,262,Paulette,Hamilton,
130,131,Chesham and Amersham,Cheryl Gillan,Seat held\n,Cheryl,Gillan,250,Sarah,Green,
139,140,City of Chester,Chris Matheson,Seat held\n,Chris,Matheson,156,Samantha,Dixon,
268,269,Hartlepool,Mike Hill,Seat held\n,Mike,Hill,450,Jill,Mortimer,
392,393,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,445,Anne Marie,Morris,https://facebook.com/annemarie.morris.NA
410,411,North Shropshire,Owen Paterson,Seat held\n,Owen,Paterson,443,Helen,Morgan,
432,433,Old Bexley and Sidcup,James Brokenshire,Seat held\n,James,Brokenshire,223,Louie,French,
531,532,Southend West,David Amess,Seat held\n,David,Amess,206,Anna,Firth,


In [66]:
df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))
df_w_inner[(df_w_inner['Firstname_w'] == df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t']) |
         (df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] == df_w_inner['Lastname_t'])]

Unnamed: 0,Unnamed: 0_w,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Firstname_t,Lastname_t,Flink
46,47,Birmingham Edgbaston,Preet Gill,Seat held\n,Preet,Gill,236,Preet Kaur,Gill,https://facebook.com/PreetKaurGillMP
99,100,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,349,Kate,Kniveton,
122,123,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,494,Daniel,Poulter,
272,273,Hayes and Harlington,John McDonnell,Seat held\n,John,McDonnell,412,John Martin,McDonnell,https://facebook.com/johnmcdonnellmp
311,312,Kingston upon Hull North,Diana Johnson,Seat held\n,Diana,Johnson,325,Diana R.,Johnson,https://facebook.com/DianaJohnsonHullNorth
316,317,Lagan Valley,Jeffrey Donaldson,Seat held\n,Jeffrey,Donaldson,161,Jeffrey M.,Donaldson,https://facebook.com/jeffrey.donaldson1
394,395,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,480,Ian,Paisley Jnr,
502,503,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,153,Tan,Dhesi,https://facebook.com/tandhesi
510,511,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,278,Christopher,Hazzard,https://facebook.com/chris.hazzard.77
526,527,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,602,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk


In [67]:
# Remove extra suffixes in TheyWorkForYou Firstnames

df_t['Firstname'] = df_t['Firstname'].str.split().str[0]

In [68]:
# Final resolved match count

df_resolved = df_w.merge(df_t, on=['Firstname','Lastname'] )
len(df_resolved)

628

In [69]:
df_w_inner = df_w.merge(df_t, on=['Constituency'], suffixes=('_w', '_t'))
df_w_unmatched = df_w_inner[(df_w_inner['Firstname_w'] == df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] != df_w_inner['Lastname_t']) |
         (df_w_inner['Firstname_w'] != df_w_inner['Firstname_t']) & (df_w_inner['Lastname_w'] == df_w_inner['Lastname_t'])]
df_w_unmatched

Unnamed: 0,Unnamed: 0_w,Constituency,Fullname,Notes,Firstname_w,Lastname_w,Unnamed: 0_t,Firstname_t,Lastname_t,Flink
99,100,Burton,Kate Griffiths,"Previous incumbent, Andrew Griffiths, did not ...",Kate,Griffiths,349,Kate,Kniveton,
122,123,Central Suffolk and North Ipswich,Dan Poulter,Seat held\n,Dan,Poulter,494,Daniel,Poulter,
392,393,Newton Abbot,Anne Marie Morris,Seat held\n,Anne,Marie Morris,445,Anne,Morris,https://facebook.com/annemarie.morris.NA
394,395,North Antrim,Ian Paisley,Seat held\n,Ian,Paisley,480,Ian,Paisley Jnr,
502,503,Slough,Tanmanjeet Dhesi,Seat held\n,Tanmanjeet,Dhesi,153,Tan,Dhesi,https://facebook.com/tandhesi
510,511,South Down,Chris Hazzard,Seat held\n,Chris,Hazzard,278,Christopher,Hazzard,https://facebook.com/chris.hazzard.77
526,527,South West Norfolk,Liz Truss,Seat held\n,Liz,Truss,602,Elizabeth,Truss,https://facebook.com/ElizabethTrussSWNorfolk
605,606,Wealden,Nus Ghani,Seat held\n,Nus,Ghani,230,Nusrat,Ghani,https://facebook.com/NusGhaniofficial
615,616,West Dunbartonshire,Martin Docherty-Hughes,Seat held\n,Martin,Docherty-Hughes,159,Martin,Docherty,https://facebook.com/MartinDochertySNP


## Sample Problem: Find MPs who held their seat and currently have Facebook account

In [70]:
df_resolved.head(n=5)

Unnamed: 0,Unnamed: 0_x,Constituency_x,Fullname,Notes,Firstname,Lastname,Unnamed: 0_y,Constituency_y,Flink
0,1,Aberavon,Stephen Kinnock,Seat held\n,Stephen,Kinnock,346,Aberavon,https://facebook.com/stephenkinnock
1,2,Aberconwy,Robin Millar,"Previous incumbent, Guto Bebb, did not stand\n",Robin,Millar,429,Aberconwy,
2,3,Aberdeen North,Kirsty Blackman,Seat held\n,Kirsty,Blackman,48,Aberdeen North,https://facebook.com/aberdeennorth
3,4,Aberdeen South,Stephen Flynn,"Previous incumbent, Ross Thomson, did not stand\n",Stephen,Flynn,211,Aberdeen South,
4,6,Aldershot,Leo Docherty,Seat held\n,Leo,Docherty,158,Aldershot,https://facebook.com/pg/LeoDocherty4Aldershot


In [71]:
# Select those records with a non-null Facebook reference

df_heldwithface = df_resolved[(df_resolved['Flink']!="") & (df_resolved['Notes']=="Seat held\n")]
len(df_heldwithface)

474

## Save Files for Subsequent Chapters

In [72]:
# Save unmatched to pick up in Chapter 3

df_w_unmatched.to_csv('mps_unmatched.csv', index=False)

In [73]:
df_w.to_csv('mps_wiki_clean.csv', index=False)

In [74]:
df_t.to_csv('mps_they_clean.csv', index=False)