# Chapter 2 - Data Standardization

## Step 1 - Acquire Data

### Wikipedia

In [None]:
# Get the Wikipedia webpage and find all tables.

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election"

# Fail fast on network problems: without a timeout the request can hang
# indefinitely, and without a status check an HTTP error page would be parsed
# as if it were the article, so the failure would only surface later when no
# matching table is found.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text
soup = BeautifulSoup(website_url, 'html.parser')

# Every <table> element on the page; the relevant one is selected later.
tables = soup.find_all('table')

In [None]:
# Find the table with members returned, extract rows as a list of lists and load into dataframe

import pandas as pd

for table in tables:
    # Skip every table that does not mention the 'Member returned' column.
    if 'Member returned' not in table.text:
        continue

    # To fix - the <th> scan returns 7 headers but the rows only have 5 columns,
    # so only the first five header texts are kept.
    headers = [th.text.strip() for th in table.find_all('th')][:5]

    # One list per <tr>; cells whose text is a lone newline are dropped.
    # Rows without any <td> cells contribute an empty list.
    dfrows = [
        [cell.text for cell in tr.find_all('td') if cell.text != '\n']
        for tr in table.find_all('tr')
    ]

# Built after the loop, so the last matching table is the one that is loaded.
df_w = pd.DataFrame(dfrows, columns=headers)

In [None]:
# Summarise the scraped Wikipedia frame: column names, dtypes and non-null counts.
df_w.info()

In [None]:
# Keep only the three columns used downstream, then report the row count

df_w = df_w.loc[:, ['Constituency', 'Member returned', 'Notes']]
df_w.shape[0]

### Saving to Local Storage

In [None]:
# Save file to local storage
# The to_csv line is left commented out so that the notebook works from the
# stored copy of the scrape; uncomment it to refresh 'mps_wiki_raw.csv'.
# NOTE(review): to_csv as written also saves the index, so a file produced by
# it reads back with an extra 'Unnamed: 0' column - confirm against the stored file.
#df_w.to_csv('mps_wiki_raw.csv')

# Replaces the in-memory df_w with the contents of the stored file.
df_w = pd.read_csv('mps_wiki_raw.csv')

### They Work For You

In [None]:
# Download all current Members of Parliament
# pandas reads the CSV export directly from the URL (network access required).
# Note that this reuses the name `url`, replacing the Wikipedia address.

url = "https://www.theyworkforyou.com/mps/?f=csv"
df_t = pd.read_csv(url, header=0)

In [None]:
# Summarise the TheyWorkForYou frame: column names, dtypes and non-null counts.
df_t.info()

In [None]:
# Preview the first five downloaded rows.
df_t.head(n=5)

### Add Facebook links

In [None]:
def facelink(url, timeout=None):
    """Return the first Facebook link found on the page at ``url``.

    The page is downloaded and every ``<a>`` whose ``href`` contains
    ``facebook.com`` is collected in document order.

    Parameters
    ----------
    url : str
        Address of the page to scan.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. The default ``None`` keeps the
        previous behaviour of waiting indefinitely.

    Returns
    -------
    str
        The ``href`` of the first Facebook anchor, or ``""`` when the page has
        no Facebook anchor or the first one is the TheyWorkForYou page itself.
    """
    # Presumably the site's own Facebook page, linked from every profile;
    # finding it first is treated as "this MP has no Facebook link".
    own_page = "https://www.facebook.com/TheyWorkForYou"

    website_url = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(website_url, 'html.parser')
    flinks = [str(item['href']) for item in soup.select("a[href*='facebook.com']")]

    # A page without any Facebook anchor used to raise IndexError on flinks[0],
    # which aborted the whole DataFrame.apply run; treat it as "no link".
    if not flinks or flinks[0] == own_page:
        return ""
    return flinks[0]

In [None]:
# Look up the Facebook link for each MP's profile page (one HTTP request per row).
df_t['Flink'] = df_t['URI'].map(facelink)

In [None]:
# Preview the first five rows, now including the Flink column.
df_t.head(n=5)

In [None]:
# Keep only the four columns used downstream, then report the row count

df_t = df_t.loc[:, ['Constituency', 'First name', 'Last name', 'Flink']]
df_t.shape[0]

### Saving to Local Storage

In [None]:
# Save file to local storage
# The to_csv line below is left commented out so that the notebook works from
# the raw file (captured in 2023) provided. This matters after the next UK
# election, when theyworkforyou update their website and a fresh download no
# longer matches the 2019 Wikipedia list. Uncomment it only to store a new scrape.
# NOTE(review): to_csv as written also saves the index, so a file produced by
# it reads back with an extra 'Unnamed: 0' column - confirm against the stored file.

#df_t.to_csv('mps_they_raw.csv')

# Replaces the in-memory df_t with the stored copy. read_csv parses empty
# fields as NaN, so an empty-string Flink does not survive the round trip.
df_t = pd.read_csv('mps_they_raw.csv')

## Step 2 - Data Cleansing

#### Wikipedia Data

In [None]:
# Inspect the first five Wikipedia rows before cleansing.
df_w.head(n=5)

In [None]:
# Inspect the last five Wikipedia rows before cleansing.
df_w.tail(n=5)

In [None]:
# Rename columns for consistency, remove null rows at start and end,
# and strip the trailing '\n' from Constituency and Fullname

df_w = (
    df_w
    .rename(columns={'Member returned': 'Fullname'})
    .dropna()  # drops every row that has a null in any column
    .assign(
        Constituency=lambda frame: frame['Constituency'].str.rstrip("\n"),
        Fullname=lambda frame: frame['Fullname'].str.rstrip("\n"),
    )
)

In [None]:
# Check \n not elsewhere in Fullname: list any rows that still contain one

df_w.loc[df_w['Fullname'].astype(str).str.contains('\n')]

In [None]:
# Strip leading '\n' from Fullname

df_w = df_w.assign(Fullname=df_w['Fullname'].str.lstrip("\n"))

In [None]:
# Check \n not elsewhere in Constituency: list any rows that still contain one

df_w.loc[df_w['Constituency'].astype(str).str.contains('\n')]

In [None]:
# Split into Firstname and compound Lastname:
# the first whitespace-separated token is the first name, and every remaining
# token, re-joined with single spaces, is the last name.

df_w = df_w.assign(
    Firstname=df_w['Fullname'].str.split().str.get(0),
    Lastname=df_w['Fullname'].astype(str).map(
        lambda full: ' '.join(full.split()[1:])
    ),
)

In [None]:
# Check for compound lastnames: show the Lastname values that contain a space

df_w.loc[df_w['Lastname'].astype(str).str.contains(' '), 'Lastname']

#### They Work for You Data

In [None]:
# Align the TheyWorkForYou name columns with the Wikipedia frame's naming.
df_t = df_t.rename(
    columns={
        'First name': 'Firstname',
        'Last name': 'Lastname',
    }
)

### Calculate Exact Match Counts 

In [None]:
# Exact matches on all three columns: Constituency, Firstname and Lastname

pd.merge(df_w, df_t, on=['Constituency', 'Firstname', 'Lastname']).shape[0]

In [None]:
# Exact matches on Firstname and Lastname

pd.merge(df_w, df_t, on=['Firstname', 'Lastname']).shape[0]

In [None]:
# Exact matches on Constituency and Lastname

pd.merge(df_w, df_t, on=['Constituency', 'Lastname']).shape[0]

In [None]:
# Exact matches on Constituency and Firstname

pd.merge(df_w, df_t, on=['Constituency', 'Firstname']).shape[0]

In [None]:
# Exact matches on Lastname alone (every pair sharing a last name is counted)

pd.merge(df_w, df_t, on=['Lastname']).shape[0]

In [None]:
# Exact matches on Firstname alone (every pair sharing a first name is counted)

pd.merge(df_w, df_t, on=['Firstname']).shape[0]

In [None]:
# Exact matches on Constituency alone

pd.merge(df_w, df_t, on=['Constituency']).shape[0]

## Step 3 - Further Cleansing

### Constituency

In [None]:
# Outer-join on Constituency; the '_merge' column records which frame each row came from.
df_w_outer = pd.merge(df_w, df_t, how="outer", on=['Constituency'], indicator=True)

# First five constituencies present only in the TheyWorkForYou frame.
df_w_outer.loc[df_w_outer['_merge'] == 'right_only', 'Constituency'].head(n=5)

In [None]:
# First five constituencies present only in the Wikipedia frame.
df_w_outer.loc[df_w_outer['_merge'] == 'left_only', 'Constituency'].head(n=5)

In [None]:
# Remove commas from the Constituency column of both dataframes
# (',' is replaced as a literal character, not as a pattern)

df_w = df_w.assign(Constituency=df_w['Constituency'].str.replace(',', '', regex=False))
df_t = df_t.assign(Constituency=df_t['Constituency'].str.replace(',', '', regex=False))

In [None]:
# Exact matches on Constituency, repeated after the comma clean-up

pd.merge(df_w, df_t, on=['Constituency']).shape[0]

In [None]:
# Repeat the perfect match count on Constituency, Firstname and Lastname

pd.merge(df_w, df_t, on=['Constituency', 'Firstname', 'Lastname']).shape[0]

### Firstname

In [None]:
# Pair the two sources by Constituency; shared column names get _w / _t suffixes.
df_w_inner = pd.merge(df_w, df_t, on=['Constituency'], suffixes=('_w', '_t'))

# Rows where neither the first name nor the last name agrees.
df_w_inner[
    ~(
        df_w_inner['Firstname_w'].eq(df_w_inner['Firstname_t'])
        | df_w_inner['Lastname_w'].eq(df_w_inner['Lastname_t'])
    )
]

In [None]:
# Pair the two sources by Constituency; shared column names get _w / _t suffixes.
df_w_inner = pd.merge(df_w, df_t, on=['Constituency'], suffixes=('_w', '_t'))

# Rows where exactly one of first name / last name agrees (exclusive or).
df_w_inner[
    df_w_inner['Firstname_w'].eq(df_w_inner['Firstname_t'])
    ^ df_w_inner['Lastname_w'].eq(df_w_inner['Lastname_t'])
]

In [None]:
# Remove extra suffixes in TheyWorkForYou Firstnames:
# keep only the first whitespace-separated token

df_t = df_t.assign(Firstname=df_t['Firstname'].str.split().str.get(0))

In [None]:
# Final resolved match count: join on Firstname and Lastname only

df_resolved = pd.merge(df_w, df_t, on=['Firstname', 'Lastname'])
df_resolved.shape[0]

In [None]:
# Re-pair by Constituency now that the TheyWorkForYou first names are trimmed.
df_w_inner = pd.merge(df_w, df_t, on=['Constituency'], suffixes=('_w', '_t'))

# Unmatched = exactly one of first name / last name agrees (exclusive or).
df_w_unmatched = df_w_inner[
    df_w_inner['Firstname_w'].eq(df_w_inner['Firstname_t'])
    ^ df_w_inner['Lastname_w'].eq(df_w_inner['Lastname_t'])
]
df_w_unmatched

## Sample Problem: Find MPs who held their seat and currently have a Facebook account

In [None]:
# Preview the first five resolved (name-matched) records.
df_resolved.head(n=5)

In [None]:
# Select those records with a non-null Facebook reference

# Flink is "" when no link was found during scraping, but once the data has
# been through read_csv those empty fields are NaN. NaN != "" evaluates to
# True, so comparing against "" alone would keep every MP without a link.
# Require the value to be both present and non-empty.
has_facebook = df_resolved['Flink'].notna() & (df_resolved['Flink'] != "")

# Notes was never stripped during cleansing, so the value still carries the
# trailing newline from the scraped table cell.
held_seat = df_resolved['Notes'] == "Seat held\n"

df_heldwithface = df_resolved[has_facebook & held_seat]
len(df_heldwithface)

## Save Files for Subsequent Chapters

In [None]:
# Save unmatched to pick up in Chapter 3
# index=False keeps the row index out of the file.

df_w_unmatched.to_csv('mps_unmatched.csv', index=False)

In [None]:
# Save the cleansed Wikipedia frame; index=False keeps the row index out of the file.
df_w.to_csv('mps_wiki_clean.csv', index=False)

In [None]:
# Save the cleansed TheyWorkForYou frame; index=False keeps the row index out of the file.
df_t.to_csv('mps_they_clean.csv', index=False)