# This Jupyter notebook scrapes unstructured data from the website "The Political Graveyard" (politicalgraveyard.com) into an Excel spreadsheet
## The historical dataset that it generates covers all religious politicians born in the United States, recording their religion, birth state and county, as well as the state they lived in.
### You will need to download chromedriver to execute this notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import requests

### Create a dataset for religion, state of residence, birth state and birth county for each politician

In [62]:
# Selenium drives a real Chrome window for the scraping below.

# Point this at your local chromedriver binary.
EXE_PATH = r'C:\Users\nikhi\Desktop\Summer 2020\RA Work\Scraping\chromedriver.exe'
browser = webdriver.Chrome(executable_path=EXE_PATH)

# Denominations whose politicians are listed on one page per state.
religion1 = [
    'Methodist', 'Catholic', 'Presbyterian', 'Episcopalian', 'Baptist',
    'Congregationalist', 'Jewish', 'Lutheran', 'Unitarian', 'Protestant',
    'Christian', 'Mormon', 'Disciples of Christ', 'Quaker',
    'Christian Reformed',
]

# Denominations whose politicians are all listed on a single page.
religion2 = [
    'Church of Christ', 'Eastern Orthodox', 'Christian Scientist',
    'Pentecostal', 'Brethren', 'Atheist or agnostic', 'Muslim',
    'Swedenborgian', 'Adventist', 'Nazarene', 'Mennonite', 'Spiritualist',
    'Buddhist', 'Puritan', 'Deist', 'Scientologist', 'Hindu',
]

# Scraped records, keyed by a running integer (see `count`).
dic_pol_name_state_rel = dict()

# Next free key in `dic_pol_name_state_rel`.
count = 0

# Scrape every denomination in `religion1`, i.e. those whose politicians are
# listed on one page per state (optionally split further into alphabetical
# sub-pages).

# Denominations whose state heading is followed by extra religion text after
# a <br/>; for these the state name is cut at the <br/> instead of at </p>.
_HEADING_HAS_SUBTITLE = ['Episcopalian', 'Congregationalist', 'Jewish', 'Protestant', 'Mormon', 'Quaker', 'Christian Reformed']


def _fetch_soup(page_url):
    """Download `page_url` and parse it with lxml, closing the HTTP response."""
    with urlopen(page_url) as response:
        return BeautifulSoup(response, 'lxml')


def _state_name(soup, rel):
    """Return the state name that follows 'Politicians in' in a <p> tag.

    If several paragraphs contain the marker the last one wins; '' is
    returned when none does.
    """
    marker = 'Politicians in'
    state = ''
    for paragraph in soup.find_all('p'):
        markup = str(paragraph)
        found = markup.find(marker)
        if found == -1:
            # a paragraph without the marker carries no state name
            continue
        start = found + 15  # len('Politicians in') plus the following space
        if rel in _HEADING_HAS_SUBTITLE:
            end = markup.find('<br/>', start)
        else:
            end = markup.find('</p>')
        state = markup[start:end]
    return state


def _politician_names(soup, rel):
    """Return the politician names found in the page's bold tags.

    The first bold tag is taken as a name; after that a bold tag is taken as
    a name only when it follows a bold tag containing the religion name.
    Scanning stops at the first bold tag containing 'Political'.
    """
    names = []
    expect_name = True
    for bold in soup.findAll('b'):
        text = bold.text
        if 'Political' in text:
            break
        if expect_name:
            # drop the birth/death years given in parentheses, if any
            names.append(text[:text.find('(')] if '(' in text else text)
            expect_name = False
        elif rel in text:
            expect_name = True
    return names


def _born_records(soup, rel, state):
    """Return one record per 'born' link found on the page.

    The page HTML is cut into one segment per politician (from the first
    occurrence of the name up to the first occurrence of the next name, or
    to the end of the page for the last name). Within a segment, every link
    whose href contains 'born' yields a record: the link text is the county
    and the text between that link and the next '<a', up to its last comma,
    is the birth state.
    """
    page_html = str(soup)
    names = _politician_names(soup, rel)
    offsets = [page_html.find(name) for name in names]

    records = []
    for i, name in enumerate(names):
        stop = offsets[i + 1] if i + 1 < len(offsets) else None
        segment = page_html[offsets[i]:stop]
        for anchor in BeautifulSoup(segment, features="html.parser").find_all('a'):
            # NOTE(review): this substring test also accepts hrefs that merely
            # contain 'born' (e.g. a surname such as Osborne), which presumably
            # explains relative-name rows in the output; tighten it once the
            # exact URL pattern of birth-place links is confirmed.
            if 'born' not in (anchor.get('href') or ''):
                continue
            link_text = anchor.get_text()
            # skip past '/a>' and the two characters that follow the link
            after_link = segment.find('/a>', segment.find(link_text)) + 5
            trailing = segment[after_link:segment.find('<a', after_link)].replace('\n', ' ')
            records.append({
                'lived_state': state,
                'born_state': trailing[:trailing.rfind(',')],
                'born_county': link_text.replace('\n', ' '),
                'religion': rel,
                'pol_name': name,
            })
    return records


url = "http://politicalgraveyard.com/index.html#PE"

# the implicit wait applies to every later find_element call, so set it once
browser.implicitly_wait(5)

try:
    for rel in religion1:
        # open the index page and follow the link named after the religion
        browser.get(url)
        try:
            browser.find_element(By.LINK_TEXT, rel).click()
        except NoSuchElementException:
            print("Skipping {!r}: no link with that text on the index page".format(rel))
            continue
        rel_url = browser.current_url

        # collect the per-state ('geo') links that appear before the footer
        # link 'The Political Graveyard'
        state_links = []
        for anchor in _fetch_soup(rel_url).findAll('a'):
            if 'The Political Graveyard' in anchor.text:
                break
            href = anchor.get('href') or ''
            if 'geo' in href:
                state_links.append(href)

        for st in state_links:
            # one download serves the state name, the records and the
            # discovery of alphabetical sub-pages
            soup = _fetch_soup(st)
            state = _state_name(soup, rel)

            for record in _born_records(soup, rel, state):
                dic_pol_name_state_rel[count] = record
                count += 1

            # Some states split their politicians into alphabetical sub-pages;
            # those are linked with a text containing both 'to' and 'zz'.
            clubbed_links = []
            for anchor in soup.findAll('a'):
                text = anchor.get_text()
                href = anchor.get('href') or ''
                if 'to' in text and 'zz' in text and 'geo' in href:
                    clubbed_links.append(href)

            for curr in clubbed_links:
                for record in _born_records(_fetch_soup(curr), rel, state):
                    dic_pol_name_state_rel[count] = record
                    count += 1
finally:
    # release the Chrome process even if scraping fails part-way
    browser.quit()

In [64]:
# Scrape every denomination in `religion2`, i.e. those whose politicians are
# all listed on a single page. Records are appended to
# `dic_pol_name_state_rel` under the running key `count`.
EXE_PATH = r'C:\Users\nikhi\Desktop\Summer 2020\RA Work\Scraping\chromedriver.exe'
browser = webdriver.Chrome(executable_path=EXE_PATH)

# the implicit wait applies to every later find_element call, so set it once
browser.implicitly_wait(5)

url = "http://politicalgraveyard.com/index.html#PE"

# These pages are not organised by state, so there is no state of residence
# to record; 'none' is the placeholder seen in the saved dataset.
state = 'none'

try:
    for rel in religion2:
        # open the index page and follow the link named after the religion
        browser.get(url)
        try:
            browser.find_element(By.LINK_TEXT, rel).click()
        except NoSuchElementException:
            print("Skipping {!r}: no link with that text on the index page".format(rel))
            continue
        rel_url = browser.current_url

        # download and parse the religion page once
        with urlopen(rel_url) as response:
            soup = BeautifulSoup(response, 'lxml')

        # the page HTML as a string, used to cut out one segment per politician
        string = str(soup)

        # The first bold tag is taken as a name; after that a bold tag is a
        # name only when it follows a bold tag containing the religion name.
        # A bold tag containing 'Political' ends the list.
        pol_list = []
        expect_name = True
        for bold in soup.findAll('b'):
            text = bold.text
            if 'Political' in text:
                break
            if expect_name:
                # drop the birth/death years given in parentheses, if any
                pol_list.append(text[:text.find('(')] if '(' in text else text)
                expect_name = False
            elif rel in text:
                expect_name = True

        # position of the first occurrence of each name in the page HTML
        locate = [string.find(pol_name) for pol_name in pol_list]

        for x, pol_name in enumerate(pol_list):
            # a politician's segment runs up to the next name, or to the end
            # of the page for the last name
            stop = locate[x + 1] if x + 1 < len(locate) else None
            segment = string[locate[x]:stop]

            for anchor in BeautifulSoup(segment, features="html.parser").find_all('a'):
                # NOTE(review): this substring test also accepts hrefs that
                # merely contain 'born' (e.g. a surname such as Osborne),
                # which presumably explains relative-name rows in the output;
                # tighten it once the URL pattern of birth-place links is
                # confirmed.
                if 'born' not in (anchor.get('href') or ''):
                    continue
                link_text = anchor.get_text()
                # skip past '/a>' and the two characters that follow the link
                after_link = segment.find('/a>', segment.find(link_text)) + 5
                # text between this link and the next one holds the birth state
                trailing = segment[after_link:segment.find('<a', after_link)].replace('\n', ' ')

                dic_pol_name_state_rel[count] = {'lived_state':state, 'born_state':trailing[:trailing.rfind(',')], 'born_county':link_text.replace('\n', ' '), 'religion':rel, 'pol_name':pol_name }
                count += 1
finally:
    # release the Chrome process even if scraping fails part-way
    browser.quit()

In [3]:
# Build a table from the scraped records: the dictionary keys become the row
# labels and the record fields become the columns.
df_pol_name_state_rel = pd.DataFrame(dic_pol_name_state_rel).transpose()

# Write the table to an Excel workbook through the xlsxwriter engine.
df_pol_name_state_rel.to_excel(
    'data_state_county_religion.xlsx',
    engine='xlsxwriter',
)

NameError: name 'dic_pol_name_state_rel' is not defined

### Change the name of birth state from abbreviated to its full name

In [5]:
# Replace abbreviated birth-state names with full state names.

# Lookup table of US states: the first column is used as the full name and
# 'ABB' as the abbreviation (spaces are stripped from the abbreviation).
df_st = pd.read_excel('state_name_abb.xlsx')
df_st['ABB'] = df_st['ABB'].str.replace(' ','')

# NOTE(review): the workbook read here was written together with its index,
# which presumably comes back as an extra 'Unnamed: 0' column; pass
# index_col=0 if that column is not wanted downstream.
df_pol_name_state_rel = pd.read_excel("data_state_county_religion.xlsx")

# abbreviation -> full state name; the first row wins for a repeated
# abbreviation. Blank or missing abbreviations are skipped because an empty
# string is a substring of every value.
state_by_abb = {}
for abb, full_name in zip(df_st['ABB'], df_st.iloc[:, 0]):
    if isinstance(abb, str) and abb:
        state_by_abb.setdefault(abb, full_name)


def _expand_state(raw):
    """Return the full state name for a scraped birth-state value.

    The value is matched by substring because the scraped text can carry
    extra characters around the abbreviation. When several abbreviations
    occur in the value, the longest one is used so that an abbreviation
    contained in a longer one does not capture it. Values containing no
    known abbreviation (including missing values) are returned unchanged.
    """
    text = str(raw)
    hits = [abb for abb in state_by_abb if abb in text]
    if not hits:
        return raw
    return state_by_abb[max(hits, key=len)]


# one pass over the column instead of a whole-column replace per row
df_pol_name_state_rel['born_state'] = df_pol_name_state_rel['born_state'].map(_expand_state)

In [3]:
# Persist the table with expanded state names to its own Excel workbook.
df_pol_name_state_rel.to_excel(
    'data_state_county_religion_final.xlsx',
    engine='xlsxwriter',
)

In [6]:
# Show the final dataframe (a notebook renders the value of a cell's last expression).
df_pol_name_state_rel

Unnamed: 0.1,Unnamed: 0,lived_state,born_state,born_county,religion,pol_name
0,0,Alabama,Alabama,Jefferson County,Methodist,"Oscar William Adams, Jr."
1,1,Alabama,Alabama,Winston County,Methodist,Robert Brown Aderholt
2,2,Alabama,Alabama,Blount County,Methodist,Miles Clayton Allgood
3,3,Alabama,Alabama,Lawrence County,Methodist,Edward Berton Almon
4,4,Alabama,Alabama,Mobile County,Methodist,Julian Leigh Andrews
...,...,...,...,...,...,...
20176,20176,none,second cousin five times removed of,Lithgow Osborne,Deist,Benjamin Franklin
20177,20177,none,Virginia,Albemarle County,Deist,Thomas Jefferson
20178,20178,none,Michigan,Wayne County,Scientologist,Sonny Bono
20179,20179,none,Delaware,New Castle County,Scientologist,Stephen A. Davis
