In [1]:
import time
import os
import csv

from cand_methods import scrape_candbio

import requests
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool

import pandas as pd
import numpy as np

In [3]:
#!pwd

In [4]:
#os.chdir('C:/Users/matth/Documents/Coding/RA Fouirnaies/vs_candidate_bios')

## Candidate Biographies

### Exploring Candidate Pages

Each candidate bio page follows the URL pattern https://justfacts.votesmart.org/candidate/biography/333/janet-carroll-richardson. Only the number has to be supplied; the site fills in the name portion of the URL automatically.

Appears to end at candidate 209,867. However, there could be gaps in the middle.

In [4]:
def make_query(num):
    """Return the biography-page URL for the candidate whose id is ``num``."""
    return f'https://justfacts.votesmart.org/candidate/biography/{num}'

**Initial Remarks**: The biographical information of candidates is organized into a number of collapsible cards ['Personal','Education','Political Experience', 'Caucuses/Former Committees', 'Professional Experience', 'Religious, Civic, and other Memberships'] (6 total). Information within the first card 'Personal' is organized slightly differently than in the other cards, with a bold header for each piece of information, while in the other 5 cards information is presented plainly in rows. 

The current plan is to write one or more general functions that loop over every card and, within each card, over its individual attributes.

In [5]:

# Download and parse one known candidate page (id 333) to explore its layout.
q = make_query(333)
# A timeout keeps the cell from hanging indefinitely if the server never answers.
request = requests.get(q, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page below.
request.raise_for_status()
soup = BeautifulSoup(request.content, 'html.parser')


In [6]:
# Collect the collapsible biography cards that carry exactly these classes.
cards = soup.find_all('div', class_='card card-plain accordion-card')

In [7]:
# All cards besides the Education card, which has the class
# 'card card-plain accordion-card accordion-header'.
len(cards)

5

**Personal**

In [8]:
# The first card is examined here; gather all of its <p> tags.
personal_card = cards[0]
vals = personal_card.find_all('p')
len(vals)

12

In [9]:
# Print the stripped text of every paragraph in the first card.
# get_text() is used instead of .string because .string is None whenever a
# <p> holds more than one child node, and None.strip() would raise.
for val in vals:
    print(val.get_text(strip=True))

Full Name:
Janet Carroll 'Skeet' Richardson
Gender:
Female
Family:
Divorced;   1 daughter: Bethany, 10 yrs. old
Birth Date:
01/09/1959
Birth Place:
Fort Worth, TX
Home City:
Keller, TX


In [10]:
# Join each label paragraph with the value paragraph that follows it,
# e.g. 'Gender:' + 'Female' -> 'Gender:Female'.
# zip() over the even/odd slices pairs them up and ignores a trailing label
# that has no value, where indexing vals[x+1] would raise IndexError.
# get_text() also copes with paragraphs whose .string is None.
pairs = [
    label.get_text(strip=True) + value.get_text(strip=True)
    for label, value in zip(vals[0::2], vals[1::2])
]
# Every entry except the last carries a trailing ';' separator.
entries = [pair + ';' for pair in pairs[:-1]] + pairs[-1:]

In [11]:
entries # possible improvement: build a dictionary (label -> value) instead of a list of joined strings

["Full Name:Janet Carroll 'Skeet' Richardson;",
 'Gender:Female;',
 'Family:Divorced;   1 daughter: Bethany, 10 yrs. old;',
 'Birth Date:01/09/1959;',
 'Birth Place:Fort Worth, TX;',
 'Home City:Keller, TX']

**Education**

In [12]:
# The Education card carries the extra 'accordion-header' class, so it is
# looked up on its own.
edu_card = soup.find('div', class_='card card-plain accordion-card accordion-header')

In [13]:
# Collect the <p> tags inside the Education card.
edu_list = edu_card.find_all('p')

In [14]:
# Print the stripped text of each Education paragraph.
# get_text() is used instead of .string because .string is None whenever a
# <p> holds more than one child node, and None.strip() would raise.
for edu in edu_list:
    print(edu.get_text(strip=True))

Texas Christian University


**Others**

In [15]:
# Text of every <p> in each card after the first one, as one inner list per card.
# cards[1:] skips the first card by position; the earlier `card != cards[0]`
# test compared tags by content, so a later card identical to the first
# would have been dropped as well.
# get_text() avoids the AttributeError that .string.strip() raises when a <p>
# has more than one child node (.string is None in that case).
others = [
    [para.get_text(strip=True) for para in card.find_all('p')]
    for card in cards[1:]
]

In [16]:
# Display the paragraph text gathered from the cards after the first, one list per card.
others

[['Legislative Aide, State Representative Doyle Willis',
  'Campaign Director for Congressional candidate George Richardson, 1986'],
 ['No caucus information on file.'],
 ["President/Producer, `Positive' TV Programming, Richardson-Carroll Prod",
  "Author, `Unsung Heroes'",
  'Public Relations Executive Television Executive, WFAA-TV, ABC'],
 ["Regional Member, Emmy's",
  'Member, National Association of Female Executives',
  'Member, Women in Communications']]

#### Testing Function(s) for scraping an individual page and writing it to a row of a csv

In [17]:
from cand_methods import scrape_candbio

In [2]:
# Column names written as the first row of the output CSV.
headers = [
    'Candidate Number',
    'Personal',
    'Education',
    'Political Experience',
    'Caucuses/Former Committees',
    'Professional Experience',
    'Religious, Civic, and other Memberships',
    'Additional Information',
]

In [3]:
# (Re)create the output CSV so that it contains only the header row.
# newline='' is what the csv module asks for on files it writes; without it,
# platforms that translate '\n' to '\r\n' get an extra blank line after each row.
# The with-block closes the file even if writing the header fails.
with open('./draft_cbios.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headers)

In [20]:
# Timing harness for scraping candidates 1-100 one after another.
start_time = time.time()

# The scraping loop is left disabled; only the timing scaffold runs.
#for x in range(1,101):
#    scrape_candbio(x)

elapsed = time.time() - start_time
print("My program took", elapsed, "to run") # 467.9851350784302 (8 min) for 100 bios

My program took 467.9851350784302 to run


**Troubleshooting**

In [1]:
from cand_methods import make_query
from cand_methods import get_personal
from cand_methods import get_edu
from cand_methods import get_others

In [62]:
# Step-by-step reproduction of the scrape for a single candidate id, used to
# find out where a problematic page fails.
num = 73
query = make_query(num)

# Number of placeholder values written after the candidate number.
# NOTE(review): the `headers` list in this notebook has 7 columns after
# 'Candidate Number'; confirm that 6 placeholders is intended.
n_fields = 6

# Start from a clean slate so that a failure below cannot silently reuse
# `req`/`soup` left over from an earlier cell.
row = None
req = None
soup = None

try:
    # The timeout prevents the cell from hanging if the server never answers.
    req = requests.get(query, timeout=30)
except requests.exceptions.RequestException as err:
    # Covers connection errors, timeouts and redirect loops raised by requests.
    print('request failed for', num, '-', err)
    row = [num] + ['redirect error'] * n_fields

if req is not None:
    try:
        soup = BeautifulSoup(req.content, 'html.parser')
    except Exception as err:
        print('parsing failed for', num, '-', err)
        row = [num] + ['unknown'] * n_fields

if soup is not None:
    # checking if page is missing/empty i.e. there is no candidate with that id number
    main = soup.find(name='div', class_='row text-center')
    cont = main.find(name='div', class_='container') if main is not None else None
    pnf = cont.find(name='h1', class_='title') if cont is not None else None
    if pnf is not None and pnf.string == 'Page Not Found':
        row = [num] + [np.nan] * n_fields

    cards = soup.find_all(name='div', class_='card card-plain accordion-card') # all cards besides education
    edu_card = soup.find(name='div', class_='card card-plain accordion-card accordion-header')
else:
    # Nothing was parsed, so there are no cards to inspect.
    cards = []
    edu_card = None

### Parallelization of BeautifulSoup

Current estimates put the total scraping time at more than 266 hours, so parallelization will be necessary to complete this task. According to online forums, `requests.get` is the bottleneck, so the options are either to rewrite the existing function using Selenium and reuse the existing `parallel` function (possibly unnecessarily bulky), or to write a new function that parallelizes the BeautifulSoup tasks.

In [5]:
# Candidate ids 1 through 100, used as the benchmark workload.
test_data = [*range(1, 101)] # without multiprocessing this should take around 8 minutes

In [9]:
from multiprocessing.pool import ThreadPool

# Time a scrape of the test ids spread over six worker threads.
start_time = time.time()

with ThreadPool(processes=6) as pool:
    pool.map(scrape_candbio, test_data)

elapsed = time.time() - start_time
print("My program took", elapsed, "to run") # 91.76298093795776 s (1 min 30 s) approx 5x faster
# this runtime is before checking for duplicates
# this runtime is before checking for duplicates

My program took 91.76298093795776 to run


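**Result**: Running the scrape through a thread pool (`ThreadPool`, i.e. multithreading rather than multiprocessing) is about 5x faster, but it appears to repeat tasks; preventing this could optimize the run further.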

*Potential Solution*: A brute-force solution would be to read the CSV at the start of the function, check whether the candidate number already appears in the number column, and return early if it does.

In [5]:
# Repeat the six-thread timing run over the same test ids.
start_time = time.time()

with ThreadPool(processes=6) as pool:
    pool.map(scrape_candbio, test_data)

elapsed = time.time() - start_time
print("My program took", elapsed, "to run") # 91.63398146629333  (1 min 30 s) almost same runtime

My program took 91.63398146629333 to run


**Result**: Resolved the issue with repeated tasks; the runtime was essentially unchanged (about 91.6 s versus 91.8 s).

#### Attempting Multiple Runs

In [6]:
# run 1: candidate ids 1 through 9,999 (the end of the range is exclusive)

data_1_10k = list(range(1,10000))

headers = ['Candidate Number','Personal','Education','Political Experience','Caucuses/Former Committees',
           'Professional Experience','Religious, Civic, and other Memberships','Additional Information']

# Start the run from a fresh CSV that contains only the header row.
# newline='' is what the csv module asks for on files it writes; without it,
# platforms that translate '\n' to '\r\n' get an extra blank line after each row.
# The with-block closes the file even if writing the header fails.
with open('./draft_cbios.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headers)

start_time = time.time()

# Scrape all ids in this batch across six worker threads.
with ThreadPool(6) as pool:
    pool.map(scrape_candbio, data_1_10k)

print("My program took", time.time()-start_time, "to run") # 8597.337768554688 (2 hr 25 min)

error in: 5186
error in: 5594
error in:error in: 7216
error in: error in: 6409 6801

error in: 5187
error in: 5595
5990
error in: 5991
error in: 6802
error in: 5992
error in:error in: 5596
 6803
error in: 5188
error in: 6410
error in: 7217
error in: 6804
error in: 6411
error in: 5993
error in: 5994
error in: 5597
error in: 5995
error in: 5189
error in: 7218
error in: 5598
error in: 6805
error in: 5996
error in: 6412
error in:error in:error in: 5190
error in:  5599
6806
 error in:error in: 5997
 6413
7219
error in: 6414
error in: 5191
error in:error in: 7220
 6807
error in:error in: 5600
error in: 6808
 6415
error in: 7221
error in: 5998
error in: 5192
error in:error in: 6809
error in: 5601
 error in: 5193
error in: 6416
error in: 5999
7222
error in:error in: 6000
 7223
error in: 5602
My program took 8597.337768554688 to run


**Errors from above 1 to 10k run**

error in: 5186
error in: 5594
error in:error in: 7216
error in: error in: 6409 6801

error in: 5187
error in: 5595
5990
error in: 5991
error in: 6802
error in: 5992
error in:error in: 5596
 6803
error in: 5188
error in: 6410
error in: 7217
error in: 6804
error in: 6411
error in: 5993
error in: 5994
error in: 5597
error in: 5995
error in: 5189
error in: 7218
error in: 5598
error in: 6805
error in: 5996
error in: 6412
error in:error in:error in: 5190
error in:  5599
6806
 error in:error in: 5997
 6413
7219
error in: 6414
error in: 5191
error in:error in: 7220
 6807
error in:error in: 5600
error in: 6808
 6415
error in: 7221
error in: 5998
error in: 5192
error in:error in: 6809
error in: 5601
 error in: 5193
error in: 6416
error in: 5999
7222
error in:error in: 6000
 7223
error in: 5602
