# Data Acquisition 
## The purpose of this kernel is to illustrate the use of BeautifulSoup to scrape election data

In [13]:
# Let's write a script to scrape the NA 2008 data
import requests
import urllib.parse
import pandas as pd

# Download the 2008 general-election results page; the raw response bytes are
# kept in `contents` so they can be handed to an HTML parser.
page = requests.get(
    'https://ecp.gov.pk/ResultDetails.aspx?EleId=1&Election=General%20Election%2018%20Feb%202008'
)
contents = page.content

In [2]:
# Import Beautiful Soup and parse the downloaded page with Python's built-in HTML parser
from bs4 import BeautifulSoup
soup = BeautifulSoup(contents, features='html.parser')

In [3]:
# Collect the URL of every constituency result page linked from the results table.
# Only the first <a> of each table row is considered; rows without a link are skipped.
BASE_URL = "https://ecp.gov.pk/"
list_urls = []
for table_row in soup.find_all('tr'):
    anchor = table_row.find('a')
    # An <a> without an href attribute yields None; skip it instead of
    # crashing on string concatenation.
    href = anchor.get('href') if anchor is not None else None
    if href:
        # urljoin resolves relative hrefs against the site root and leaves
        # absolute hrefs untouched (plain "+" would mangle those).
        list_urls.append(urllib.parse.urljoin(BASE_URL, href))

In [4]:
# Let's have a peek at the first five URLs in the list
list_urls[0:5]

['https://ecp.gov.pk/ConstResult.aspx?Const_Id=NA-1 &type=NA',
 'https://ecp.gov.pk/ConstResult.aspx?Const_Id=NA-2 &type=NA',
 'https://ecp.gov.pk/ConstResult.aspx?Const_Id=NA-3 &type=NA',
 'https://ecp.gov.pk/ConstResult.aspx?Const_Id=NA-4 &type=NA',
 'https://ecp.gov.pk/ConstResult.aspx?Const_Id=NA-5&type=NA']

In [5]:
def scrap_first_table_content(soup):
    """Extract the summary figures from the first table of a constituency page.

    According to the notebook, this table lists registered votes, votes polled,
    valid votes, rejected votes and turnout, one entity per row.

    Parameters
    ----------
    soup : parsed page exposing ``select_one`` (a BeautifulSoup document).

    Returns
    -------
    list
        For every row that has a second ``<td>``, the first child of the
        ``<span>`` nested in that cell's ``<p>``, in row order.
    """
    summary_table = soup.select_one("table:nth-of-type(1)")
    # The value of each entity sits in the second column; rows lacking that
    # column produce None and are filtered out below.
    value_cells = (
        table_row.select_one("td:nth-of-type(2)")
        for table_row in summary_table.find_all('tr')
    )
    return [
        cell.find('p').find('span').contents[0]
        for cell in value_cells
        if cell is not None
    ]

In [7]:
def _first_paragraph_child(cell):
    """Return the first child of the first <p> inside ``cell``, or None if there is none."""
    if cell is None:
        return None
    paragraph = cell.find('p')
    if paragraph is None or not paragraph.contents:
        return None
    return paragraph.contents[0]


def scrap_second_table_content(soup, first_table_list, district=None, constituency=None,
                               list_of_tuples=None):
    """Append one record per candidate found in the page's second table.

    Parameters
    ----------
    soup : parsed constituency page exposing ``select_one`` (a BeautifulSoup document).
    first_table_list : list
        Summary values returned by ``scrap_first_table_content``. Per the
        notebook's description the order is registered votes, votes polled,
        valid votes, rejected votes, turnout; indices 0-4 are read.
    district, constituency : optional
        Values stored in the first two fields of every record. When omitted,
        the notebook-level variables of the same name are used, which keeps
        existing two-argument calls working.
    list_of_tuples : list, optional
        Accumulator the records are appended to. When omitted, the
        notebook-level ``list_of_tuples`` is used.

    Returns
    -------
    list
        The accumulator (same object that was appended to). Each record is
        ``(district, constituency, candidate_name, party, votes,
        first_table_list[2], first_table_list[3], first_table_list[1],
        first_table_list[0], first_table_list[4])``.

    Raises
    ------
    KeyError
        If an omitted argument has no notebook-level variable to fall back on.
    IndexError
        If a record is built while ``first_table_list`` has fewer than five items.
    """
    # Fall back to the notebook namespace only for values the caller did not supply.
    notebook_scope = globals()
    if district is None:
        district = notebook_scope["district"]
    if constituency is None:
        constituency = notebook_scope["constituency"]
    if list_of_tuples is None:
        list_of_tuples = notebook_scope["list_of_tuples"]

    second_table = soup.select_one("table:nth-of-type(2)")
    if second_table is None:
        # Page without a candidate table: nothing to add.
        return list_of_tuples

    for row in second_table.find_all('tr'):
        candidate_name = _first_paragraph_child(row.select_one("td:nth-of-type(1)"))
        party = _first_paragraph_child(row.select_one("td:nth-of-type(2)"))
        votes = _first_paragraph_child(row.select_one("td:nth-of-type(3)"))
        # Rows missing any of the three cells (or their <p> content) are not
        # candidate rows and are skipped.
        if candidate_name is None or party is None or votes is None:
            continue
        list_of_tuples.append((
            district, constituency, candidate_name, party, votes,
            first_table_list[2], first_table_list[3], first_table_list[1],
            first_table_list[0], first_table_list[4],
        ))
    return list_of_tuples

## Time for some heavy lifting

In [8]:
# The site only serves constituency pages to a client that has visited the
# primary results page first; otherwise it redirects to a page we are not
# interested in. Visiting primary_url once through a Session sets that up.
primary_url = "https://ecp.gov.pk/ResultDetails.aspx?EleId=1&Election=General%20Election%2018%20Feb%202008"
session = requests.Session()
temporary_result = session.get(primary_url)

# Walk every constituency link and accumulate one tuple per candidate
list_of_tuples = []
for url_item in list_urls:
    page = session.get(url_item)
    soup = BeautifulSoup(page.text, 'html.parser')

    # The first <p><span> on the page is the heading: the constituency code is
    # the text before the first space, the district name sits in parentheses.
    heading = soup.find('p').find('span').contents[0]
    open_paren = heading.find("(")
    close_paren = heading.find(")")
    district = heading[open_paren + 1:close_paren]
    constituency = heading.partition(" ")[0]

    # First table: registered votes, votes polled, valid votes, rejected votes and turnout
    first_table_list = scrap_first_table_content(soup)
    # Second table: per-candidate results; the helper reads district and
    # constituency from this cell's variables and appends to list_of_tuples.
    list_of_tuples = scrap_second_table_content(soup, first_table_list)

In [9]:
# Inspect the last five scraped records
list_of_tuples[-5:]

[('Kech-cum-Gwadar.',
  'NA-272',
  'Dr. Muhammad Haider Baloch',
  'Pakistan Peoples Party Parliamentarians',
  '3514',
  '107930',
  '3992',
  '106936',
  '316766',
  '33.75 %'),
 ('Kech-cum-Gwadar.',
  'NA-272',
  'Mufti Ahtisham-ul-Haq Asia Abadi',
  'MUTTHIDA\xa0MAJLIS-E-AMAL\xa0PAKISTAN',
  '1237',
  '107930',
  '3992',
  '106936',
  '316766',
  '33.75 %'),
 ('Kech-cum-Gwadar.',
  'NA-272',
  'Syed Sher Jan. (R)',
  'Independent',
  '1520',
  '107930',
  '3992',
  '106936',
  '316766',
  '33.75 %'),
 ('Kech-cum-Gwadar.',
  'NA-272',
  'Yaqoob Bizenjo.',
  'Balochistan National Party (Awami)',
  '61655',
  '107930',
  '3992',
  '106936',
  '316766',
  '33.75 %'),
 ('Kech-cum-Gwadar.',
  'NA-272',
  'Zubeda Jalal',
  'Independent',
  '33564',
  '107930',
  '3992',
  '106936',
  '316766',
  '33.75 %')]

In [10]:
# Let's see the number of records in our dataset (one tuple per candidate row scraped)
len(list_of_tuples)

2316

In [11]:
# Seems like you don't like tuples; go ahead and make the DataFrame near and dear to your heart.
# Column order mirrors the tuple layout: the district name lands in 'Seat' and
# the constituency code in 'ConstituencyTitle'.
columns = [
    'Seat',
    'ConstituencyTitle',
    'CandidateName',
    'Party',
    'Votes',
    'TotalValidVotes',
    'TotalRejectedVotes',
    'TotalVotes',
    'TotalRegisteredVoters',
    'Turnout',
]
df = pd.DataFrame(list_of_tuples, columns=columns)

In [None]:
# Want to save the data? Specify the output path here.
# The context manager writes and closes the workbook on exit; it replaces the
# explicit writer.save() call, which was removed in pandas 2.0. sheet_name is
# passed by keyword because positional use is deprecated in recent pandas.
with pd.ExcelWriter('../NA13.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')