In [1]:
from pyquery import PyQuery as pq
import urllib2
import requests
import pandas as pd
import re
import time
import numpy as np
import json
from time import sleep

###1. Define URLs:###

In [2]:
# Root URL of each Lok Sabha election's section on myneta.info, keyed by year.
base_2004 = "http://myneta.info/loksabha2004/"
base_2009 = "http://myneta.info/ls2009/"
base_2014 = "http://myneta.info/ls2014/"
base_urls = {'2004': base_2004, '2009': base_2009, '2014': base_2014}

# The winners listing of every election is the same query on that election's root.
_winners_query = "index.php?action=show_winners&sort=default"
ls_2004 = base_2004 + _winners_query
ls_2009 = base_2009 + _winners_query
ls_2014 = base_2014 + _winners_query
url_list = {'2004': ls_2004, '2009': ls_2009, '2014': ls_2014}

###2. Define key functions to get data ###

#### The following function returns a dataframe of winners within each state ####

In [3]:
def process_winner_link(link, year):
    """Scrape one winners page into a DataFrame.

    Downloads ``link``, selects the third ``<table>`` of the page and turns
    every direct ``<tr>`` child of that table into one record.

    Parameters
    ----------
    link
        URL of the winners page to download.
    year
        Value stored in the ``year`` column of every record.

    Returns
    -------
    pandas.DataFrame
        One row per table row. Besides ``year``, the cells are mapped by
        position to ``idnum``, ``name``/``url``, ``constituency``, ``party``,
        ``crim_cases``, ``education``, ``assets`` and ``liabilities``.
        A value is ``None`` when its cell carries no usable text.
    """

    def money(td):
        # An element's .text is None when the cell has no leading text; the
        # original called .replace() on it unconditionally, so a single such
        # cell aborted the whole scrape with an AttributeError.
        raw = td.text
        if raw is None:
            return None
        return raw.replace(',', '').replace('Rs', '').lstrip()

    def text_or_span(td):
        # Prefer the cell's own text; otherwise use the text of its first
        # <span>, if there is one.
        if td.text is not None:
            return td.text
        span = td.find('span')
        return span.text if span is not None else None

    html = requests.get(link)
    doc = pq(html.content)
    trs = doc('table').eq(2).children('tr')

    datalist = []

    for tr in trs:
        winner_dict = {'year': year}
        # Cells are identified purely by their position within the row.
        for i, td in enumerate(tr.findall('td')):
            if i == 0:
                winner_dict['idnum'] = td.text
            elif i == 1:
                # NOTE(review): the name is read from the second <a> of the
                # cell, so the cell is assumed to hold at least two anchors.
                winner_dict['name'] = td.findall('a')[1].text
                winner_dict['url'] = pq(td)('a').attr.href
            elif i == 2:
                winner_dict['constituency'] = td.text
            elif i == 3:
                winner_dict['party'] = td.text
            elif i == 4:
                winner_dict['crim_cases'] = text_or_span(td)
            elif i == 5:
                winner_dict['education'] = td.text
            elif i == 6:
                winner_dict['assets'] = money(td)
            elif i == 7:
                winner_dict['liabilities'] = money(td)

        datalist.append(winner_dict)

    return pd.DataFrame(datalist)
    

#### The following function returns a dataframe of constituencies within a state ####

In [4]:
def get_constituents(year, thelink):
    html = requests.get(thelink)
    doc = pq(html.content)
    trs = doc('table').eq(2).find('tr')
    datalist=[]
    print ""
    print "Number of constituent: %s :" % len(trs),
    for tr in trs:
        new_state = pq(tr)('th').text()
        if new_state != "":
            print "!",
            curr_state = new_state 
            #print "state: %s" % curr_state
        else:
            print ".",
            districts = pq(tr)('td')
            if districts != "":
                for x in districts:
                    dist_name = pq(x)('a').text()
                    if dist_name != "":
                        district_dict = {'year':year, 'state':curr_state}
                        district_dict['district'] = dist_name
                        district_dict['thelink'] = pq(x)('a').attr.href
                        datalist.append(district_dict)

    return pd.DataFrame(datalist)


#### The following function returns a dataframe of candidates from a constituency ####

In [5]:
def get_candidates(constituency, state, thelink, year):
    """Scrape the candidate table of one constituency page.

    Downloads ``thelink``, selects the third ``<table>`` of the page and
    turns every direct ``<tr>`` child except the first one (presumably the
    header row) into one record.

    Parameters
    ----------
    constituency, state, year
        Values copied unchanged into the ``Constituency``, ``State`` and
        ``Year`` columns of every record.
    thelink
        URL of the constituency page to download.

    Returns
    -------
    pandas.DataFrame
        One row per candidate row. Cells are mapped by position to ``Name``/
        ``Link``, ``Party``, ``Criminal_Cases``, ``Education``, ``Age``,
        ``Assets_Rs`` and ``Liabilities_Rs``. ``Winner`` is 1 when the first
        cell contains a ``<font>`` whose text is exactly "Winner", else 0.
        The two amount columns are ``None`` when the cell has no leading text.
    """
    # Columns whose value is the full text of the cell, keyed by cell position.
    text_columns = {1: 'Party', 2: 'Criminal_Cases', 3: 'Education', 4: 'Age'}
    # Columns holding a rupee amount, taken from the cell's leading text only.
    amount_columns = {5: 'Assets_Rs', 6: 'Liabilities_Rs'}

    def amount(td):
        # An element's .text is None when the cell has no leading text; the
        # original called .replace() on it unconditionally, so one such cell
        # raised AttributeError and lost the whole constituency.
        raw = td.text
        if raw is None:
            return None
        return raw.replace("Rs", '').replace(',', '').lstrip()

    thepage = requests.get(thelink)
    doc = pq(thepage.content)

    trs = doc('table').eq(2).children('tr')

    datalist = []

    for tr in trs[1:]:
        candidate_dict = {'Year': year, 'State': state,
                          'Constituency': constituency, 'Winner': 0}
        for i, td in enumerate(pq(tr)('td')):
            cell = pq(td)
            if i == 0:
                anchor = cell('a')
                candidate_dict['Name'] = anchor.text()
                candidate_dict['Link'] = anchor.attr.href
                if cell('font').text() == "Winner":
                    candidate_dict['Winner'] = 1
            elif i in text_columns:
                candidate_dict[text_columns[i]] = cell.text()
            elif i in amount_columns:
                candidate_dict[amount_columns[i]] = amount(td)
        datalist.append(candidate_dict)
    return pd.DataFrame(datalist)

###3. Put it all together and get dataframes###

Get all the winners

In [6]:
# Scrape the winners page of every election year, then stack the yearly frames.
winner_df_list = [
    process_winner_link(winners_url, election_year)
    for election_year, winners_url in url_list.items()
]

winner_df = pd.concat(winner_df_list)

In [7]:
# Non-null count per column, as a quick completeness check of the scrape.
print(winner_df.count())

# Persist the winners table next to the notebook.
winner_df.to_csv('winners.csv')

# The preview has to be the cell's last expression to be rendered; it used to
# sit in the middle of the cell, where its result was silently discarded.
winner_df.head(3)

assets          1575
constituency    1575
crim_cases      1575
education       1575
idnum           1575
liabilities     1575
name            1575
party           1575
url             1575
year            1575
dtype: int64


Get all the constituencies

In [8]:
# Collect the constituency links of every election year, then stack them.
constituency_list = [
    get_constituents(election_year, root_url)
    for election_year, root_url in base_urls.items()
]

constituency_df = pd.concat(constituency_list)


Number of constituent: 226 : ! . . ! . . . . . . . . ! . . ! . . . . . ! . . . . . . . . . . . ! . . ! . . . . ! . . ! . . ! . . ! . . . . . . . . ! . . . . ! . . ! . . . ! . . . . . ! . . . . . . . . ! . . . . . . ! . . ! . . . . . . . . . ! . . . . . . . . . . . . . ! . . ! . . ! . . ! . . ! . . . ! . . . . . . . ! . . ! . . . . . ! . . . . . . . . ! . . ! . . . . . . . . . . . ! . . . . . . ! . . ! . . . . . . . . . . . . . . . . . . . . . ! . . . ! . . . . . . . . . . . . 
Number of constituent: 223 : ! . . ! . . . . . . . . . . . . ! . . ! . . . . . ! . . . . . . . . . . . ! . . ! . . . . ! . . ! . . ! . . ! . . . . . . . . ! . . . . ! . . ! . . . ! . . . . . ! . . . . . . . . ! . . . . . . ! . . ! . . . . . . . . . ! . . . . . . . . . . . . . ! . . ! . . ! . . ! . . ! . . . ! . . . . . . . ! . . ! . . . . . ! . . . . . . . . ! . . ! . . . . . . . . . . . ! . . ! . . . . . . . . . . . . . . . . . . . . . ! . . . ! . . . . . . . . . . . . 
Number of constituent: 223 : ! . . ! . . 

In [9]:
# Non-null count per column, followed by a preview of the first three rows.
print(constituency_df.count())
constituency_df.iloc[:3]

Unnamed: 0,district,state,thelink,year
0,ANDAMAN & NICOBAR ISLANDS,ANDAMAN & NICOBAR ISLANDS,index.php?action=show_candidates&constituency_...,2014
1,AMALAPURAM,ANDHRA PRADESH,index.php?action=show_candidates&constituency_...,2014
2,ANAKAPALLE,ANDHRA PRADESH,index.php?action=show_candidates&constituency_...,2014


Get the candidates for each constituency

In [19]:
candidates_list = []

for index, row in constituency_df.iterrows() :
#    print "%s, %s, %s, %s" % (row['district'], row['state'], base_urls[row['year']]+row['thelink'], row['year'])
    temp_df = get_candidates(row['district'], row['state'], base_urls[row['year']]+row['thelink'], row['year'])
    candidates_list.append(temp_df)
    if index % 10 == 0:
        print ".",
    #sleep(1)
candidates_df = pd.concat(candidates_list)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [37]:
# Non-null count per column, then the first three of the last hundred rows.
print(candidates_df.count())
candidates_df.tail(100).head(3)

Age               19134
Assets_Rs         19134
Constituency      19134
Criminal_Cases    19134
Education         19134
Liabilities_Rs    19134
Link              19134
Name              19134
Party             19134
State             19134
Winner            19134
Year              19134
dtype: int64


Unnamed: 0,Age,Assets_Rs,Constituency,Criminal_Cases,Education,Liabilities_Rs,Link,Name,Party,State,Winner,Year
7,52,1252940,Katwa,0,Graduate,1850000,candidate.php?candidate_id=5320,Saifuddin Choudhury,PDS,WEST BENGAL,0,2004
8,44,Nil,Katwa,0,Not Given,0,candidate.php?candidate_id=5324,Salil Dutta,CPI(ML)(L),WEST BENGAL,0,2004
9,46,12000,Katwa,0,12th Pass,0,candidate.php?candidate_id=5323,Shyamsundar Das,IND,WEST BENGAL,0,2004


###4. Download page of candidates and save as json###

In [27]:
pages_dict = {}

for index, row in candidates_df[['Link','Year']].iterrows() :
    html = requests.get(base_urls[row['Year']]+row['Link'])
    pages_dict[base_urls[row['Year']]+row['Link']] = html.content
    if index % 10 == 0.0:
        if index % 100 == 0.0:
            print "#",
        else:
            print ".",

    #sleep(1)
    

# . # . # # . # . # . # # . # . # . # . # . . # . # . # . # . # . # . # . # . # # # . # . # . . # # # # # . # . # # . # # . # . # # . # . # # . # # . # . # . # . # . # . # . # . # . # . # . # . # . # . # . # . . # . # # . # . # # . # . . # . . # . # . # . # . # . # . # . # . # . # . # . # . # . # . # . . # . # . # # . . # . . # . # # . . # . . # . # . . . # . # . # . # # # . # . # # . # . # . # # . # . # # # . # . . # # # . # . # . # . # . # . # . # # # . # # # . # . . # . . # . . # . . . . # . . # . . # . # . # . . # # . # # # . # . # . # # . # . # . # . . . # . # . # . # . # . # . # . # # . # . # . . # . # . # . . # . # . # . . # . # . # . . # . # . # . # . # . # . # . # . # # . # . # . . # . # . # . # . # . # . # . # # . # . # . # . # . # . # . # . # # . # . # # # . # . # # . # . # . # . # # . # # # . . # . # # . # # . # . . # # . . # . # . # . # # # . # . # # # . # # . # . # . # . # . # . # . # # . # . . # # . . . # . . # . # . # . # . # # . # . # . . # . # . . # . # . # . # . . # 

In [29]:
# Write the downloaded pages to disk as a single JSON object (URL -> page).
# `encoding` tells json how to decode the stored byte strings on the way out.
with open('candidate_pages.json', 'w') as pages_file:
    json.dump(pages_dict, pages_file, encoding='latin1')

In [36]:
# List the URLs whose stored page is shorter than 5000 characters
# (presumably failed or stub downloads - TODO confirm the threshold).
short_page_urls = (
    page_url for page_url, page in pages_dict.items() if len(page) < 5000
)
for page_url in short_page_urls:
    print(page_url)