In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import numpy as np
from urllib.parse import urljoin
import xlsxwriter
from openpyxl import load_workbook
import re

In [2]:
# load overview over municipalities
df = pd.read_excel("../data/NRW/nrw_mun.xlsx")

# select rows and columns
muns = df.loc[df['Wer']=='Edith', ['Gemeinde', 'Gemeindeschlüssel']]
muns.columns = ['mun_name', 'mun_code']

In [3]:
def get_url1(name):
    e.send_keys(name)
    e.send_keys(Keys.ENTER)
    soup = BeautifulSoup(d.page_source)
    d.find_element_by_id('suchfeld').clear()
    time.sleep(0.7)
    links = soup.find('tbody').find_all('a') 
    return [link.get('href') for link in links if 'Votemanager' not in link]

In [4]:
# set up webdriver and search field element
# d=webdriver.Chrome("../chromedriver.exe")
# d.get('https://wahlen.votemanager.de/#')
# e = d.find_element_by_id('suchfeld')

# # search for each municipality and get the url to the municipality specific overview page
# muns['url'] = muns['mun_name'].apply(get_url1)

In [5]:
# muns.to_pickle("./temp.pkl")
muns = pd.read_pickle("./temp.pkl")

In [6]:
muns.head()

Unnamed: 0,mun_name,mun_code,url
1,Duisburg,5112000,[http://wahlergebnis.duisburg.de/05112000/inde...
5,Mülheim an der Ruhr,5117000,[https://wahlpraesentation.muelheim-ruhr.de/05...
9,Wuppertal,5124000,[https://wahlen.wuppertal.de/05124000/index.html]
13,Goch,5154016,[]
17,Kevelaer,5154032,[]


In [7]:
# clean results:

# 1. export municipalities that weren't found on votemanager
writer = pd.ExcelWriter('../data/NRW/nrw_mun_edith.xlsx', engine='xlsxwriter')

muns[muns.astype(str)['url']=='[]'][['mun_name', 'mun_code']].to_excel(writer, index=False)

muns = muns[muns.astype(str)['url']!='[]'].reset_index(drop=True)

writer.save()

# 2. look at municipalities were 2 results were found
muns['results'] = muns['url'].str.len()

muns['url1'] = np.nan

for row in muns.itertuples():
    if row[4] == 1:
        muns.loc[row[0], 'url1'] = row[3]
    if row[4] > 1:
        muns.loc[row[0], 'url1'] = [link for link in row[3] if str(row[2]) in link]
        
muns.drop(columns=['url', 'results'], inplace=True)

In [8]:
# now, direct to the elections on 25-05-2014
def get_url2(url1):
    """ To get the urls specific to the Ratswahl elections.
    It takes as input the overview url for each municipality from wahlen.regioit
    and returns a list of all Ratswahl-url from its html text.
    """
    base = url1
    r = requests.get(url1)
    soup = BeautifulSoup(r.text)
    
    kom_row  = [ r for r in soup.find_all('tr') if 'Kommunalwahlen' in r.text ]
    
    res = [link.get('href') for link in BeautifulSoup(str(kom_row)).find_all('a') if '25.05.2014' in link]
    
    if res:
        href=res.pop()
        return urljoin(base,href)
    else:
        return np.nan

In [9]:
muns['url2'] = muns['url1'].apply(get_url2)

In [13]:
# remember to safe those with to excel writer!!!
writer = pd.ExcelWriter('../data/NRW/nrw_mun_edith.xlsx', engine='openpyxl')
# try to open an existing workbook
writer.book = load_workbook('../data/NRW/nrw_mun_edith.xlsx')
# copy existing sheets
writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
# read existing file
reader = pd.read_excel('../data/NRW/nrw_mun_edith.xlsx')
# write out the new sheet
muns[muns['url2'].isnull()][['mun_name', 'mun_code']].to_excel(writer,index=False,header=False,startrow=len(reader)+1)

writer.close()

muns.dropna(axis=0, inplace=True)

In [14]:
# From the overview url set up url for the overview of Wahlbezirke for Ratswahl:
# muns['url3'] = 

def get_url3(url2): 
    r = requests.get(url2) 
    soup = BeautifulSoup(r.text)
    base = url2
    hrefs = [link.get('href') for link in soup.find_all('a')]
    
    matches = ['Ratswahl', 'Landratswahl']
    
    res=[href for href in hrefs if any(x in href for x in matches)]
    
    if res:
        s = res.pop()
        href = re.sub('Gemeinde_.*','Uebersicht_wahlb.html', s)
        return urljoin(base,href)
    else:
        return np.nan

In [15]:
muns['url3'] = muns['url2'].apply(get_url3)

In [16]:
# remember to safe those with to excel writer!!!
writer = pd.ExcelWriter('../data/NRW/nrw_mun_edith.xlsx', engine='openpyxl')
# try to open an existing workbook
writer.book = load_workbook('../data/NRW/nrw_mun_edith.xlsx')
# copy existing sheets
writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
# read existing file
reader = pd.read_excel('../data/NRW/nrw_mun_edith.xlsx')
# write out the new sheet
muns[muns['url3'].isnull()][['mun_name', 'mun_code']].to_excel(writer,index=False,header=False,startrow=len(reader)+1)

writer.close()

muns.dropna(axis=0, inplace=True)

In [17]:
# get all the Wahlbezirke

def p2f(x):
    return float(x.strip(' %').replace(',','.'))/100

cols = ['votes', 'share', 'candidate', 'party', 'district', 'number', 'mun', 'mun_code']
df_final = pd.DataFrame(columns=cols)

for i, row in muns.iterrows():
    display(i, muns.loc[i,'mun_name'])
    
    base = muns.loc[i, 'url2']
    url3 = muns.loc[i, 'url3']

    href_bezirke = [row.get('href') for row in BeautifulSoup(requests.get(url3).text).find('table').find_all('a')[:-1]]
    
    url_bezirke = [urljoin(base,href) for href in href_bezirke]
    
    for x in range(len(url_bezirke)): 
        url = url_bezirke[x]
        soup = BeautifulSoup(requests.get(url).text)
        results = pd.read_html(str(soup.find_all('table')[1]))[0].iloc[:, 1:]
        results[['candidate','party']]  = results.iloc[:,0].str.split(',', expand=True, n=1)
        results.drop(results.columns[0], axis=1, inplace=True)
        
        re_searchnum = re.search('[0-9]+_(.*)\.html',href_bezirke[x])
                                 
        if (re_searchnum is not None):
            results['district'] = str(re_searchnum.group(1)).replace('__',' ')
            results['number'] = int(re.findall('[0-9]+',href_bezirke[x]).pop())
        else:
            results['district'] = str(re.search('Wahlbezirk_(.*)\.html', href_bezirke[x]).group(1))
            results['number'] = x+1

        results['mun'] = muns.loc[i,'mun_name']
        results['mun_code'] = muns.loc[i,'mun_code']
        results.columns = cols
        results['share']=results['share'].apply(p2f)        
        
        df_final = df_final.append(results)
    
    

0

'Duisburg'

1

'Mülheim an der Ruhr'

2

'Wuppertal'

3

'Hilden'

4

'Ratingen'

5

'Grevenbroich'

6

'Meerbusch'

8

'Aachen'

9

'Herzogenrath'

10

'Stolberg (Rhld.)'

11

'Heimbach'

12

'Kreuzau'

13

'Nideggen'

14

'Vettweiß'

15

'Elsdorf'

16

'Kerpen'

17

'Blankenheim'

18

'Kall'

19

'Weilerswist'

21

'Übach-Palenberg'

22

'Bergneustadt'

23

'Lindlar'

24

'Radevormwald'

25

'Wipperfürth'

SSLError: HTTPSConnectionPool(host='wahlen.kdvz-frechen.de', port=443): Max retries exceeded with url: /civitec/kwew2014/05374052/html5/Ratswahl_NRW_18_Wahlbezirk_nordwestl_Stadtgebiet.html (Caused by SSLError(SSLError("bad handshake: SysCallError(10060, 'WSAETIMEDOUT')")))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    display(muns)