# Scraping

### Mètodes i utilitats

In [1]:
# http://stackoverflow.com/questions/7100125/storing-python-dictionaries

import json
from bson import json_util
import yaml

# http://api.mongodb.org/python/1.10.1/api/bson/json_util.html
# Mètodes per grabar y carregar fitxers en format json
def save_dict_json(dict, filename):
    with open(filename, 'wb') as fp:
        json.dump(dict, fp, default=json_util.default)

def load_dict_json(filename):
    with open(filename, 'rb') as fp:
        return json.load(fp, object_hook=json_util.object_hook)

# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
def load_dict_yaml(filename):
    with open(filename, 'rb') as fp:
        return yaml.load(fp)
    
# Mètode per grabar un fitxer de texte
def save_text_file(text, file_name, encoding = 'utf-8'):
    with open(file_name, "w") as text_file:
        if encoding:
            text_file.write(text.encode(encoding))
        else:
            text_file.write(text)
            
# Mètode per grabar un fitxer de texte
def save_list_text_file(text_list, file_name):
    with open(file_name, "w") as text_file:
        for item in text_list:
            text_file.write("%s\n" % item)
        
# Mètode per carregar un fitxer de texte
def load_text_file(file_name, encoding = 'utf-8'):
    with open(file_name, "r") as text_file:
        if encoding:
            return text_file.read().decode(encoding)
        else:
            return text_file.read()
        
# Mètode per eliminar caracters incorrectes als noms de fitxers (potser millor utilitzar: https://pypi.python.org/pypi/goldfinch/0.4)
invalid_filename_chars_windows = '\/:*?"<>|'
def del_invalid_chars(value, deletechars):
    for c in deletechars:
        value = value.replace(c,'')
    return value;

import datetime

# Mètode per cambiar de format de data de format_1 a format_2
def format_date(strFecha, format_1, format_2):
    dtDate = datetime.datetime.strptime(strFecha, format_1)
    return dtDate.strftime(format_2)

# Mètode que ens diu si una data cau en dimecres
# https://docs.python.org/2/library/datetime.html#datetime.date.weekday
def isWednesday(my_date):
    if my_date.date().weekday() == 2:  # 0: Monday, 1: Tuesday, 2: Wednesday, ...
        return True
    else:
        return False
    
# print "24/03/2015", "isWednesday:", isWednesday("24/03/2015")
# print "11/03/2015", "isWednesday:", isWednesday("11/03/2015")

In [2]:
def get_text_between( s, first, last, include_limits = False, first_ocurrences = True ):
    try:
        if first_ocurrences:
            if include_limits:
                start = s.index( first )
                end = s.rindex( last, start ) + len(last)
                return s[start:end]
            else:
                start = s.index( first ) + len( first )
                end = s.rindex( last, start ) 
                return s[start:end]
        else: 
            # look for last ocurrences of first and last
            if include_limits:
                start = s.rindex( first )
                end = s.index( last ) + len(last)
                return s[start:end]
            else:
                start = s.rindex( first ) + len( first )
                end = s.index( last ) 
                return s[start:end]            
    except ValueError:
        return ""
    
def remove_text_between( s, first, last, include_limits = False ):
    try:
        if include_limits:
            start = s.index( first ) + len( first )
            end = s.index( last, start ) 
            return s[:start] + s[end:] 
        else:
            start = s.index( first )
            end = s.index( last, start ) + len(last)
            return s[:start] + s[end:]
    except ValueError:
        return ""
    
def remove_spaces_and_newline(s):
    return re.sub(r'\s+', ' ', s.replace('\n',''))

In [3]:
# http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python/22776#22776
# http://blog.radevic.com/2012/07/python-download-url-to-file-with.html

import urllib2
import sys

# Mètode per descarregar fitxers d'una url
def download_file(url, file_name, path = "", verbose = False):
    u = urllib2.urlopen(url)
    f = open(path+file_name, 'wb')
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    if verbose:
        print("Downloading: {0} Bytes: {1}".format(url, file_size))

    file_size_dl = 0
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break

        file_size_dl += len(buffer)
        f.write(buffer)
        
        if verbose:
            p = float(file_size_dl) / file_size
            status = "\r{0} bytes  [{1:.2%}]".format(file_size_dl, p)
            status = status + chr(8)*(len(status)+1)
            sys.stdout.write(status)
            sys.stdout.flush()

            # done = int(50 * file_size_dl / file_size)
            # sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )    
            # sys.stdout.flush()

    f.close()
    print("\n")

# url = 'http://www.pp.es/sites/default/files/documentos/pr_den_2015.pdf'
# file_name = url.split('/')[-1]    
# download_file(url, file_name,'../data/pp/')

## Scraping de la web de documents del Congrés

#### Mètodes per cercar i afegir diputats al diccionari, i mètodes per el scraping.

In [4]:
import unicodedata
import sys

# Mètode que treu els accents a un string (en format unicode)
def remove_accents(input_str):
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

#
def get_group(member_group_dict, legislature):
    if legislature:
        for key in member_group_dict:
            if legislature in member_group_dict[key]:
                return key
    else:
        legislature = 0
        for key in member_group_dict:
            if legislature < max(member_group_dict[key]):
                legislature = max(member_group_dict[key])
        return get_group(member_group_dict, legislature)

#
def get_member_group_by_id(diputados_dict, id, legislature):
    if len(diputados_dict[id]['group']) == 1:
        return diputados_dict[id]['group'].keys()[0]
    else:
        return get_group(loaded_diputados_dict[id]['group'], legislature)

def search_diputado_in_dict(my_dict, dip_surname, dip_name = "", verbose=False):
    result = []
    
    if dip_name != "":
        for key in my_dict:
            if remove_accents(dip_surname.upper()) == my_dict[key]['surname'] and remove_accents(dip_name.upper()) == my_dict[key]['name']:
                result.append(key)            
    else:
        for key in my_dict:
            if remove_accents(dip_surname.upper()) == my_dict[key]['surname']:
                result.append(key)

    if len(result) == 0:
        if verbose:
            print "ERROR en search_diputado_in_dict. No s'ha trobat:", dip_surname, ",", dip_name
        return None
    elif len(result) == 1:
        return result[0]
    else: # len(result) > 1:
        if verbose:
            str_result = ""
            for i in result:
                str_result += str(i) + ": " + my_dict[i]['surname'] + ", " + my_dict[i]['name'] + "\n"
            print "ERROR en search_diputado_in_dict. S'ha trobat més d'un resultat:\n", str_result.encode('utf-8')
        return result[0]
        
''' '''
loaded_diputados_dict = load_dict_json('diputados_dict.json')

s = u"Aznar López"
n = u"José María"
id = search_diputado_in_dict(loaded_diputados_dict,s,n, True)
if id:
    print id, ":", loaded_diputados_dict[id]
    print "group:", get_member_group_by_id(loaded_diputados_dict, id, 10)
else:
    print "ERROR en search_diputado_in_dict. No s'ha trobat:", s, n

s = u"Catalá Polo"
n = u""
id = search_diputado_in_dict(loaded_diputados_dict,s,n, True)
if id:
    print id, ":", loaded_diputados_dict[id]
    print "group:", get_member_group_by_id(loaded_diputados_dict, id, 10)
else:
    print "ERROR en search_diputado_in_dict. No s'ha trobat:", s, n


36 : {u'surname': u'AZNAR LOPEZ', u'group': {u'GP': [5, 6, 7]}, u'r_surname': u'Aznar L\xf3pez', u'r_name': u'Jos\xe9 Mar\xeda', u'name': u'JOSE MARIA'}
group: GP
1374 : {u'group': {u'GP': [10]}, u'surname': u'CATALA POLO', u'r_surname': u'Catal\xe1 Polo', u'r_name': u'Rafael', u'name': u'RAFAEL'}
group: GP


In [4]:
def add_member_to_dict(members_dict, surname, name, group, legislature, num_member):
    f_surname = remove_accents(unicode(surname).upper())
    f_name = remove_accents(unicode(name).upper())
    members_dict[num_member] = {'surname':f_surname, 'r_surname':surname, 'name':f_name, 'r_name':name, 'group':{group:[legislature]}}


def secure_add_member_to_dict(members_dict, surname, name, group, legislature, verbose = False):
    # Cerca del diputat al diccionari, si ja hi és s'actualitza la info. En cas contrari s'afegeix.
    id = search_diputado_in_dict(members_dict, surname, name)
    if id:
        if group in members_dict[id]['group']:
            if legislature in members_dict[id]['group'][group]:
                if verbose:
                    print "WARN: already in dict:", surname, "|", name, "|", group, "|", legislature
            else:
                members_dict[id]['group'][group].append(legislature)
        else:
            members_dict[id]['group'].update({group:[legislature]})
    else:
        add_member_to_dict(members_dict, surname, name, group, legislature, len(members_dict))


def scrap_webpage(url_page, my_dict, legislature, start_index = 0, verbose = False):
    
    print "Starting scraping..."

    legislatura = legislature
    web_page = start_index

    max_web_page = 50
    delay = 5
    element_id = "PIE"
    continue_scraping = True
    
    while continue_scraping :

        scrap_url_page = url_page.format(web_page)
        browser.get(scrap_url_page)

        # Wait for the page to load
        # http://selenium-python.readthedocs.org/en/latest/waits.html
        try:
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_id)))
            # print "Page is ready!"
        except TimeoutException:
            print "Loading took too much time!"
        except NoSuchElementException:
            print "Element id didn't find: " + element_id        

        html_page=browser.page_source

        if html_page is not None:
            print "Parsing: ..." + scrap_url_page[-80:]

            soup = BeautifulSoup(html_page)

            div_listado = soup.find('div', {'class' : 'listado_1'})
            if div_listado:
                li_tags = div_listado.findAll('li')
                if len(li_tags) > 0 :
                    for li_tag in li_tags:

                        # Obtenció de les dades d'un diputat: group, name[0]->'surname', name[1]->'name'
                        spam_tag = li_tag.find('span')
                        if spam_tag:
                            # Diputados de todas las legislaturas. Cerca amb dos paràmetres: pàgina i idLegislatura
                            # http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados?_piref73_2874067_73_1333049_1333049.next_page=/wc/busquedaAlfabeticaDiputados&paginaActual=1&idLegislatura=4&tipoBusqueda=completo
                            group = get_text_between(spam_tag.getText(),'(',')')
                            link_tag = li_tag.find('a')
                            name = link_tag.getText().split(', ')
                            if len(name) == 0:
                                name = link_tag.getText().split(' ')
                        else:
                            # Diputados que han causado baja:
                            # http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BajasLegAct?_piref73_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=0
                            link_tag = li_tag.find('a')
                            group = get_text_between(link_tag.getText(),'(',')')
                            str_member = link_tag.getText().split(' (')[0]
                            name = str_member.split(', ')
                            if len(name) == 0:
                                name = str_member.split(' ')

                        # S'afegeix el diputat al diccionari
                        secure_add_member_to_dict(my_dict, name[0], name[1], group, legislatura, verbose)
                else:
                    print "No more results"
                    continue_scraping = False
            else:
                print "Tag not found: soup.find('div', {'class' : 'listado_1'})"
                continue_scraping = False
        else:
            print "URL not found:", scrap_url_page
            continue_scraping = False

        web_page += 1

        if web_page > max_web_page:
            print "Reached web_page limit:", web_page
            continue_scraping = False
    
    print "Scraping finished."
    return my_dict

### Obtenim el llistat de tots els diputats del Congrés amb els seus grups polítics.

In [5]:
# Búsqueda amb dos paràmetres: pàgina i idLegislatura
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados?_piref73_2874067_73_1333049_1333049.next_page=/wc/busquedaAlfabeticaDiputados&paginaActual=1&idLegislatura=4&tipoBusqueda=completo

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os
import time
import string

os.environ["PATH"] = 'C:\Users\pablo_000\Documents\P\Data Science\chromedriver' \
    + os.pathsep + 'C:\Users\pfernandezs\Documents\P\Cosas\Data Science\posgrau\chromedriver'

browser = webdriver.Chrome()

diputados_dict = {}
legislatura = 5
url_page = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados?_piref73_2874067_73_1333049_1333049.next_page=/wc/busquedaAlfabeticaDiputados&paginaActual={}&idLegislatura=&tipoBusqueda=completo"

while legislatura < 11:
    
    scraping_url = string.replace(url_page,'idLegislatura=', 'idLegislatura=' + str(legislatura))
    new_diputados_dict = scrap_webpage(scraping_url, diputados_dict, legislatura, 0, verbose = True)
    legislatura += 1

print "Num. diputados:", len(diputados_dict)
# print diputados_dict
print "Saving 'diputados_dict'..."
save_dict_json(diputados_dict, 'diputados_dict.json')

Starting scraping...
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=0&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=1&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=2&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=3&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=4&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=5&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=6&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=7&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=8&idLegislatura=5&tipoBusqueda=completo
Parsing: ...busquedaAlfabeticaDiputados&paginaActual=9&idLegislatura=5&tipoBusqueda=completo
Parsing: ...usquedaAlfabeticaDiputados&paginaActu

A la base de dades no hi apareixen alguns ministres i altres membres del gobern que, en canvi, sí intervenen a les sessions de control del Congrés.  Els afegirem a mà:

In [6]:
#
loaded_diputados_dict = load_dict_json('diputados_dict.json')

# MINISTRO DE JUSTICIA (Catalá Polo)  -->  http://es.wikipedia.org/wiki/Rafael_Catal%C3%A1_Polo
secure_add_member_to_dict(loaded_diputados_dict, u"Catalá Polo", u"Rafael", u"GP", 10, verbose = True)
# MINISTRO DE JUSTICIA (Ruiz-Gallardón Jiménez)
secure_add_member_to_dict(loaded_diputados_dict, u"Ruiz-Gallardón Jiménez", u"Alberto", u"GP", 10, verbose = True)
# MINISTRO DE EDUCACIÓN, CULTURA Y DEPORTE (Wert Ortega)  -->  http://es.wikipedia.org/wiki/Jos%C3%A9_Ignacio_Wert
secure_add_member_to_dict(loaded_diputados_dict, u"Wert Ortega", u"José Ignacio", u"GP", 10, verbose = True)
# MINISTRA DE AGRICULTURA, ALIMENTACIÓN Y MEDIO AMBIENTE (García Tejerina)  -->  http://es.wikipedia.org/wiki/Isabel_Garc%C3%ADa_Tejerina
secure_add_member_to_dict(loaded_diputados_dict, u"García Tejerina", u"Isabel", u"GP", 10, verbose = True)
# MINISTRO DE DEFENSA (Morenés Eulate)
secure_add_member_to_dict(loaded_diputados_dict, u"Morenés Eulate", u"Pedro", u"GP", 10, verbose = True)

print "Num. diputados:", len(loaded_diputados_dict)
# print diputados_dict
print "Saving 'diputados_dict'..."
save_dict_json(loaded_diputados_dict, 'diputados_dict.json')

Num. diputados: 1379
Saving 'diputados_dict'...


Trobem d'altres diputats que no apareixen al dccionari perque es van donar de baixa.

In [7]:
# El diputat Yuste Cabello, Chesús no apareix en el diccionari perque es va donar de baixa el 02/07/2014:
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=71
# Hi han d'altres diputats que es van donar de baixa. Veure: 'diputados que han causado baja':
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BajasLegAct?_piref73_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=0

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os
import time

os.environ["PATH"] = 'C:\Users\pablo_000\Documents\P\Data Science\chromedriver' \
    + os.pathsep + 'C:\Users\pfernandezs\Documents\P\Cosas\Data Science\posgrau\chromedriver'

browser = webdriver.Chrome()

#
loaded_diputados_dict = load_dict_json('diputados_dict.json')
url_page = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BajasLegAct?_piref73_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual={}"

new_diputados_dict = scrap_webpage(url_page, loaded_diputados_dict, 10, 0, verbose = True)

print "\nNum. diputados:", len(new_diputados_dict)
# print new_diputados_dict
print "Saving 'diputados_dict'..."
save_dict_json(new_diputados_dict, 'diputados_dict.json')

Starting scraping...
Parsing: ...3_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=0
Parsing: ...3_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=1
WARN: already in dict: Ruiz-Gallardón Jiménez | Alberto | GP | 10
Parsing: ...3_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=2
Parsing: ...3_2496066_73_2496053_2496053.next_page=/wc/diputadosBajaLegActual&paginaActual=3
No more results
Scraping finished.

Num. diputados: 1399
Saving 'diputados_dict'...


In [8]:
#
loaded_diputados_dict = load_dict_json('diputados_dict.json')
print "Num. diputados:", len(loaded_diputados_dict)

Num. diputados: 1399


### Nova URL amb tots els diputats de totes les legislatures

A la URL http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas trobem: "Listado de diputados en todas las legislaturas".  Farem un nou scrapping per obtenir a part dels seus nom, cognom i grup parlamentari, la seva foto.

In [5]:
def add_member_to_dict_v2(members_dict, surname, name, url_member, url_img_member, url_img_party, party_code, legislature, num_member):
    f_surname = remove_accents(unicode(surname).upper())
    f_name = remove_accents(unicode(name).upper())
    members_dict[num_member] = {'surname':f_surname, 'r_surname':surname, 'name':f_name, 'r_name':name, 
                                'url_member': url_member, 'url_img_member':url_img_member, 'id_img_member':"",
                                'url_img_party':url_img_party, 'id_img_party':"", 'party_code':party_code, 
                                'legislature':legislature}

def secure_add_member_to_dict_v2(members_dict, surname, name, url_member, url_img_member, url_img_party, party_code, legislature, verbose = False):
    # Cerca del diputat al diccionari, si ja hi és s'actualitza la info. En cas contrari s'afegeix.
    id = search_diputado_in_dict(members_dict, surname, name)
    if id:
        if verbose:
            print "WARN: already in dict:", surname, "|", name, "|", party_code
    else:
        add_member_to_dict_v2(members_dict, surname, name, url_member, url_img_member, url_img_party, party_code, legislature, len(members_dict))


def scrap_webpage_v2(url_page, my_dict, start_index = 0, verbose = False):
    
    print "Starting scraping..."
    
    web_congreso = 'http://www.congreso.es'

    web_page = start_index
    scrap_url_page = url_page.format(web_page)
    browser = webdriver.Chrome()
    browser.get(scrap_url_page)
    
    browser_sub = webdriver.Chrome()

    max_web_page = 90 # 87.7 = 2194 diputados / 25 diputados por pagina
    delay = 5
    element_id = "PIE"
    continue_scraping = True
    
    while continue_scraping :

        # Wait for the page to load
        # http://selenium-python.readthedocs.org/en/latest/waits.html
        try:
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_id)))
            # print "Page is ready!"
        except TimeoutException:
            print "Loading took too much time!"
        except NoSuchElementException:
            print "Element id didn't find: " + element_id
            
        html_page=browser.page_source

        if html_page:
            print "Parsing: ..." + scrap_url_page[-80:]

            soup = BeautifulSoup(html_page)
            
            div_listado = soup.find('div', {'class' : 'listado_1'})
            if div_listado:
                li_tags = div_listado.findAll('li')
                if len(li_tags) > 0 :
                    for li_tag in li_tags:

                        # Obtenció de les dades d'un diputat: name[0]->'surname', name[1]->'name', URL de la pàgina personal
                        link_tag = li_tag.find('a')
                        legislature = link_tag['href'].split('=')[-1]
                        url_member = web_congreso + link_tag['href']
                        str_member = link_tag.getText()
                        name = str_member.split(', ')
                        if len(name) == 0:
                            name = str_member.split(' ')
                            
                        #
                        browser_sub.get(url_member)
                        try:
                            WebDriverWait(browser_sub, delay).until(EC.presence_of_element_located((By.ID, element_id)))
                            # print "Page is ready!"
                        except TimeoutException:
                            print "Loading took too much time!"
                        except NoSuchElementException:
                            print "Element id didn't find: " + element_id
                            
                        html_page_sub=browser_sub.page_source
                            
                        if html_page_sub:
                            print "\tParsing: ..." + url_member[-80:]

                            soup_sub = BeautifulSoup(html_page_sub)
                            
                            div_datos_diputado = soup_sub.find('div', {'id' : 'datos_diputado'})
                            if div_datos_diputado:
                                img_member_tag = div_datos_diputado.find_next('img')
                                if img_member_tag:
                                    url_img_member = web_congreso + img_member_tag['src']
                                    img_party_tag = img_member_tag.find_next('img')
                                    if img_party_tag:
                                        url_img_party = web_congreso + img_party_tag['src']
                                    else:
                                        url_img_party = ""
                                    p_party_tag = img_member_tag.find_next('p',{'class' : 'nombre_grupo'})
                                    if p_party_tag:
                                        party_code = p_party_tag.getText()
                                    else:
                                        party_code = ""

                                    # S'afegeix el diputat al diccionari
                                    secure_add_member_to_dict_v2(my_dict, name[0], name[1], url_member, url_img_member, url_img_party, party_code, legislature, verbose)
                            else:
                                print "\tTag not found: soup.find('div', {'id' : 'datos_diputado'})"
                else:
                    print "No more results"
                    continue_scraping = False
            else:
                print "Tag not found: soup.find('div', {'class' : 'listado_1'})"
                continue_scraping = False
        else:
            print "URL not found:", scrap_url_page
            continue_scraping = False
            
        try:
            element_siguiente = browser.find_element_by_xpath("//*[contains(text(), 'Siguiente')]")
            element_siguiente.click()
        except NoSuchElementException as e:
            # print e
            print "Scraping finished..."
            continue_scraping = False

        web_page += 1

        if web_page > max_web_page:
            print "Reached web_page limit:", web_page
            continue_scraping = False
    
    print "Scraping finished."
    return my_dict

In [6]:
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import os
import time
import string

os.environ["PATH"] = 'C:\Users\pablo_000\Documents\P\Data Science\chromedriver' \
    + os.pathsep + 'C:\Users\pfernandezs\Documents\P\Cosas\Data Science\posgrau\chromedriver'

all_diputados_dict = {}
url_page = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas"

all_diputados_dict = scrap_webpage_v2(url_page, all_diputados_dict, 0, verbose = True)

print "Num. diputados:", len(all_diputados_dict)
print all_diputados_dict
print "Saving 'all_diputados_dict'..."
save_dict_json(all_diputados_dict, 'all_diputados_dict.json')

Starting scraping...
Parsing: ...eso.es/portal/page/portal/Congreso/Congreso/Diputados/DiputadosTodasLegislaturas
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=87&idLegislatura=10
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=355&idLegislatura=5
	Parsing: ...5_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=268&idLegislatura=10
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=397&idLegislatura=7
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=285&idLegislatura=2
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=360&idLegislatura=2
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=349&idLegislatura=6
	Parsing: ...155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=25&idLegislatura=1
	Parsing: ...55_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=244&idLegislatura=4
	Parsing: ...55_73_1333154_1333154.next_

#### Diccionari de partits polítics

In [18]:
# Diferents partits polítics trobats al diccionari 'all_diputados_dict'

loaded_all_diputados_dict = load_dict_json('all_diputados_dict.json')

all_parties_dict = {}

for key in loaded_all_diputados_dict:
    party_code = loaded_all_diputados_dict[key]['party_code'].upper()
    if party_code:
        if int(loaded_all_diputados_dict[key]['legislature']) > 6: # Ens interessen les legislatures VII-X
            if party_code in all_parties_dict:
                if loaded_all_diputados_dict[key]['url_img_party']:

                    image_found = False
                    for item in all_parties_dict[party_code]['url_images']:
                        if loaded_all_diputados_dict[key]['legislature'] == item[0] and \
                            loaded_all_diputados_dict[key]['url_img_party'] == item[1]:
                            image_found = True
                            break
                    if not image_found:
                        all_parties_dict[party_code]['url_images'].append([loaded_all_diputados_dict[key]['legislature'],loaded_all_diputados_dict[key]['url_img_party']])
                        all_parties_dict[party_code]['id_images'].append("")
            else:
                all_parties_dict[party_code] = {'name':party_code, 'url_images':[[loaded_all_diputados_dict[key]['legislature'],loaded_all_diputados_dict[key]['url_img_party']]],'id_images':[''], 'color':''}

print "Num. parties:", len(all_parties_dict)
print all_parties_dict
print "Saving 'all_parties_dict'..."
save_dict_json(all_parties_dict, 'all_parties_dict.json')


Num. parties: 39
{u'PSE-EE(PSOE)': {'color': '', 'id_images': ['', ''], 'name': u'PSE-EE(PSOE)', 'url_images': [[u'9', u'http://www.congreso.es/wc/htdocs/web/img/logos_grandes/PSE-EE(PSOE)_9.gif'], [u'8', u'http://www.congreso.es/wc/htdocs/web/img/logos_grandes/201_8.gif']]}, u'CC-NC-PNC': {'color': '', 'id_images': ['', ''], 'name': u'CC-NC-PNC', 'url_images': [[u'10', u'http://www.congreso.es/wc/visualizarLogo?codParl=307'], [u'10', u'http://www.congreso.es/wc/visualizarLogo?codParl=335']]}, u'COMPROM\xcdS-Q': {'color': '', 'id_images': [''], 'name': u'COMPROM\xcdS-Q', 'url_images': [[u'10', u'http://www.congreso.es/wc/visualizarLogo?codParl=275']]}, u'CIU': {'color': '', 'id_images': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'name': u'CIU', 'url_images': [[u'7', u''], [u'10', u'http://www.congreso.es/wc/visualizarLogo?codParl=163'], [u'10', u'http://www.congreso.es/wc/visualizarLogo?codParl=219'], [u'10', u'http://www.congreso.es/wc/visualizarLogo?cod

Aquest resultat permet fer una collection nova 'parties' a la BBDD.  S'haurà de fer a mà, cercant la informació que falti.

### Creació de la base de dades del Congrés

#### Obrir conexió a la BD 'congres'

In [11]:
import pymongo

# Create the connection to MongoDB
try:
    connection=pymongo.MongoClient()
    print "Connection to Mongo Daemon successful!!!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e
    
# Obtenim la BD del Congrés
db = connection['congres']
#
print "Collections:", db.collection_names()

Connection to Mongo Daemon successful!!!
Collections: [u'system.indexes', u'all_document', u'congres_groups', u'congres_members', u'document', u'legislatures']


Check the database status
http://127.0.0.1:28017/

#### Tancar la conexió a la BD

In [4]:
connection.close()

#### Guardem el diccionari dels diputats a la BD

In [11]:
#
congres_members_collection = db['congres_members']
congres_members_collection.drop()

#
loaded_diputados_dict = load_dict_json('diputados_dict.json')
print "num. of members:", len(loaded_diputados_dict)
for key in loaded_diputados_dict:
    congres_members_collection.insert(loaded_diputados_dict[key])


num. of members: 1399


### Guardem el NOU diccionari dels diputats a la BD i afegim una nova collection de partits polítics amb els seus nom, descripció, logo, ...

#### Creació de les collections: 'parties' i 'all_congres_members'

In [8]:
#
parties_collection = db['parties']
parties_collection.drop()

#
loaded_all_parties_dict = load_dict_json('all_parties_dict.json')
print "num. of members:", len(loaded_all_parties_dict)
for key in loaded_all_parties_dict:
    # print loaded_all_parties_dict[key]
    parties_collection.insert({'code':key, 'name':loaded_all_parties_dict[key]['name'], 'color': loaded_all_parties_dict[key]['color'], 'id_images': loaded_all_parties_dict[key]['id_images'], 'url_images': loaded_all_parties_dict[key]['url_images']})


#
all_congres_members_collection = db['all_congres_members']
all_congres_members_collection.drop()

#
loaded_all_diputados_dict = {}
loaded_all_diputados_dict = load_dict_json('all_diputados_dict.json')
print "num. of members:", len(loaded_all_diputados_dict)
for key in loaded_all_diputados_dict:
    # print loaded_all_diputados_dict[key]
    all_congres_members_collection.insert(loaded_all_diputados_dict[key])


num. of members: 20
num. of members: 75


#### Creació d'indexos per millorar el temps de cerca

In [9]:
#
congres_members_collection = db['congres_members']

s = "Llamazares Trigo"
n = "Gaspar"

result = congres_members_collection.find({"surname":s.upper(), "name":n.upper()})
if result.count()>0:
    print "Result:", result[0]
else:
    print "No results"

print congres_members_collection.find({"surname":s.upper(), "name":n.upper()}).explain()["cursor"]
print congres_members_collection.find({"surname":s.upper(), "name":n.upper()}).explain()["nscanned"]

No results
BasicCursor
75


Les cerques a la col.lecció 'congres_members' utilitza un 'BasicCursor' equivalent a un 'table scan'. Afegirem uns indexos als camps 'surname' i 'name' per millorar la cerca:

In [10]:
# http://api.mongodb.org/python/current/tutorial.html#indexing
# http://docs.mongodb.org/manual/core/index-creation/
# http://docs.mongodb.org/manual/reference/indexes/

from pymongo import ASCENDING, DESCENDING

congres_members_collection.create_index([("surname", ASCENDING), ("name", ASCENDING)])
all_congres_members_collection.create_index([("surname", ASCENDING), ("name", ASCENDING)])

u'surname_1_name_1'

In [11]:
for index in congres_members_collection.index_information():
    print index
print "-"*120
for index in all_congres_members_collection.index_information():
    print index

surname_1_name_1
_id_


In [12]:
s = "Llamazares Trigo"
n = "Gaspar"

print congres_members_collection.find({"surname":s.upper(), "name":n.upper()}).explain()["cursor"]
print congres_members_collection.find({"surname":s.upper(), "name":n.upper()}).explain()["nscanned"]

print '\nObservació: la recerca amb "re.compile(s, re.IGNORECASE)" no aprofita l\'index creat, recorre tots els elements de la taula:', congres_members_collection.count()
import re
# http://stackoverflow.com/questions/6266555/querying-mongodb-via-pymongo-in-case-insensitive-efficiently
print congres_members_collection.find({'surname': re.compile(s, re.IGNORECASE), "name":re.compile(n, re.IGNORECASE)}).explain()["cursor"]
print congres_members_collection.find({'surname': re.compile(s, re.IGNORECASE), "name":re.compile(n, re.IGNORECASE)}).explain()["nscanned"]

BtreeCursor surname_1_name_1
0

Observació: la recerca amb "re.compile(s, re.IGNORECASE)" no aprofita l'index creat, recorre tots els elements de la taula: 75
BtreeCursor surname_1_name_1
75


#### Descarregar els documents del diccionari 'all_parties_dict'

In [13]:
# Mètode per descarregar els documents del diccionari 'all_parties_dict'

import urllib
import os
import datetime
import gridfs

# http://stackoverflow.com/questions/20551602/saving-a-file-in-mongodbs-gridfs-with-pymongo-results-in-a-truncated-file-pyt
        
def download_parties_images_to_DB(parties, my_gridFS, path_folder, overwrite = False):
    
    for party in parties.find():
        print "Party:", party['code']
        for index_image, url_image in enumerate(party['url_images']):
            
            if party['id_images'][index_image]=="" or overwrite:
                if url_image[1]:
                    print "Loading: ...", url_image[1]
                    
                    extension = url_image[1][-3:]
                    if extension == "jpg":
                        content_type = "image/jpeg"
                    else:
                        extension = "gif"
                        content_type = "image/gif"
                        
                    file_name = party['code'] + "_" + str(index_image) + "." + extension
                    print "Saving...", file_name
                    download_file(url_image[1], file_name, path_folder)

                    '''
                    with open(path_folder+file_name, 'rb') as my_image:
                        id_image = my_gridFS.put(my_image, content_type="jpg", filename=file_name)

                    party['id_images'][index_image] = id_image
                    '''
                else:
                    print "No image."

        # http://docs.mongodb.org/manual/tutorial/modify-documents/
        # parties.update({'_id':party['_id']},{'$set':{'id_images':party['id_images']}}, upsert=False, multi=False)

#
parties_collection = db['parties']

fs = gridfs.GridFS(db, 'parties')

path_folder = "congres/parties/"

download_parties_images_to_DB(parties_collection, fs, path_folder, overwrite = True)

Party: EA-EE
No image.
Party: PP
Loading: ... http://www.congreso.es/wc/htdocs/web/img/logos_grandes/201_7.gif
Saving... PP_0.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=171
Saving... PP_1.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=173
Saving... PP_2.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=87
Saving... PP_3.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=12
Saving... PP_4.gif


Loading: ... http://www.congreso.es/wc/htdocs/web/img/logos_grandes/PP_9.gif
Saving... PP_5.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=150
Saving... PP_6.gif


Loading: ... http://www.congreso.es/wc/visualizarLogo?codParl=83
Saving... PP_7.gif


Party: PSA
No image.
Party: PSE-PSOE
No image.
Party: IU
No image.
Party: PSE-EE(PSOE)
Loading: ... http://www.congreso.es/wc/htdocs/web/img/logos_grandes/PSE-EE(PSOE)_9.gif
Saving... PSE-EE(PSOE)_0.gif


Party: IU-EUPV
No image.
Party: UPN-PP
No imag

IndexError: list index out of range

#### Descarregar els documents del diccionari 'all_diputados_dict'

In [15]:
# Mètode per descarregar els documents del diccionari 'all_diputados_dict'

import urllib
import os
import datetime
# http://api.mongodb.org/python/current/api/gridfs/index.html
# http://api.mongodb.org/python/current/examples/gridfs.html
# -->  http://dirolf.com/2010/03/29/new-gridfs-implementation-for-pymongo.html
# http://blog.pythonisito.com/2012/05/gridfs-mongodb-filesystem.html
import gridfs

        
def download_member_images_to_DB(congres_members, parties, my_gridFS, path_folder, overwrite = False):
    
    for member in congres_members.find():
        print "Member:", member
        if member['id_img_member']=="" or overwrite:
            
            result = parties.find({'code':member['party_code']})
            if result.count() > 0:
                member['id_img_party'] = result[0]['_id']
            else:
                print "WARN: party not found:", member['party_code']
            #
            print "Loading: ...", member['url_img_member']
                        
            file_name = member['url_img_member'].split('/')[-1]
            print "Saving...", file_name
            
            extension = member['url_img_member'][-3:]
            if extension == "jpg":
                content_type = "image/jpeg"
            else:
                extension = "gif"
                content_type = "image/gif"
            
            download_file(member['url_img_member'], file_name, path_folder)

            '''
            with open(path_folder+file_name, "rb") as my_doc:
                member['id_img_member'] = my_gridFS.put(my_doc, content_type=content_type, filename=file_name)
            
            # http://docs.mongodb.org/manual/tutorial/modify-documents/
            congres_members.update({'_id':member['_id']},{'$set':{'id_img_member':member['id_img_member'], 'id_img_party':member['id_img_party']}, upsert=False, multi=False)
            '''

#
all_congres_members_collection = db['all_congres_members']
#
parties_collection = db['parties']

fs = gridfs.GridFS(db, 'all_congres_members')

path_folder = "congres/members/"

download_member_images_to_DB(all_congres_members_collection, parties_collection, fs, path_folder, overwrite = True)

Member: {u'legislature': u'3', u'surname': u'ALDECOA AZARLOZA', u'id_img_member': u'', u'url_member': u'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=129&idLegislatura=3', u'url_img_member': u'http://www.congreso.es/wc/htdocs/web/img/diputados/129_3.jpg', u'r_name': u'Jos\xe9 Ignacio', u'id_img_party': u'', u'party_name': u'HB', u'url_img_party': u'', u'_id': ObjectId('554136517c4c67094800f790'), u'r_surname': u'Aldecoa Azarloza', u'name': u'JOSE IGNACIO'}
Loading: ... http://www.congreso.es/wc/htdocs/web/img/diputados/129_3.jpg
Saving... 129_3.jpg


Member: {u'legislature': u'8', u'surname': u'ALCAZAR ESCRIBANO', u'id_img_member': u'', u'url_member': u'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=360&idLegislatura=8', u'url_img_member': u'http://www.congreso.es/wc/htdocs/

In [16]:
#
all_congres_members_collection = db['all_congres_members']

#
print "num. of rows: ", all_congres_members_collection.count()
for doc in all_congres_members_collection.find():
    print doc

num. of rows:  75
{u'legislature': u'3', u'surname': u'ALDECOA AZARLOZA', u'id_img_member': u'', u'url_member': u'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=129&idLegislatura=3', u'url_img_member': u'http://www.congreso.es/wc/htdocs/web/img/diputados/129_3.jpg', u'r_name': u'Jos\xe9 Ignacio', u'id_img_party': u'', u'party_name': u'HB', u'url_img_party': u'', u'_id': ObjectId('554136517c4c67094800f790'), u'r_surname': u'Aldecoa Azarloza', u'name': u'JOSE IGNACIO'}
{u'legislature': u'8', u'surname': u'ALCAZAR ESCRIBANO', u'id_img_member': u'', u'url_member': u'http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/BusqForm?_piref73_1333155_73_1333154_1333154.next_page=/wc/fichaDiputado&idDiputado=360&idLegislatura=8', u'url_img_member': u'http://www.congreso.es/wc/htdocs/web/img/diputados/360_8.jpg', u'r_name': u'Mar\xeda Angustias', u'id_img_party': u'', u'party_

#### Cerca de diputats a la BD

In [9]:
# Metode per buscar un diputat a la BD
# TODO: canviar la query per afegir el parametre 'legislature'
def search_diputado_in_BD(collection, s, n="", type='exact', legislature=0, verbose=False):
    if n=="":
        result = collection.find({'surname': remove_accents(s.upper())})
    elif type=='exact':
        result = collection.find({'surname': remove_accents(s.upper()), "name": remove_accents(n.upper())})
    else:
        result = collection.find({'surname': {"$regex":remove_accents(s.upper())}, "name": {"$regex":remove_accents(n.upper())}})
            
    if result.count() == 0:
        if verbose:
            print "WARN en search_diputado_in_BD_like: No s'han trobat resultats per:", s, "; ", n
        return None
    elif result.count() == 1:
        return result[0]
    else: # result.count() > 1:
        if verbose:
            print "WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per: ", s, "; ", n
        if legislature > 0:
            f_result = []
            for result_obj in result:
                if verbose:
                    print "\t", result_obj
                found = False
                for group in result_obj['group']:
                    if legislature in result_obj['group'][group]:
                        found = True
                if found:
                    f_result.append(result_obj)

            if len(f_result) == 0:
                if verbose:
                    print "WARN en search_diputado_in_BD_like: No s'han trobat resultats per:", s, "; ", n, ", legislature:", legislature
                result.rewind()
                return result[0]
            elif len(f_result) == 1:
                return f_result[0]
            else:
                if verbose:
                    print "WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per: ", s, "; ", n
                    for f_result_obj in f_result:
                        print "\t", f_result_obj
                return f_result[0]
        else:
            if verbose:
                print "WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per: ", s, "; ", n
                for result_obj in result:
                    print "\t", result_obj
                # http://stackoverflow.com/questions/9872891/is-it-possible-to-iterate-a-mongo-cursor-twice
                result.rewind()
            return result[0]

In [10]:
def get_group(group_dict, legislature):
    for key in group_dict:
        if legislature in group_dict[key]:
            return key
    return group_dict.keys()[0]

''' '''
#
congres_members_collection = db['congres_members']

# El paràmetre de recerca en unicode
s = u"Llamazares Trigo"
n = u"Gaspar"
dip = search_diputado_in_BD(congres_members_collection,s,n)
if dip:
    print dip
    if len(dip['group']) == 1:
        print "Group:", dip['group'].keys()[0]
    else:
        print "Groups:", dip['group']
        group = get_group(dip['group'], 10)
        if group:
            print s, ",", n, "-->", group
else:
    print "ERROR en search_diputado_in_BD. No s'ha trobat:", s, n

s = u"Fernández Díaz"
dip = search_diputado_in_BD(congres_members_collection, s, legislature=10, verbose=True)
if dip:
    print dip
    if len(dip['group']) == 1:
        print "Group:", dip['group'].keys()[0]
    else:
        print "Groups:", dip['group']
        group = get_group(dip['group'], 10)
        if group:
            print s, ",", n, "-->", group
else:
    print "ERROR en search_diputado_in_BD. No s'ha trobat:", s
    
s = u"Ruiz-Gallardón Jiménez"
dip = search_diputado_in_BD(congres_members_collection, s, legislature=10, verbose=True)
if dip:
    print dip
    if len(dip['group']) == 1:
        print "Group:", dip['group'].keys()[0]
    else:
        print "Groups:", dip['group']
        group = get_group(dip['group'], 10)
        if group:
            print s, ",", n, "-->", group
else:
    print "ERROR en search_diputado_in_BD. No s'ha trobat:", s

{u'group': {u'GIU': [7], u'GIP': [10], u'GER-IU-ICV': [9], u'GIU-ICV': [8]}, u'name': u'GASPAR', u'r_name': u'Gaspar', u'surname': u'LLAMAZARES TRIGO', u'_id': ObjectId('552594aa7c4c6704fc26710f'), u'r_surname': u'Llamazares Trigo'}
Groups: {u'GIU': [7], u'GIP': [10], u'GER-IU-ICV': [9], u'GIU-ICV': [8]}
Llamazares Trigo , Gaspar --> GIP
WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per:  Fernández Díaz ;  
	{u'group': {u'GP': [5]}, u'name': u'ELVIRA', u'r_name': u'Elvira', u'surname': u'FERNANDEZ DIAZ', u'_id': ObjectId('552594ab7c4c6704fc267576'), u'r_surname': u'Fern\xe1ndez D\xedaz'}
	{u'group': {u'GP': [5, 6, 7, 8, 9, 10]}, u'name': u'JORGE', u'r_name': u'Jorge', u'surname': u'FERNANDEZ DIAZ', u'_id': ObjectId('552594ab7c4c6704fc267577'), u'r_surname': u'Fern\xe1ndez D\xedaz'}
{u'group': {u'GP': [5, 6, 7, 8, 9, 10]}, u'name': u'JORGE', u'r_name': u'Jorge', u'surname': u'FERNANDEZ DIAZ', u'_id': ObjectId('552594ab7c4c6704fc267577'), u'r_surname': u'Fern\xe1ndez 

In [11]:
conjunctions = ['I', 'Y', 'DEL', 'DE']

def get_surname_name_from_string(speaker):
    speaker_list = speaker.upper().split()

    if speaker_list[-2] in conjunctions:
        if len(speaker_list) > 4:
            surname = speaker_list[-3] + " " + speaker_list[-2] + " " + speaker_list[-1]
            name = speaker_list[0]
        else:
            surname = speaker_list[-3] + " " + speaker_list[-2] + " " + speaker_list[-1]
            name = speaker_list[-4]
    else:
        if speaker_list[-3] == 'DE':
            if len(speaker_list) > 4:
                surname = speaker_list[-2] + " " + speaker_list[-1]
                name = speaker_list[0]
            else:
                surname = speaker_list[-2] + " " + speaker_list[-1]
                name = speaker_list[-4] + " DE"
        else:
            if len(speaker_list) > 3:
                surname = speaker_list[-2] + " " + speaker_list[-1]
                name = speaker_list[0]
            else:
                surname = speaker_list[-2] + " " + speaker_list[-1]
                name = speaker_list[-3]
    return surname, name

def search_diputado_from_string(congres_members_col, speaker, legislature, verbose = False):
    surname, name = get_surname_name_from_string(speaker)
    return search_diputado_in_BD(congres_members_col, surname, name, type='like', legislature=legislature, verbose=verbose)


In [12]:
# Test del mètode search_diputado_from_string
#
congres_members_collection = db['congres_members']

for congres_member in congres_members_collection.find():
    ns = congres_member['r_name'] + " " + congres_member['r_surname']
    dip = search_diputado_from_string(congres_members_collection, ns, legislature=10, verbose=True)
    if not dip:
        print "ERROR: no results."
        print "ns:", ns, "-->", congres_member['r_name'] + " || " + congres_member['r_surname']
        print surname, ";", name
        print "--------------------------------------------"

WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per:  LÓPEZ I CHAMOSA ;  ISABEL
	{u'group': {u'GS': [6, 7, 8, 9]}, u'name': u'ISABEL', u'r_name': u'Isabel', u'surname': u'LOPEZ I CHAMOSA', u'_id': ObjectId('552594aa7c4c6704fc267105'), u'r_surname': u'L\xf3pez i Chamosa'}
	{u'group': {u'GS': [10]}, u'name': u'MARIA ISABEL', u'r_name': u'Mar\xeda Isabel', u'surname': u'LOPEZ I CHAMOSA', u'_id': ObjectId('552594aa7c4c6704fc26729d'), u'r_surname': u'L\xf3pez i Chamosa'}
WARN en search_diputado_in_BD_like: S'ha trobat més d'un resultat per:  GUILLAUMES I RÁFOLS ;  FELIU
	{u'group': {u'GC-CiU': [5]}, u'name': u'FELIU', u'r_name': u'Feliu', u'surname': u'GUILLAUMES I RAFOLS', u'_id': ObjectId('552594aa7c4c6704fc267155'), u'r_surname': u'Guillaumes i R\xe1fols'}
	{u'group': {u'GC-CiU': [10]}, u'name': u'FELIU-JOAN', u'r_name': u'Feliu-Joan', u'surname': u'GUILLAUMES I RAFOLS', u'_id': ObjectId('552594aa7c4c6704fc267422'), u'r_surname': u'Guillaumes i R\xe0fols'}
WARN en searc

### Llistat de grups de parlamentaris del Congrés

In [8]:
# Diferents grups trobats al diccionari de diputats

loaded_diputados_dict = load_dict_json('diputados_dict.json')

all_groups = []

for key in loaded_diputados_dict:
    for subkey in loaded_diputados_dict[key]['group']:
        try:
            all_groups.index(subkey)
        except ValueError:
            all_groups.append(subkey)

print len(all_groups), "groups"
print all_groups

16 groups
[u'GP', u'GS', u'GIU-IC', u'GMX', u'GV (EAJ-PNV)', u'GV-PNV', u'GCC', u'GUPyD', u'GMx', u'GCC-NC', u'GC-CiU', u'GIP', u'GIU', u'GIU-ICV', u'GER-IU-ICV', u'GER-ERC']


In [4]:
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/GruPar?_piref73_2914053_73_1339199_1339199.next_page=/wc/cambioLegislatura
# http://www.w3schools.com/html/html_colornames.asp
# Trend -> Tendència política. 'I' = 'Izquierda', 'D' = 'Derecha', 'N' = 'Ninguna' (para el grupo Mixto)
groups_dict = {
    '0':{'code':'GP', 'name':'Popular (PP)', 'description':'Grupo Parlamentario Popular en el Congreso','color':'steelblue','trend':'D'},
    '1':{'code':'GS', 'name':'Socialista (PSOE)', 'description':'Grupo Parlamentario Socialista','color':'red','trend':'I'},
    '2':{'code':'GC-CiU', 'name':'Convergència i Unió', 'description':'Grupo Parlamentario Catalán (Convergència i Unió)','color':'orange','trend':'D'},
    '3':{'code':'GIU', 'name':'Izquierda Unida', 'description':'Grupo Parlamentario Federal de Izquierda Unida','color':'green','trend':'I'},
    '4':{'code':'GIP', 'name':'Izquierda Unida', 'description':'Grupo Parlamentario de IU, ICV-EUiA, CHA: La Izquierda Plural','color':'green','trend':'I'},
    '5':{'code':'GIU-IC', 'name':'Izquierda Unida', 'description':'Grupo Parlamentario Federal Izquierda Unida-Iniciativa per Catalunya','color':'green','trend':'I'},
    '6':{'code':'GIU-ICV', 'name':'Izquierda Unida', 'description':'Grupo Parlamentario de Izquierda Unida-Iniciativa per Catalunya Verds','color':'green','trend':'I'},
    '7':{'code':'GER-IU-ICV', 'name':'Izquierda Unida', 'description':'Grupo Parlamentario de Esquerra Republicana-Izquierda Unida-Iniciativa per Catalunya Verds','color':'green','trend':'I'},
    '8':{'code':'GER-ERC', 'name':'Esquerra Republicana', 'description':'Grupo Parlamentario de Esquerra Republicana (ERC)','color':'gold','trend':'I'},
    '9':{'code':'GUPyD', 'name':'UPyD', 'description':'Grupo Parlamentario de Unión Progreso y Democracia','color':'pink','trend':'D'},
    '10':{'code':'GV (EAJ-PNV)', 'name':'PNV', 'description':'Grupo Parlamentario Vasco (EAJ-PNV)','color':'brown','trend':'D'},
    '11':{'code':'GV-PNV', 'name':'PNV', 'description':'Grupo Parlamentario Vasco (PNV)','color':'brown','trend':'D'},
    '12':{'code':'GCC', 'name':'Coalición Canaria', 'description':'Grupo Parlamentario de Coalición Canaria','color':'yellow','trend':'D'},
    '13':{'code':'GCC-NC', 'name':'Coalición Canaria', 'description':'Grupo Parlamentario de Coalición Canaria-Nueva Canarias','color':'yellow','trend':'D'},
    '14':{'code':'GMX', 'name':'Mixto','description':'Grupo Parlamentario Mixto','color':'grey','trend':'N'},
    '15':{'code':'GMx', 'name':'Mixto','description':'Grupo Parlamentario Mixto','color':'grey','trend':'N'}
}

print "Num. groups:", len(groups_dict)
print groups_dict
print "Saving 'groups_dict'..."
save_dict_json(groups_dict, 'groups_dict.json')

Num. groups: 16
{'11': {'color': 'brown', 'trend': 'D', 'code': 'GV-PNV', 'name': 'PNV', 'description': 'Grupo Parlamentario Vasco (PNV)'}, '10': {'color': 'brown', 'trend': 'D', 'code': 'GV (EAJ-PNV)', 'name': 'PNV', 'description': 'Grupo Parlamentario Vasco (EAJ-PNV)'}, '13': {'color': 'yellow', 'trend': 'D', 'code': 'GCC-NC', 'name': 'Coalici\xc3\xb3n Canaria', 'description': 'Grupo Parlamentario de Coalici\xc3\xb3n Canaria-Nueva Canarias'}, '12': {'color': 'yellow', 'trend': 'D', 'code': 'GCC', 'name': 'Coalici\xc3\xb3n Canaria', 'description': 'Grupo Parlamentario de Coalici\xc3\xb3n Canaria'}, '15': {'color': 'grey', 'trend': 'N', 'code': 'GMx', 'name': 'Mixto', 'description': 'Grupo Parlamentario Mixto'}, '14': {'color': 'grey', 'trend': 'N', 'code': 'GMX', 'name': 'Mixto', 'description': 'Grupo Parlamentario Mixto'}, '1': {'color': 'red', 'trend': 'I', 'code': 'GS', 'name': 'Socialista (PSOE)', 'description': 'Grupo Parlamentario Socialista'}, '0': {'color': 'steelblue', 'trend

In [7]:
#
congres_groups_collection = db['congres_groups']
congres_groups_collection.drop()

#
loaded_groups_dict = load_dict_json('groups_dict.json')
print "num. of groups:", len(loaded_groups_dict)
for key in loaded_groups_dict:
    congres_groups_collection.insert(loaded_groups_dict[key])


NameError: name 'db' is not defined

In [9]:
#
congres_groups_collection = db['congres_groups']

#
print "num. of rows: ", congres_groups_collection.count()
for doc in congres_groups_collection.find():
    print doc


num. of rows:  16
{u'color': u'brown', u'_id': ObjectId('552062f97c4c671994a624dd'), u'description': u'Grupo Parlamentario Vasco (PNV)', u'code': u'GV-PNV'}
{u'color': u'brown', u'_id': ObjectId('552062f97c4c671994a624de'), u'description': u'Grupo Parlamentario Vasco (EAJ-PNV)', u'code': u'GV (EAJ-PNV)'}
{u'color': u'yellow', u'_id': ObjectId('552062f97c4c671994a624df'), u'description': u'Grupo Parlamentario de Coalici\xf3n Canaria-Nueva Canarias', u'code': u'GCC-NC'}
{u'color': u'yellow', u'_id': ObjectId('552062f97c4c671994a624e0'), u'description': u'Grupo Parlamentario de Coalici\xf3n Canaria', u'code': u'GCC'}
{u'color': u'grey', u'_id': ObjectId('552062f97c4c671994a624e1'), u'description': u'Grupo Parlamentario Mixto', u'code': u'GMx'}
{u'color': u'grey', u'_id': ObjectId('552062f97c4c671994a624e2'), u'description': u'Grupo Parlamentario Mixto', u'code': u'GMX'}
{u'color': u'red', u'_id': ObjectId('552062f97c4c671994a624e3'), u'description': u'Grupo Parlamentario Socialista', u'co

#### Cerca de grups parlamentaris

In [13]:
def search_group_in_text_DB(text, congres_groups_col):
    
    for congres_group in congres_groups_col.find():
        description = congres_group['description'].upper()
        if description in text.upper():
            return congres_group['code']
    return None

# Test

doc_text= u'''
    ? DEL DIPUTADO DON ALFRED BOSCH I PASCUAL, DEL GRUPO PARLAMENTARIO MIXTO,
    QUE FORMULA AL SEÑOR PRESIDENTE DEL GOBIERNO: ¿CONSIDERA EL GOBIERNO
    ADECUADO PARA ERRADICAR LA POBREZA INFANTIL EN CATALUÑA LOS 2,25 EUROS
    POR NIÑO QUE DESTINÓ PARA COMBATIRLA MEDIANTE EL FONDO CORRESPONDIENTE?
    (Número de expediente 180/001217).
'''

congres_groups_collection = db['congres_groups']

print search_group_in_text_DB(doc_text, congres_groups_collection)


GMx


### Llistat de les legislatures V-X amb els intervals de duració

In [4]:
import datetime

# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia/VLeg
# V LEGISLATURA 1993-1996 (Del 29 de junio de 1993 al 26 de marzo de 1996)
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia/VILeg
# VI LEGISLATURA 1996-2000 (Del 27 de marzo de 1996 al 4 de abril de 2000)
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia/VIILeg
# VII LEGISLATURA 2000-2004 (Del 5 de abril de 2000 al 1 de abril de 2004)
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia/VIIILeg
# VIII LEGISLATURA  2004-2008 (Del 2 de abril de 2004 al 31 de marzo de 2008)
# http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados/Historia/LegIX
# IX LEGISLATURA  2008-2011 (Del 1 de abril de 2008 al 13 de diciembre de 2011)
# X LEGISLATURA  2011-Actualidad (Del 14 de diciembre de 2011 hasta la actualidad)

legislatures_dict = {
    '0':{'_id':5, 'start' : datetime.datetime.strptime("29/06/1993", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("26/03/1996", "%d/%m/%Y"),
         'name':'V LEGISLATURA', 'description':'V LEGISLATURA 1993-1996 (Del 29 de junio de 1993 al 26 de marzo de 1996)'},
    '1':{'_id':6, 'start' : datetime.datetime.strptime("27/03/1996", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("04/04/2000", "%d/%m/%Y"),
         'name':'VI LEGISLATURA', 'description':'VI LEGISLATURA 1996-2000 (Del 27 de marzo de 1996 al 4 de abril de 2000)'},
    '2':{'_id':7, 'start' : datetime.datetime.strptime("05/04/2000", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("01/04/2004", "%d/%m/%Y"),
         'name':'VII LEGISLATURA', 'description':'VII LEGISLATURA 2000-2004 (Del 5 de abril de 2000 al 1 de abril de 2004)'},
    '3':{'_id':8, 'start' : datetime.datetime.strptime("02/04/2004", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("31/03/2008", "%d/%m/%Y"),
         'name':'VIII LEGISLATURA', 'description':'VIII LEGISLATURA  2004-2008 (Del 2 de abril de 2004 al 31 de marzo de 2008)'},
    '4':{'_id':9, 'start' : datetime.datetime.strptime("01/04/2008", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("13/12/2011", "%d/%m/%Y"),
         'name':'IX LEGISLATURA', 'description':'IX LEGISLATURA  2008-2011 (Del 1 de abril de 2008 al 13 de diciembre de 2011)'},
    '5':{'_id':10, 'start' : datetime.datetime.strptime("14/12/2011", "%d/%m/%Y"), 'end' : datetime.datetime.strptime("14/12/2015", "%d/%m/%Y"),
         'name':'X LEGISLATURA', 'description':'X LEGISLATURA  2011-Actualidad (Del 14 de diciembre de 2011 hasta la actualidad)'}
}

print "Num. legislatures:", len(legislatures_dict)
print legislatures_dict
print "Saving 'legislatures_dict'..."
save_dict_json(legislatures_dict, 'legislatures_dict.json')

Num. legislatures: 6
{'1': {'start': datetime.datetime(1996, 3, 27, 0, 0), '_id': 6, 'end': datetime.datetime(2000, 4, 4, 0, 0), 'name': 'VI LEGISLATURA', 'description': 'VI LEGISLATURA 1996-2000 (Del 27 de marzo de 1996 al 4 de abril de 2000)'}, '0': {'start': datetime.datetime(1993, 6, 29, 0, 0), '_id': 5, 'end': datetime.datetime(1996, 3, 26, 0, 0), 'name': 'V LEGISLATURA', 'description': 'V LEGISLATURA 1993-1996 (Del 29 de junio de 1993 al 26 de marzo de 1996)'}, '3': {'start': datetime.datetime(2004, 4, 2, 0, 0), '_id': 8, 'end': datetime.datetime(2008, 3, 31, 0, 0), 'name': 'VIII LEGISLATURA', 'description': 'VIII LEGISLATURA  2004-2008 (Del 2 de abril de 2004 al 31 de marzo de 2008)'}, '2': {'start': datetime.datetime(2000, 4, 5, 0, 0), '_id': 7, 'end': datetime.datetime(2004, 4, 1, 0, 0), 'name': 'VII LEGISLATURA', 'description': 'VII LEGISLATURA 2000-2004 (Del 5 de abril de 2000 al 1 de abril de 2004)'}, '5': {'start': datetime.datetime(2011, 12, 14, 0, 0), '_id': 10, 'end': d

In [20]:
def get_legislature(legislatures_dict, doc_date):
    d_doc_date = datetime.datetime.strptime(doc_date, "%d/%m/%Y")
    for key in legislatures_dict:
        if legislatures_dict[key]['start'].date() <= d_doc_date.date() <= legislatures_dict[key]['end'].date():
            return legislatures_dict[key]['_id']
    print "ERROR, get_legislature: date out of range:", doc_date
    return -1

# Test get_legislature
loaded_legislatures_dict = load_dict_json('legislatures_dict.json')

my_date = "03/06/2007"
print "my_date:", my_date, ", legislature:", get_legislature(loaded_legislatures_dict, my_date)
my_date = "03/06/2012"
print "my_date:", my_date, ", legislature:", get_legislature(loaded_legislatures_dict, my_date)
my_date = "03/06/1996"
print "my_date:", my_date, ", legislature:", get_legislature(loaded_legislatures_dict, my_date)


my_date: 03/06/2007 , legislature: 8
my_date: 03/06/2012 , legislature: 10
my_date: 03/06/1996 , legislature: 6


In [21]:
#
legislatures_collection = db['legislatures']
legislatures_collection.drop()

#
loaded_legislatures_dict = load_dict_json('legislatures_dict.json')
print "num. of legislatures:", len(loaded_legislatures_dict)
for key in loaded_legislatures_dict:
    legislatures_collection.insert(loaded_legislatures_dict[key])


num. of legislatures: 6


In [22]:
#
legislatures_collection = db['legislatures']

#
print "num. of rows: ", legislatures_collection.count()
for doc in legislatures_collection.find():
    print doc

num. of rows:  6
{u'start': datetime.datetime(1996, 3, 27, 0, 0), u'_id': 6, u'end': datetime.datetime(2000, 4, 4, 0, 0), u'description': u'VI LEGISLATURA 1996-2000 (Del 27 de marzo de 1996 al 4 de abril de 2000)'}
{u'start': datetime.datetime(1993, 6, 29, 0, 0), u'_id': 5, u'end': datetime.datetime(1996, 3, 26, 0, 0), u'description': u'V LEGISLATURA 1993-1996 (Del 29 de junio de 1993 al 26 de marzo de 1996)'}
{u'start': datetime.datetime(2004, 4, 2, 0, 0), u'_id': 8, u'end': datetime.datetime(2008, 3, 31, 0, 0), u'description': u'VIII LEGISLATURA  2004-2008 (Del 2 de abril de 2004 al 31 de marzo de 2008)'}
{u'start': datetime.datetime(2000, 4, 5, 0, 0), u'_id': 7, u'end': datetime.datetime(2004, 4, 1, 0, 0), u'description': u'VII LEGISLATURA 2000-2004 (Del 5 de abril de 2000 al 1 de abril de 2004)'}
{u'start': datetime.datetime(2011, 12, 14, 0, 0), u'_id': 10, u'end': datetime.datetime(2015, 12, 14, 0, 0), u'description': u'X LEGISLATURA  2011-Actualidad (Del 14 de diciembre de 2011 h

In [23]:
my_date = "03/06/2007"
d_doc_date = datetime.datetime.strptime(my_date, "%d/%m/%Y")
result = legislatures_collection.find({'start':{'$lte':d_doc_date}, 'end':{'$gte':d_doc_date}})
print result.explain()

{u'nYields': 0, u'nscannedAllPlans': 6, u'filterSet': False, u'allPlans': [{u'nChunkSkips': 0, u'n': 1, u'cursor': u'BasicCursor', u'scanAndOrder': False, u'nscannedObjects': 6, u'isMultiKey': False, u'indexOnly': False, u'nscanned': 6}], u'millis': 0, u'nChunkSkips': 0, u'server': u'DCEQUI1527:27017', u'n': 1, u'cursor': u'BasicCursor', u'scanAndOrder': False, u'nscannedObjectsAllPlans': 6, u'isMultiKey': False, u'stats': {u'works': 8, u'docsTested': 6, u'isEOF': 1, u'needFetch': 0, u'needTime': 6, u'yields': 0, u'invalidates': 0, u'unyields': 0, u'type': u'COLLSCAN', u'children': [], u'advanced': 1}, u'indexOnly': False, u'nscanned': 6, u'nscannedObjects': 6}


#### Cerca de legislatures per data

In [25]:
def get_legislature_DB(legislatures_col, d_doc_date):
    result = legislatures_col.find({'start':{'$lte':d_doc_date}, 'end':{'$gte':d_doc_date}})
    
    if result.count() == 1:
        return result[0]['_id']
    else:
        print "ERROR, get_legislature: date out of range:", doc_date
        return -1

# Test get_legislature_DB
legislatures_collection = db['legislatures']
    
my_date = datetime.datetime.strptime("03/06/2007", "%d/%m/%Y")
print "my_date:", my_date, ", legislature:", get_legislature_DB(legislatures_collection, my_date)
my_date = datetime.datetime.strptime("03/06/2012", "%d/%m/%Y")
print "my_date:", my_date, ", legislature:", get_legislature_DB(legislatures_collection, my_date)
my_date = datetime.datetime.strptime("03/06/1996", "%d/%m/%Y")
print "my_date:", my_date, ", legislature:", get_legislature_DB(legislatures_collection, my_date)
my_date = datetime.datetime.strptime("18/2/2015", "%d/%m/%Y")
print "my_date:", my_date, ", legislature:", get_legislature_DB(legislatures_collection, my_date)

my_date: 2007-06-03 00:00:00 , legislature: 8
my_date: 2012-06-03 00:00:00 , legislature: 10
my_date: 1996-06-03 00:00:00 , legislature: 6
my_date: 2015-02-18 00:00:00 , legislature: 10


### Scraping per descarregar els documents de les sessions del Congrés [ Edición oficial / Búsqueda de publicaciones ]

### Estructura del diccionari de documents

In [None]:
'''
all_document_dictionary = {
    'date': <data del document (Diario de sesiones del congreso de los diputados)>,
    'url': <url del document>,
    'description': <descripció del document>
    'id': <id del document a document_dictionary>
}

document_dictionary = {
    'date': <data del document (Diario de sesiones del congreso de los diputados)>,
    'url': <url del document>,
    'content_doc': <contingut del document en format html (sense tractar)>
    'updated_content_doc': <data d'actualització del camp 'content_doc'>
    <session_dictionary, lista amb les preguntes i intervencions de la sessió de control>
    'updated_session_dictionary': <data d'actualització del camp 'session_dictionary'>
}

session_dictionary = [{
    'question': <texte de la pregunta realitzada>,
    'speaker': <nom i cognoms de qui realitza la pregunta>,
    'group': <grup parlamentari que fa la pregunta o al que pertany qui realitza la pregunta>,
    'expedient_number': <número d'expedient de la pregunta>, 
    'intervention_dictonary' : [], llista amb les intervencions de la sessió de control i les persones que les van realitzar>
    }, ..., ]

intervention_dictonary = [{
    'text': <text de la intervenció>,
    'who': {  
        'surname': <cognoms de la persona que realitza la intervenció>,
        'name': <nom de la persona que realitza la intervenció>,
        'group': <grup al que pertany la persona: 'GP' (Grupo Popular), 'GS' (Grupo Socialista), ...>
    }, ..., ]
}
'''

In [58]:
# Congreso de los Diputados / Edición oficial / Búsqueda de publicaciones

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from bs4 import BeautifulSoup
import re

import os
import time
import datetime


os.environ["PATH"] = 'C:\Users\pablo_000\Documents\P\Data Science\chromedriver' \
    + os.pathsep + 'C:\Users\pfernandezs\Documents\P\Cosas\Data Science\posgrau\chromedriver'
    
browser = webdriver.Chrome()

# Congreso de los Diputados / Edición oficial / Búsqueda de publicaciones
url_page = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Publicaciones"
web_congreso = 'http://www.congreso.es'

idLegislatura = ["PUW0", "PUW1", "PUW2", "PUW3", "PUW4", "PUW5", "PUW6", "PUW7", "PUW8", "PUW9", "PU10"]

# Descarregarem els documents desde la V legislatura fins la X legislatura (actual), és a dir, els documents desde 1993 fins ara.
legislatura = 5

# Data 'desde' al formulari
d_day = "01"
d_month = "01"
d_year = "1993"

# Data 'hasta' al formulari
h_d = datetime.date.today()
h_day = h_d.strftime('%d')
h_month = h_d.strftime('%m')
h_year = h_d.strftime('%Y')

delay = 5
element_idLegislatura_id = "idLegislatura"
element_PIE_id = "PIE"
    
num_docs = 0
all_document_dictionary = {}

while legislatura < 11:

    continue_scraping = True

    browser.get(url_page)
    
    # Wait for the page to load
    try:
        WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_idLegislatura_id)))
        # print "Page is ready!"
    except TimeoutException:
        print "Loading took too much time!"
    except NoSuchElementException:
        print "Element id didn't find: " + element_idLegislatura_id
    
    print "Starting scraping legislature:", legislatura
        
    html_page=browser.page_source

    # Omplim el formulari
    # http://selenium-python.readthedocs.org/navigating.html#filling-in-forms

    # <select name="BASE" id="idLegislatura" onchange="cambiarLegislatura()">
    #   <option selected value="PU10">X Legislatura ( 2011-Actualidad ) </option>
    #   <option value="PUW9">IX Legislatura ( 2008-2011 ) </option>
    #   <option value="PUW8">VIII Legislatura ( 2004-2008 ) </option>
    #   <option value="PUW7">VII Legislatura ( 2000-2004 ) </option>
    #   <option value="PUW6">VI Legislatura ( 1996-2000 ) </option>
    #   <option value="PUW5">V Legislatura ( 1993-1996 ) </option>
    #   <option value="PUW4">IV Legislatura ( 1989-1993 ) </option>
    #   <option value="PUW3">III Legislatura ( 1986-1989 ) </option>
    #   <option value="PUW2">II Legislatura ( 1982-1986 ) </option>
    #   <option value="PUW1">I Legislatura ( 1979-1982 ) </option>
    #   <option value="PUW0">Legislatura Constituyente ( 1977-1979 ) </option>
    element_idLegislatura = browser.find_element_by_id("idLegislatura")
    select = Select(element_idLegislatura)
    select.select_by_value(idLegislatura[legislatura])

    # Wait for the page to load
    try:
        WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_PIE_id)))
        # print "Page is ready!"
    except TimeoutException:
        print "Loading took too much time!"
    except NoSuchElementException:
        print "Element id didn't find: " + element_PIE_id

    # Tipo Búsqueda
    # <select name="TIPOB" id="TIPOB">
    #    <option value="1">Simple</option>
    #    <option value="2">Detallada por p&aacute;ginas</option>
    element_tipob = browser.find_element_by_id("TIPOB")
    select = Select(element_tipob)
    select.select_by_value("1")

    # Publicación
    # <select name="PUBL" id="PUBL">
    #    <option value="B">Boletines Oficiales</option>
    #    <option value="D">Diarios de sesiones</option>
    element_publ = browser.find_element_by_id("PUBL")
    element_publ.send_keys("D")

    # Sección
    # <select name="SECC" id="SECC">
    #    <option value="Congreso">Congreso de los Diputados</option>
    #    <option value="Cortes">Cortes Generales</option>
    #    <option value="Senado">Senado</option>
    element_secc = browser.find_element_by_id("SECC")
    element_secc.send_keys("Congreso")

    # Serie/Aptdo./Órgano
    # <input type="text" name="ORSE"  id="idOrse" value=''  />
    element_orse = browser.find_element_by_id("idOrse")
    element_orse.send_keys("Pleno")

    # Descripción
    # <input type="text" name="SUDE" id="SUDE" value='' />

    # <p>Fecha desde:</p>
    # <input type="text" name="ddia" id="ddia" maxlength="2" value="" class="caja_mini" />
    element_ddia = browser.find_element_by_id("ddia")
    element_ddia.send_keys(d_day)
    # <input type="text" name="dmes" id="dmes" maxlength="2" value="" class="caja_mini" />
    element_dmes = browser.find_element_by_id("dmes")
    element_dmes.send_keys(d_month)
    # <input type="text" name="dano" id="dano" maxlength="4" value="" class="caja_mini_ano" />
    element_dano = browser.find_element_by_id("dano")
    element_dano.send_keys(d_year)

    # <p class="fecha">Fecha hasta:</p>
    # <input type="text" name="hdia" id="hdia" maxlength="2" value="" class="caja_mini" />
    element_hdia = browser.find_element_by_id("hdia")
    element_hdia.send_keys(h_day)
    # <input type="text" name="hmes" id="hmes" maxlength="2" value="" class="caja_mini" />
    element_hmes = browser.find_element_by_id("hmes")
    element_hmes.send_keys(h_month)
    # <input type="text" name="hano" id="hano" maxlength="4" value="" class="caja_mini_ano" />
    element_hano = browser.find_element_by_id("hano")
    element_hano.send_keys(h_year)

    # <INPUT type="IMAGE" Alt='Buscar' value='Buscar' src="/wc/htdocs/web/img/btn_buscar.gif" />
    element_buscar = browser.find_element_by_xpath("//input[@src='/wc/htdocs/web/img/btn_buscar.gif']")
    element_buscar.click()
    
    legislature_page = 1

    while continue_scraping:
        
        # Wait for the page to load
        try:
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_PIE_id)))
            # print "Page is ready!"
        except TimeoutException:
            print "Loading took too much time!"
        except NoSuchElementException:
            print "Element id didn't find: " + element_PIE_id
        
        html_page=browser.page_source

        print "Scraping page:", legislature_page
        
        soup = BeautifulSoup(html_page)

        # <div class="RESULTADOS_BUSQUEDA">
        div_resultados = soup.find('div', {'id' : 'RESULTADOS_BUSQUEDA'})
        if div_resultados:
            for div_resultados_encontrados in div_resultados.findAll('div', {'class' : 'resultados_encontrados'}):
                link_tag = div_resultados_encontrados.find('a', attrs={'href': re.compile("^/portal")})
                if link_tag:
                    url_document = web_congreso + link_tag.get('href')
                    description = remove_spaces_and_newline(remove_text_between(link_tag.getText(),'(',')').strip())
                    doc_date = datetime.datetime.strptime(description.split(', de ')[1][:10], "%d/%m/%Y")

                    all_document_dictionary[num_docs] = {'description': description, 'url':url_document, 'date':doc_date, 'id':''}
                    num_docs += 1
                else:
                    print "Tag not found: div_resultados_encontrados.find('a', attrs={'href': re.compile('^/portal')})"
        else:
            print "Tag not found: soup.find('div', {'id' : 'RESULTADOS_BUSQUEDA'})"

        try:
            element_siguiente = browser.find_element_by_xpath("//*[contains(text(), 'Siguiente')]")
            element_siguiente.click()
        except NoSuchElementException as e:
            # print e
            print "Scraping legislature", legislatura, "finished..."
            continue_scraping = False
            
        legislature_page += 1
            
    legislatura += 1
    
print "Num. documents:", len(all_document_dictionary)
print "Saving dictionary..."
save_dict_json(all_document_dictionary, 'all_document_dictionary.json')

Starting scraping legislature: 5
Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping legislature 5 finished...
Starting scraping legislature: 6
Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping legislature 6 finished...
Starting scraping legislature: 7
Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping legislature 7 finished...
Starting scraping legislature: 8
Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping legislature 8 f

#### Saving 'all_document_dictionary' to MongoDB, in 'all_document' collection

In [None]:
#
collection = db['all_document']
collection.drop()

#
loaded_all_document_dictionary = load_dict_json('all_document_dictionary.json')
print "num. of documents:", len(loaded_all_document_dictionary)
for key in loaded_all_document_dictionary:
    collection.insert(loaded_all_document_dictionary[key])

#
print "num. of rows: ", collection.count()
for doc in collection.find():
    print doc

In [15]:
#
collection = db['all_document']

#
print "num. of rows: ", collection.count()
for doc in collection.find():
    print doc

num. of rows:  1566
{u'url': u'http://www.congreso.es/portal/page/portal/Congreso/PopUpCGI?CMD=VERDOC&CONF=BRSPUB.cnf&BASE=PUW6&PIECE=PUW6&DOCS=1-1&FMT=PUWTXDTS.fmt&OPDEF=Y&QUERY=%40FECH%26gt%3B%3D19930101+%26+%40FECH%26lt%3B%3D20150404+%26+%28D%29.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199805120157.CODI.#1', u'date': datetime.datetime(1998, 5, 12, 0, 0), u'_id': ObjectId('551f13017c4c671d6037a555'), u'description': u'Congreso de los Diputados, Pleno y Dip. Perm., n\xfam. 157, de 12/05/1998', u'id': u''}
{u'url': u'http://www.congreso.es/portal/page/portal/Congreso/PopUpCGI?CMD=VERDOC&CONF=BRSPUB.cnf&BASE=PUW6&PIECE=PUW6&DOCS=1-1&FMT=PUWTXDTS.fmt&OPDEF=Y&QUERY=%40FECH%26gt%3B%3D19930101+%26+%40FECH%26lt%3B%3D20150404+%26+%28D%29.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199805130158.CODI.#1', u'date': datetime.datetime(1998, 5, 13, 0, 0), u'_id': ObjectId('551f13017c4c671d6037a556'), u'description': u'Congreso de los Diputados, Pleno y Dip. Perm., n\x

In [16]:
# Mètode per descarregar els documents del diccionari 'all_document_dictionary' que corresponen a sessions de control del Congrés

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import os
import datetime

def download_docs_to_DB(all_doc_collection, doc_collection, overwrite = False):
    
    os.environ["PATH"] = 'C:\Users\pablo_000\Documents\P\Data Science\chromedriver' \
        + os.pathsep + 'C:\Users\pfernandezs\Documents\P\Cosas\Data Science\posgrau\chromedriver'

    delay = 10
    element_id = 'PIE_POPUP'
    
    browser = webdriver.Chrome()
    
    for doc in all_doc_collection.find():
        # Aquí es filtra els documents a guardar per data (dimecres hi ha sessió de control al Congrés)
        if isWednesday(doc['date']) and (doc['id']=="" or overwrite):
            print "Loading: ...", doc['url'][-80:]
            browser.get(doc['url'])
            
            # Wait for the page to load
            # http://selenium-python.readthedocs.org/en/latest/waits.html
            try:
                WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID, element_id)))
                # print "Page is ready!"
            except TimeoutException:
                print "Loading took too much time!"
            except NoSuchElementException:
                print "Element id didn't find: " + element_id
            
            html_page = browser.page_source
            
            # Aquí es filtra si el diari correspon a una sessió de control cercan al texte del diari 'PREGUNTAS'
            if 'PREGUNTAS' in html_page:
                print "Saving: ...", doc['url'][-80:]
                new_doc = {'date':doc['date'], \
                           'url':doc['url'], \
                           'description':doc['description'], \
                           'content_doc':html_page, \
                           'updated_content_doc':datetime.datetime.utcnow(), \
                           'session_dictionary':{}, \
                           'updated_session_dictionary':datetime.datetime.utcnow() \
                           }
                
                # Desde la versió 3.0:
                # new_doc_id = doc_collection.insert_one(new_doc).inserted_id
                doc_collection.insert(new_doc)
                inserted_doc = doc_collection.find({'date':new_doc['date']})
                # http://docs.mongodb.org/manual/tutorial/modify-documents/
                all_doc_collection.update({'_id':doc['_id']},{'$set':{'id':inserted_doc[0]['_id']}},upsert=False, multi=False)

#
all_doc_col = db['all_document']
#
doc_col = db['document']
doc_col.drop()

# download_docs_to_DB(all_doc_col, doc_col)
download_docs_to_DB(all_doc_col, doc_col, True)


Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199805130158.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199804290155.CODI.#1
Saving: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199804290155.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199805200161.CODI.#1
Saving: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199805200161.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199710290111.CODI.#1
Saving: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199710290111.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199710220108.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199710080106.CODI.#1
Saving: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.ORSE.+Y+CDP199710080106.CODI.#1
Loading: ... 9.PUBL.+%26+%28CONGRESO%29.SECC.+%26+%28PLENO%29.OR

In [17]:
#
doc_col = db['document']

print "num. of rows: ", doc_col.count()

num. of rows:  449


#### LListat i descarrega dels documents seleccionats (sessions de control dels dimecres)

In [18]:
#
doc_col = db['document']

print "num. of rows: ", doc_col.count()
for doc in doc_col.find():
    file_name = doc['date'].strftime("%Y%m%d") + "_doc.html"
    print file_name, ":", doc['description']
    save_text_file(doc['content_doc'], 'docs/' + file_name)


num. of rows:  449
19980429_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 155, de 29/04/1998
19980520_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 161, de 20/05/1998
19971029_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 111, de 29/10/1997
19971008_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 106, de 08/10/1997
19970924_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 103, de 24/09/1997
19970507_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 81, de 07/05/1997
19970521_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 84, de 21/05/1997
19970528_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 87, de 28/05/1997
19970604_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 90, de 04/06/1997
19941130_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 111, de 30/11/1994
19941116_doc.html : Congreso de los Diputados, Pleno y Dip. Perm., núm. 104

In [19]:
# Per previndre aquest error: Overflow sort stage buffered data usage exceeds internal limit
# indexem el camp 'date' de la collection 'document'
# http://stackoverflow.com/questions/27023622/overflow-sort-stage-buffered-data-usage-exceeds-internal-limit
#
doc_col = db['document']

from pymongo import ASCENDING, DESCENDING

doc_col.create_index([("date", ASCENDING)])

u'date_1'

#### Import and Export MongoDB Data

In [None]:
# Import and Export MongoDB Data
# http://docs.mongodb.org/manual/core/import-export/

# http://stackoverflow.com/questions/13988479/loading-and-saving-mongodb-database-from-to-disk-with-pymongo

# Copying a Database
# http://api.mongodb.org/python/current/examples/copydb.html

# Export / Import
# http://stackoverflow.com/questions/11255630/how-to-export-all-collection-in-mongodb
# mongodump -d <our database name> -o <directory_backup>
# mongorestore <our database name>