# Visualització de les dades del congrés: k-means, ...

### Mètodes i utilitats

In [1]:
# http://stackoverflow.com/questions/7100125/storing-python-dictionaries

import json
from bson import json_util
import yaml

# http://api.mongodb.org/python/1.10.1/api/bson/json_util.html
# Mètodes per grabar y carregar fitxers en format json
def save_dict_json(dict, filename):
    with open(filename, 'wb') as fp:
        json.dump(dict, fp, default=json_util.default)

def load_dict_json(filename):
    with open(filename, 'rb') as fp:
        return json.load(fp, object_hook=json_util.object_hook)

# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
def load_dict_yaml(filename):
    with open(filename, 'rb') as fp:
        return yaml.load(fp)
    
# Mètode per grabar un fitxer de texte
def save_text_file(text, file_name, encoding = 'utf-8'):
    with open(file_name, "w") as text_file:
        if encoding:
            text_file.write(text.encode(encoding))
        else:
            text_file.write(text)

import ast
# Mètode per grabar un fitxer de texte
def save_list_text_file(text_list, file_name):
    print "Saving:", file_name, "lines:", len(text_list)
    with open(file_name, "w") as text_file:
        for item in text_list:
            text_file.write((r''.join(ast.literal_eval(item))+"\n").encode('utf-8'))
            # text_file.write(item+"\n")
        
# Mètode per carregar un fitxer de texte
def load_text_file(file_name, encoding = 'utf-8'):
    with open(file_name, "r") as text_file:
        if encoding:
            return text_file.read().decode(encoding)
        else:
            return text_file.read()
        
# Mètode per eliminar caracters incorrectes als noms de fitxers (potser millor utilitzar: https://pypi.python.org/pypi/goldfinch/0.4)
invalid_filename_chars_windows = '\/:*?"<>|'
def del_invalid_chars(value, deletechars):
    for c in deletechars:
        value = value.replace(c,'')
    return value;

import datetime

# Mètode per cambiar de format de data de format_1 a format_2
def format_date(strFecha, format_1, format_2):
    dtDate = datetime.datetime.strptime(strFecha, format_1)
    return dtDate.strftime(format_2)

# Mètode que ens diu si una data cau en dimecres
# https://docs.python.org/2/library/datetime.html#datetime.date.weekday
def isWednesday(my_date):
    if my_date.date().weekday() == 2:  # 0: Monday, 1: Tuesday, 2: Wednesday, ...
        return True
    else:
        return False
    
# print "24/03/2015", "isWednesday:", isWednesday("24/03/2015")
# print "11/03/2015", "isWednesday:", isWednesday("11/03/2015")

In [18]:
#
loaded_groups_dict = load_dict_json('groups_dict.json')

def get_group_name(group_code):
    if group_code:
        for id in loaded_groups_dict:
            if loaded_groups_dict[id]['code'] == group_code:
                return loaded_groups_dict[id]['name']
        print group_code, "not found"
    return ""
    
group_name = get_group_name('GS')
if group_name:
    print "group_name:", group_name
else:
    print "No results."
    
group_name = get_group_name('NA')
if group_name:
    print "group_name:", group_name
else:
    print "No results."

group_name: Socialista (PSOE)
NA not found
No results.


#### Obrir conexió a la BD 'congres'

In [2]:
import pymongo

# Create the connection to MongoDB
try:
    connection=pymongo.MongoClient()
    print "Connection to Mongo Daemon successful!!!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e
    # Obtenim la BD del Congrés
db = connection['congres']
print "Collections : ", db.collection_names()

Connection to Mongo Daemon successful!!!
Collections :  [u'system.indexes', u'legislatures', u'all_document', u'congres_groups', u'congres_members', u'document']


### Obtenim els documents de MongoDB

In [3]:
import datetime
import re
# Col.lecció de documents a la BD
p = re.compile(r'\b[a-zA-Z\xe1\xe9\xed\xf3\xfa\xc1\xc9\xcd\xd3\xda\xf1\xe7]+\b',re.UNICODE)

doc_col = db['document']

doc_start_date = "01/09/2000"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
doc_end_date = "01/04/2015"
d_doc_end_date = datetime.datetime.strptime(doc_end_date, "%d/%m/%Y")

result = doc_col.find({'date': {'$gte': d_doc_start_date, '$lte': d_doc_end_date}}).sort('date',1)

documents = []
text_documents = [] # documentos = [d1=p1+i11+...+i1M, ..., dN=pN+iN1+...]
docs_tokenized_keywords = []
    
if result.count() > 0 :
    for doc in result:
        print doc['description']
        dialog_tokenized_keywords = []
        for dialogo in doc['session_dictionary']:
            q_texto = ' '.join(p.findall(dialogo['question']))
            q_tokenized_keywords = []
            for keyword in dialogo['keywords']:
                q_tokenized_keywords += keyword[0].split()
            i_tokenized_keywords = []
            for intervencion in dialogo['intervention_dictionary']:
                for keyword in intervencion['keywords']:
                    i_tokenized_keywords += keyword[0].split()
                i_texto = ' '.join(p.findall(intervencion['text']))
                texto = q_texto + i_texto
            dialog_tokenized_keywords += (q_tokenized_keywords + i_tokenized_keywords)
            documents.append({'date':doc['date'], 'question':dialogo['question'], 'num_exp':dialogo['num_exp'], \
                              'speaker':dialogo['speaker'], 'group':dialogo['group'], \
                              'interventions': dialogo['intervention_dictionary'], \
                              'keywords':(q_tokenized_keywords + i_tokenized_keywords)})
            text_documents.append(texto)
        docs_tokenized_keywords += dialog_tokenized_keywords

print "Num. of tokenized keywords found in sessions:", len(docs_tokenized_keywords)
set_docs_tokenized_keywords = set(docs_tokenized_keywords)
print "Num. of tokenized keywords:", len(set_docs_tokenized_keywords)
print "len(documents):", len(documents)
print "len(text_documents):", len(text_documents)

Congreso de los Diputados, Pleno y Dip. Perm., núm. 23, de 13/09/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 25, de 20/09/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 28, de 27/09/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 31, de 04/10/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 34, de 18/10/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 46, de 29/11/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 49, de 13/12/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 52, de 20/12/2000
Congreso de los Diputados, Pleno y Dip. Perm., núm. 57, de 07/02/2001
Congreso de los Diputados, Pleno y Dip. Perm., núm. 60, de 14/02/2001
Congreso de los Diputados, Pleno y Dip. Perm., núm. 62, de 21/02/2001
Congreso de los Diputados, Pleno y Dip. Perm., núm. 65, de 07/03/2001
Congreso de los Diputados, Pleno y Dip. Perm., núm. 68, de 14/03/2001
Congreso de los Diputados, Pleno y Dip. Perm., núm. 71, de 21/03/2001
Congreso de los Dipu

In [4]:
# num documents
print "len(documents):", len(documents)
print "len(text_documents):", len(text_documents)

len(documents): 7000
len(text_documents): 7000


#### Load data

In [8]:
import pandas

# Load
# df
# df = pandas.read_pickle("congres/2015128_194656_df2pablo.pickle")
df = pandas.read_pickle("congres/2015129_01320_df2pablo_tf_25_k_45.pkl")
df

Unnamed: 0,date,label,keywords
0,2000-09-13,internacional,"[asuntos exteriores, política exterior, derech..."
1,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
2,2000-09-13,medio ambiente,"[energías renovables, seguridad nuclear, energ..."
3,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
4,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
5,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
6,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
7,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
8,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."
9,2000-09-13,economía,"[empresas, empresas españolas, medianas empres..."


In [6]:
# df.to_csv("congres/2015129_01320_df2pablo_tf_25_k_45.csv", sep=";", encoding='utf-8')

In [7]:
# df = pandas.read_csv("congres/2015129_01320_df2pablo_tf_25_k_45.csv", sep=";")
# df

In [8]:
df_congres_result = df[(df.label=='internacional') & (df.date=='2000-09-13')]
df_congres_result.iloc[0].keywords

[u'asuntos exteriores',
 u'pol\xedtica exterior',
 u'derechos humanos',
 u'cooperaci\xf3n',
 u'oriente medio',
 u'intereses espa\xf1oles',
 u'comunidad internacional',
 u'diplomacia espa\xf1ola',
 u'posici\xf3n com\xfan',
 u'secretario general',
 u'posici\xf3n',
 u'exteriores',
 u'intereses',
 u'cumbre iberoamericana',
 u'espa\xf1oles',
 u'am\xe9rica latina',
 u'visita',
 u'presidencia',
 u'cooperaci\xf3n internacional',
 u'asuntos generales']

In [92]:
import re
import nltk.data
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

def get_words_sentences_v3(text, keyword_list):
    words = {}
    sentences = []
    sentence_index = 0
    
    text_sentences = spanish_tokenizer.tokenize(text)

    for keyword in keyword_list:
        my_re = u"\\b(" + keyword + u"[a-záéíóúñç\\-]*)\\b"
        # print my_re
        p = re.compile(my_re)
        sentence_indexes = [i for i, text_sentence in enumerate(text_sentences) if p.search(text_sentence.lower())]
        # print "Ocurrences of", keyword, ":", len(sentence_indexes)
        if len(sentence_indexes) > 0:
            word_indexes = []
            for i in sentence_indexes:
                sentence = text_sentences[i]
                for j in range(2):
                    if i-j-1 >= 0:
                        prev = text_sentences[i-j-1]
                    else:
                        prev = ""
                    if i+j+1 < len(text_sentences):
                        next = text_sentences[i+j+1]
                    else:
                        next = ""
                    sentence = prev + " " + sentence + " " + next
                
                sentences.append(text_sentences[i])
                word_indexes.append(sentence_index)
                sentence_index += 1
                
            words[keyword] = word_indexes
            
    return words, sentences

def get_words_sentences_v2(doc_text, keyword_list, doc_url, doc_title, n = 20):
    words = {}
    sentences = []
    sentence_index = 0
    
    text_sentences = spanish_tokenizer.tokenize(doc_text)

    for keyword in keyword_list[:n]:
        if len(keyword) > 2:
            # A PolitiLines.js s'utilitza: EReg("(\\W?)\\b" + word + "\\b(\\W?)","gi");
            # sentence_indexes = [i for i, text_sentence in enumerate(text_sentences) if keyword in text_sentence.lower()]
            sentence_indexes = [i for i, text_sentence in enumerate(text_sentences) if re.search(r"(\W?)\b%s\b(\W?)" % keyword, text_sentence,re.IGNORECASE)]
            
            # print "Ocurrences of", keyword, ":", len(sentence_indexes)
            if len(sentence_indexes) > 0:
                word_indexes = []
                for i in sentence_indexes:
                    '''
                    # Afegir les dues frases anteriors i posteriors
                    sentence = text_sentences[i]
                    for j in range(2):
                        if i-j-1 >= 0:
                            prev = text_sentences[i-j-1]
                        else:
                            prev = ""
                        if i+j+1 < len(text_sentences):
                            next = text_sentences[i+j+1]
                        else:
                            next = ""
                        sentence = prev + " " + sentence + " " + next
                    sentence = "\"... " + sentence + " ...\""
                    '''
                    sentence = "\"... " + text_sentences[i] + " ...\""
                    if doc_url:
                        doc_href = '<a href=\"' + doc_url + '\" onclick=\"window.open(this.href, \'' + doc_title +'\', \'left=20,top=20,width=650,height=500,toolbar=1,resizable=0\'); return false;\">See in session control diary</a>'
                        sentence += ("\n" + doc_href)
                    sentences.append(sentence)
                    word_indexes.append(sentence_index)
                    sentence_index += 1

                words[keyword] = word_indexes

    return words, sentences

def get_words_sentences(text, keyword_list):
    words = {}
    sentences = []
    sentence_index = 0
    tokenized_text = text.split()
    tokenized_lower_text = text.lower().split()
    for keyword in keyword_list:
        # Llista de tots els indexos on apareix keyword a 
        keyword_indexes = [i for i, token in enumerate(tokenized_lower_text) if token == keyword]
        print "Ocurrences of", keyword, ":", len(keyword_indexes)
        if len(keyword_indexes) > 0:
            word_indexes = []
            for i in keyword_indexes:
                sentence = tokenized_text[i]
                for j in range(10):
                    if i-j-1 >= 0:
                        prev = tokenized_text[i-j-1]
                    else:
                        prev = ""
                    if i+j+1 < len(tokenized_text):
                        next = tokenized_text[i+j+1]
                    else:
                        next = ""
                    sentence = prev + " " + sentence + " " + next
                sentences.append("... " + sentence + " ...")
                word_indexes.append(sentence_index)
                sentence_index += 1
            
            words[keyword] = word_indexes
    
    return words, sentences

In [93]:
text_test = u'''
Señora vicepresidenta, los juicios de valor están bien y yo los respeto, pero prefiero remitirme a los hechos, y de hechos voy a hablar en relación con su respeto a la independencia judicial. Los secretarios de las salas de gobierno los nombra el Gobierno de España, previo informe de la Audiencia Nacional, de los tribunales superiores de justicia y del Tribunal Supremo. Está pronto a terminar el mandato del secretario de la Sala de gobierno de la Audiencia Nacional, que es un puesto clave porque coordina a todos los secretarios judiciales de la Audiencia Nacional y por sus manos pasa información clave relacionada con la corrupción política de algunos partidos políticos. Por eso, el Ministerio de Justicia el pasado día 20 de enero se dirigió a la Audiencia Nacional para presentarle cuatro posibles candidatos a esa sustitución y pedirle que informase sobre los mismos. La Audiencia Nacional, por unanimidad de todos los miembros de la Sala de gobierno, informó a favor de uno de los candidatos. (La señora vicepresidenta del Gobierno y ministra de la Presidencia, Sáenz de Santamaría Antón, habla en voz baja con el señor ministro de Justicia, Catalá Polo). ?Sí, pregúntele, pregúntele?. Resulta que ese informe no le debió de gustar al Ministerio de Justicia. No le debió de gustar porque están muy nerviosos con lo que está pasando en la Audiencia Nacional. Por eso remitieron una cartita en la que le decían que no querían un informe sobre cuatro candidatos sino sobre una candidata. Esta es la carta de la vergüenza (Muestra una carta), la firma la secretaria de Estado de Justicia, ese es su superior jerárquico y usted es la persona que coordina la acción del Gobierno y da las órdenes. La Audiencia Nacional se ha ratificado en el informe y en el candidato, ustedes ahora pueden nombrar a quien quieran; usted puede o no responder a esta pregunta como hace normalmente, incluso puede echarle la culpa a quien le dé la gana (Rumores), pero usted, señora vicepresidenta, es culpable de este desastre. Señora vicepresidenta, no le pregunto si esta carta le parece bonita o fea (Muestra de nuevo la carta), lo que quiero que me diga es si cree que esta desfachatez es respetuosa con la Audiencia Nacional y con la independencia de la justicia.
'''

words, sentences = get_words_sentences_v3(text_test, [u'nacional',u'corrup'])
print "words:", words
for i, sentence in enumerate(sentences):
    print i, ":", sentence

print "-"*120

words, sentences = get_words_sentences_v2(text_test, [u'nacional',u'corrupción'], "", "")
print "words:", words
for i, sentence in enumerate(sentences):
    print i, ":", sentence

print "-"*120

words, sentences = get_words_sentences(text_test, [u'nacional',u'corrupción'])
print "words:", words
for i, sentence in enumerate(sentences):
    print i, ":", sentence

words: {u'corrup': [7], u'nacional': [0, 1, 2, 3, 4, 5, 6]}
0 : Los secretarios de las salas de gobierno los nombra el Gobierno de España, previo informe de la Audiencia Nacional, de los tribunales superiores de justicia y del Tribunal Supremo.
1 : Está pronto a terminar el mandato del secretario de la Sala de gobierno de la Audiencia Nacional, que es un puesto clave porque coordina a todos los secretarios judiciales de la Audiencia Nacional y por sus manos pasa información clave relacionada con la corrupción política de algunos partidos políticos.
2 : Por eso, el Ministerio de Justicia el pasado día 20 de enero se dirigió a la Audiencia Nacional para presentarle cuatro posibles candidatos a esa sustitución y pedirle que informase sobre los mismos.
3 : La Audiencia Nacional, por unanimidad de todos los miembros de la Sala de gobierno, informó a favor de uno de los candidatos.
4 : No le debió de gustar porque están muy nerviosos con lo que está pasando en la Audiencia Nacional.
5 : La A

In [94]:
def get_candidate_index(candidates, ns):
    for id, candidate in enumerate(candidates):
        if ns.lower().strip() == candidate['name'].lower().strip():
            return id
    
    return -1

In [95]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('spanish')
wordnet = WordNetLemmatizer()

def get_stem_list(word_list):
    word_list_stemmed = []

    for word in word_list:
        # stemmed_word = porter.stem(word)
        stemmed_word = snowball.stem(word) # requires 'corpora/wordnet' -> nltk.download()
        # stemmed_word = wordnet.lemmatize(word) # requires 'corpora/wordnet' -> nltk.download()
        
        found = False
        for sw in word_list_stemmed:
            if stemmed_word == sw[0]:
                found = True
                if len(word) < len(sw[1]):
                    sw[1] = word
                sw[2].append(word)
        if not found:
            word_list_stemmed.append([stemmed_word, word, [word]])
            
    return word_list_stemmed

df_congres_result = df[(df.label=='internacional') & (df.date=='2000-09-13')]
# keywords = [x.strip() for x in re.sub('[\[\]]', '', df_congres_result.iloc[0].keywords).split(',')]
keywords = df_congres_result.iloc[0].keywords
print keywords
print get_stem_list(keywords)


[u'asuntos exteriores', u'pol\xedtica exterior', u'derechos humanos', u'cooperaci\xf3n', u'oriente medio', u'intereses espa\xf1oles', u'comunidad internacional', u'diplomacia espa\xf1ola', u'posici\xf3n com\xfan', u'secretario general', u'posici\xf3n', u'exteriores', u'intereses', u'cumbre iberoamericana', u'espa\xf1oles', u'am\xe9rica latina', u'visita', u'presidencia', u'cooperaci\xf3n internacional', u'asuntos generales']
[[u'asuntos exterior', u'asuntos exteriores', [u'asuntos exteriores']], [u'politica exterior', u'pol\xedtica exterior', [u'pol\xedtica exterior']], [u'derechos human', u'derechos humanos', [u'derechos humanos']], [u'cooper', u'cooperaci\xf3n', [u'cooperaci\xf3n']], [u'oriente medi', u'oriente medio', [u'oriente medio']], [u'intereses espa\xf1ol', u'intereses espa\xf1oles', [u'intereses espa\xf1oles']], [u'comunidad internacional', u'comunidad internacional', [u'comunidad internacional']], [u'diplomacia espa\xf1ol', u'diplomacia espa\xf1ola', [u'diplomacia espa\xf1o

In [9]:
import re

df_by_date = df.groupby('date')

'''
for session_date, docs in df_by_date:
    print "Date:", session_date, "num. docs:", len(docs)
    for idr, doc in docs.iterrows():
        print "\t", idr, doc['label'], ":", len(doc['keywords']), doc['keywords']
        # keywords = [x.strip() for x in re.sub('[\[\]]', '', doc['keywords']).split(',')]
        keywords = doc['keywords']
        print len(keywords), keywords
'''

'\nfor session_date, docs in df_by_date:\n    print "Date:", session_date, "num. docs:", len(docs)\n    for idr, doc in docs.iterrows():\n        print "\t", idr, doc[\'label\'], ":", len(doc[\'keywords\']), doc[\'keywords\']\n        # keywords = [x.strip() for x in re.sub(\'[\\[\\]]\', \'\', doc[\'keywords\']).split(\',\')]\n        keywords = doc[\'keywords\']\n        print len(keywords), keywords\n'

#### Debats agrupats per grups parlamentaris

In [98]:
def get_first_keywords(n_first_keywords = 25):

    first_keywords = {}
    for session_date, docs in df_by_date:
        print "Date:", session_date, "num. docs:", len(docs)

        pre_blocks = {session_date:[]}
        for idr, doc in docs.iterrows():
            # print "\t", idr, doc['label'], doc['keywords']
            document = documents[idr]
            
            # doc_keywords = [x.strip() for x in re.sub('[\[\]]', '', doc['keywords']).split(',')]
            words, sentences = get_words_sentences_v2(document['question'], doc['keywords'], "", "")
            # words, sentences = get_words_sentences_v2(document['question'], doc_keywords)
            pre_blocks[session_date].append({"words":words, "sentences":sentences})
            
            for int_index, intervention in enumerate(document['interventions']):
                words, sentences = get_words_sentences_v2(intervention['text'], doc['keywords'], "", "")
                # words, sentences = get_words_sentences_v2(intervention['text'], doc_keywords)
                pre_blocks[session_date].append({"words":words, "sentences":sentences})
        
        # Llista
        count_words = []
        for l_words in pre_blocks[session_date]:
            for keyword in l_words['words']:
                keyword_found = False
                for c_word in count_words:
                    if keyword == c_word[0]:
                        c_word[1] += len(l_words['words'][keyword])
                        keyword_found = True
                        break
                        
                if not keyword_found:
                    count_words.append([keyword, len(l_words['words'][keyword])])
                    
        count_words.sort(key=lambda col: col[1], reverse=True)
        print "len(count_words):", len(count_words)
        
        if len(count_words) > n_first_keywords:
            first_keywords[session_date]=[word[0] for word in count_words[:n_first_keywords]]
        else:
            first_keywords[session_date]=[word[0] for word in count_words]
        print first_keywords[session_date]
        
    return first_keywords

In [99]:
first_keywords = get_first_keywords()

Date: 2000-09-13 00:00:00 num. docs: 24
len(count_words): 37
[u'econom\xeda', u'econom\xeda espa\xf1ola', u'empresas', u'ense\xf1anza', u'j\xf3venes', u'tramo', u'datos', u'fomento', u'sanidad', u'peque\xf1as', u'salud', u'medianas empresas', u'empresas espa\xf1olas', u'sistema educativo', u'calidad', u'espa\xf1oles', u'industrial', u'corredor', u'asistencia sanitaria', u'profesionales sanitarios', u'sistema nacional', u'profesionales', u'servicios sanitarios', u'sanidad p\xfablica', u'sistema sanitario']
Date: 2000-09-20 00:00:00 num. docs: 27
len(count_words): 63
[u'vivienda', u'empresas', u'viviendas', u'protecci\xf3n oficial', u'econom\xeda', u'familias', u'alquiler', u'precio', u'deporte', u'empresa', u'seguridad social', u'actividad', u'j\xf3venes', u'educaci\xf3n', u'cultura', u'presidencia', u'investigaci\xf3n', u'peque\xf1as', u'vivienda protegida', u'acceso', u'ayudas', u'vivienda libre', u'empresas espa\xf1olas', u'econom\xeda espa\xf1ola', u'lengua']
Date: 2000-09-27 00:00:

In [108]:
'''
def add_candidate_block():
    ns = intervention['who']['name'] + " " + intervention['who']['surname']
    if intervention['who']['group']:
        # "candidates":
        candidate_index = get_candidate_index(candidates, intervention['who']['group'])
        if candidate_index < 0:
            candidates.append({'block_indices': [block_index],'name':intervention['who']['group'],'party':ns})
        else:
            candidates[candidate_index]['block_indices'].append(block_index)
        # "blocks":
        blocks.append({"issues":[doc['label']], "words":words, "sentences":sentences})

        block_index += 1
    else:
        print session_date, "num. exp:", document['num_exp'], ", None intervention['who']['group']"
'''

def get_anchor(num_exp):
    anchor_name = ""
    if num_exp:
        if '/' in num_exp:
            anchor_name = num_exp.split('/')[1]
        else:
            anchor_name = num_exp
    return anchor_name

def get_debates(debates_by = "day"):
    raw_debates = []
    block_index = 0
    candidates = []
    blocks = []
    current_year_month = ""
    current_id = -1

    for session_date, docs in df_by_date:
        print "Date:", session_date, "num. docs:", len(docs)
        # d_session_date = datetime.datetime.strptime(session_date, '%Y-%m-%d %H:%M:%S')
        d_session_date = session_date
        doc_url = "../politilines/data/marked/" + d_session_date.strftime('%Y%m%d') + "_doc_marked.html"
        doc_title = "Session diary from " + d_session_date.strftime('%d-%m-%Y') 
        
        if debates_by == "day":
            block_index = 0
            candidates = []
            blocks = []
        else:
            if current_year_month != d_session_date.strftime('%Y-%m'):
                block_index = 0
                candidates = []
                blocks = []
                current_id += 1
                new_month = True
            else:
                new_month = False
            
        for idr, doc in docs.iterrows():
            # print "\t", idr, doc['label'], doc['keywords']
            document = documents[idr]
            
            # doc_keywords = doc['keywords']
            doc_keywords = first_keywords[session_date]
            
            anchor_name = get_anchor(document['num_exp'])
            words, sentences = get_words_sentences_v2(document['question'], doc_keywords, doc_url + "#" + anchor_name, doc_title)
            
            if len(words):
                group_name = get_group_name(document['group'])
                if group_name:
                    candidate_index = get_candidate_index(candidates, group_name)
                    if candidate_index < 0:
                        # "candidates": [{"block_indices": [0, 9, 13, 15, 37, 48, 54, 59, 61, 75, 82, 87, 92, 96, 97], "name": "Ron Paul"},
                        candidates.append({'block_indices': [block_index],'name':group_name,'party':document['speaker']})
                    else:
                        candidates[candidate_index]['block_indices'].append(block_index)
                    # "blocks": [{"issues": ["Personal History"], "words": {"country": [1, 2]}, "sentences": ["", "And we have a lot of troubles around the world, as you see, the Middle East in flames, and what's going on in this country with gas prices and the economy.", "And I'm here to talk about a positive solutions that confront this country that include everybody from the bottom up.   "]}}
                    blocks.append({"issues":[doc['label']], "words":words, "sentences":sentences})

                    block_index += 1

            for int_index, intervention in enumerate(document['interventions']):
                anchor_name = get_anchor(document['num_exp'])
                words, sentences = get_words_sentences_v2(intervention['text'], doc_keywords, doc_url + "#" + anchor_name, doc_title)

                if len(words):
                    ns = intervention['who']['name'] + " " + intervention['who']['surname']
                    group_name = get_group_name(intervention['who']['group'])
                    if group_name:
                        # "candidates":
                        candidate_index = get_candidate_index(candidates, group_name)
                        if candidate_index < 0:
                            candidates.append({'block_indices': [block_index],'name':group_name,'party':ns})
                        else:
                            candidates[candidate_index]['block_indices'].append(block_index)
                        # "blocks":
                        blocks.append({"issues":[doc['label']], "words":words, "sentences":sentences})

                        block_index += 1

        if debates_by == "day":
            raw_debates.append([session_date, {'candidates':candidates, 'blocks':blocks}])
        else:
            current_year_month = d_session_date.strftime('%Y-%m')
            if new_month:
                raw_debates.append([d_session_date, {'candidates':candidates, 'blocks':blocks}])
            else:
                raw_debates[current_id] = [d_session_date, {'candidates':candidates, 'blocks':blocks}]
    
    return raw_debates

In [109]:
raw_debates_by_dates = get_debates()
print "-"*120
raw_debates_by_month = get_debates(debates_by = "month")

Date: 2000-09-13 00:00:00 num. docs: 24
Date: 2000-09-20 00:00:00 num. docs: 27

    POPULAR EN ELCONGRESO not found
Date: 2000-09-27 00:00:00 num. docs: 25
Date: 2000-10-04 00:00:00 num. docs: 22
Date: 2000-10-18 00:00:00 num. docs: 28
Date: 2000-11-29 00:00:00 num. docs: 24
Date: 2000-12-13 00:00:00 num. docs: 28
Date: 2000-12-20 00:00:00 num. docs: 22
Date: 2001-02-07 00:00:00 num. docs: 27
Date: 2001-02-14 00:00:00 num. docs: 26
Date: 2001-02-21 00:00:00 num. docs: 27

    POPULAR EN EL CONGFRESO not found
Date: 2001-03-07 00:00:00 num. docs: 25
Date: 2001-03-14 00:00:00 num. docs: 27
Date: 2001-03-21 00:00:00 num. docs: 25
Date: 2001-04-04 00:00:00 num. docs: 28
Date: 2001-04-18 00:00:00 num. docs: 26
Date: 2001-04-25 00:00:00 num. docs: 27
Date: 2001-05-16 00:00:00 num. docs: 24
Date: 2001-05-23 00:00:00 num. docs: 27
Date: 2001-05-30 00:00:00 num. docs: 26
Date: 2001-06-13 00:00:00 num. docs: 25
Date: 2001-06-20 00:00:00 num. docs: 28
Date: 2001-09-12 00:00:00 num. docs: 22
Date

In [110]:
# ordenar
raw_debates_by_dates.sort(key=lambda col: col[0], reverse=True)
raw_debates_by_month.sort(key=lambda col: col[0], reverse=True)

print "len(raw_debates_by_dates):", len(raw_debates_by_dates)
print "len(raw_debates_by_month):", len(raw_debates_by_month)

len(raw_debates_by_dates): 310
len(raw_debates_by_month): 131


#### Resum de resultats, get_debates_by_politicians:

In [95]:
# ordenar
raw_debates.sort(key=lambda col: col[0], reverse=True)

for raw_debate in raw_debates:
    print "Date:", raw_debate[0], "num. blocks:", len(raw_debate[1]['blocks'])
    candidates = raw_debate[1]['candidates']
    print "candidates:", len(candidates)
    l_candidates = []
    parties = []
    for candidate in candidates:
        if not candidate['name'] in l_candidates:
            l_candidates.append(candidate['name'])
        if not candidate['party'] in parties:
            parties.append(candidate['party'])
    print "\t candidates:", len(l_candidates), ", ", l_candidates
    print "\t parties:", len(parties), ", ", parties

Date: 2015-03-11 00:00:00 num. blocks: 44
candidates: 6
	 candidates: 6 ,  [u'Mixto', u'Popular (PP)', u'Converg\xe8ncia i Uni\xf3', u'UPyD', u'Socialista (PSOE)', u'Izquierda Unida']
	 parties: 6 ,  [u'ALFRED BOSCH I PASCUAL', u'Mariano Rajoy Brey', u'PERE MACIAS I ARAU', u'Irene Lozano Domingo', u'Mario Bedera Bravo', u'<group>']
Date: 2015-02-18 00:00:00 num. blocks: 64
candidates: 7
	 candidates: 7 ,  [u'UPyD', u'Popular (PP)', u'Socialista (PSOE)', u'Izquierda Unida', u'Converg\xe8ncia i Uni\xf3', u'Mixto', u'PNV']
	 parties: 7 ,  [u'ROSA MAR\xcdA D\xcdEZ GONZ\xc1LEZ', u'Mariano Rajoy Brey', u'Pedro S\xe1nchez P\xe9rez-Castej\xf3n', u'Alberto Garz\xf3n Espinosa', u'MART\xcd BARBER\xc0 I MONTSERRAT', u'Alfred Bosch i Pascual', u'Isabel S\xe1nchez Robles']
Date: 2015-02-11 00:00:00 num. blocks: 34
candidates: 5
	 candidates: 5 ,  [u'Izquierda Unida', u'Popular (PP)', u'Socialista (PSOE)', u'Mixto', u'Converg\xe8ncia i Uni\xf3']
	 parties: 5 ,  [u'CAYO LARA MOYA', u'Mariano Rajoy Bre

### Export debates_by_date.json and debates_by_month.json

#### Get legislature by date

In [111]:
def get_legislature(legislatures_dict, d_doc_date):
    for key in legislatures_dict:
        if legislatures_dict[key]['start'].date() <= d_doc_date.date() <= legislatures_dict[key]['end'].date():
            return legislatures_dict[key]['_id'], legislatures_dict[key]['name']
    print "ERROR, get_legislature: date out of range:", doc_date
    return -1, ""

# Test get_legislature
loaded_legislatures_dict = load_dict_json('legislatures_dict.json')

my_date = "03/06/2012"
d_doc_date = datetime.datetime.strptime(my_date, "%d/%m/%Y")
print "my_date:", my_date, ", legislature:", get_legislature(loaded_legislatures_dict, d_doc_date)

my_date: 03/06/2012 , legislature: (10, u'X LEGISLATURA')


In [112]:
import datetime
import os

def save_debates(raw_debates, file_path, debates_by):

    #
    debates_dict = {}

    print "Saving... debates_by_" + debates_by + ".json"

    for raw_debate in raw_debates:
        d_session_date = raw_debate[0]
        # d_session_date = datetime.datetime.strptime(raw_debate[0], '%Y-%m-%d %H:%M:%S')
        legislature_id, legislature = get_legislature(loaded_legislatures_dict, d_session_date)

        file_name = "debate_" + datetime.datetime.strftime(d_session_date, "%Y%m%d") + "_by_" + debates_by + ".json"
        print "Saving", file_name, "..."
        save_dict_json(raw_debate[1], file_path + file_name)

        file_dict = {'url':file_name, 'city':'Madrid', 'date':datetime.datetime.strftime(d_session_date, "%Y-%m-%d"), 
                     'sponsors':'', 'venue':'Congreso de los diputados'}

        if legislature_id in debates_dict:
            debates_dict[legislature_id]['debates'].append( file_dict )
        else:
            debates_dict[legislature_id] = {'name':legislature,'listCollapse':True, 'debates':[ file_dict ]}

    debates = []
    for legislature_id in range(7,10+1):
        # for debate_legislature in debates_dict:
        print "Legislatura:", legislature_id
        debates.append(debates_dict[legislature_id])

    print "Saving... debates_by_" + debates_by + ".json"
    save_dict_json({'debateCategories':debates}, file_path + 'debates_by_' + debates_by + '.json')

In [113]:
#
file_path = 'docs/politilines_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
if not os.path.exists(file_path):
    os.makedirs(file_path)
    
save_debates(raw_debates_by_dates, file_path, "day")
print "-"*120
save_debates(raw_debates_by_month, file_path, "month")

Saving... debates_by_day.json
Saving debate_20150311_by_day.json ...
Saving debate_20150218_by_day.json ...
Saving debate_20150211_by_day.json ...
Saving debate_20150121_by_day.json ...
Saving debate_20141217_by_day.json ...
Saving debate_20141210_by_day.json ...
Saving debate_20141126_by_day.json ...
Saving debate_20141119_by_day.json ...
Saving debate_20141029_by_day.json ...
Saving debate_20141015_by_day.json ...
Saving debate_20141008_by_day.json ...
Saving debate_20140924_by_day.json ...
Saving debate_20140917_by_day.json ...
Saving debate_20140910_by_day.json ...
Saving debate_20140709_by_day.json ...
Saving debate_20140625_by_day.json ...
Saving debate_20140528_by_day.json ...
Saving debate_20140514_by_day.json ...
Saving debate_20140507_by_day.json ...
Saving debate_20140409_by_day.json ...
Saving debate_20140319_by_day.json ...
Saving debate_20140312_by_day.json ...
Saving debate_20140212_by_day.json ...
Saving debate_20140122_by_day.json ...
Saving debate_20131218_by_day.json

## The Words They Used

In [19]:
#
loaded_groups_dict = load_dict_json('groups_dict.json')

def get_group_trend(group_code):
    if group_code:
        for id in loaded_groups_dict:
            if loaded_groups_dict[id]['code'] == group_code:
                return loaded_groups_dict[id]['trend']
        print group_code, "not found"
    return ""

def get_group_trend_titles():
    left_groups = []
    right_groups = []
    for id in loaded_groups_dict:
        if loaded_groups_dict[id]['trend'] == 'I':
            if not loaded_groups_dict[id]['name'] in left_groups:
                left_groups.append(loaded_groups_dict[id]['name'])
        elif loaded_groups_dict[id]['trend'] == 'D':
            if not loaded_groups_dict[id]['name'] in right_groups:
                right_groups.append(loaded_groups_dict[id]['name'])

    left_group_title = "Left parties:"
    for l_group in left_groups:
        left_group_title += ("\n" + l_group)
    right_group_title = "Right parties:"
    for r_group in right_groups:
        right_group_title += ("\n" + r_group)
    
    return left_group_title, right_group_title


In [20]:
left_group_title, right_group_title = get_group_trend_titles()
    
left_groups_template = '<div class="g-democrat g-arrow">Words favored<br><span title="{0}">by Left parties</span></div>'
right_groups_template = '<div class="g-republican g-arrow">Words favored<br><span title="{0}">by Right parties</span></div>'
print left_groups_template.replace('{0}', left_group_title)
print "-"*120
print right_groups_template.replace('{0}', right_group_title)
print "-"*120
group_trend = get_group_trend('GS')
if group_trend:
    print "group_trend:", group_trend
else:
    print "No results."


<div class="g-democrat g-arrow">Words favored<br><span title="Left parties:
Socialista (PSOE)
Izquierda Unida
Esquerra Republicana">by Left parties</span></div>
------------------------------------------------------------------------------------------------------------------------
<div class="g-republican g-arrow">Words favored<br><span title="Right parties:
PNV
Coalición Canaria
Popular (PP)
Convergència i Unió
UPyD">by Right parties</span></div>
------------------------------------------------------------------------------------------------------------------------
group_trend: I


In [21]:
str_i_speeches = u'''
Creo que ahora es cuando echamos en falta las carencias de su política antiinflacionista, ahora es cuando echamos en falta un Gobierno serio que no dudó en apuntarse los tantos de la bajada de la inflación pero que ahora elude sus responsabilidades ante la subida, y es cuando le pregunto qué valoración hace de esa pérdida de poder adquisitivo que están sufriendo los españoles como consecuencia de nuestra elevada inflación.
'''

keyword = u'españoles'
#  RegExp("\\b(" + d3.requote(name) + ")\\b", "gi")
print re.findall(r'\b(%s)\b' % re.escape(keyword),str_i_speeches,flags=re.IGNORECASE)


[u'espa\xf1oles']


In [23]:
import os
import re

def remove_spaces_and_newline(s):
    s = s.replace('\n','')
    s = s.replace('"','\\"')
    return re.sub(r'\s+', ' ', s)

def add_speech(speeches, speaker, speech):
    for element in speeches:
        if speaker == element[0]:
            element[1].append(speaker.upper() + ": " + speech)
            return speeches
    speeches.append([speaker, [speaker.upper() + ": " + speech]])
    return speeches

def add_speaker(speakers, speaker, group):
    for element in speakers:
        if speaker == element[0]:
            return speakers
    speakers.append([speaker,group])
    return speakers            

#
file_path = 'docs/ny_times_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
if not os.path.exists(file_path):
    os.makedirs(file_path)

# Load templates
js_file_template = load_text_file('../D3/Our version of The Words They Used/The Words They Used/templates/index_unformated.js')
html_file_template = load_text_file('../D3/Our version of The Words They Used/The Words They Used/templates/The Words They Used_unformated.html')

speech_template = '{name: "democrat", speeches: [{0}]}, {name: "republican", speeches: [{1}]}'
# print speech_template

speaker_template = '"{0}": {name: "{1}", title: "{2}"}'
# print speaker_template

topic_template = '{name: "{0}", re: /\\b({1})\\b/gi, x: {2}, y: {3}}'

#
for session_date, docs in df_by_date:
    print "-"*120
    print "Date:", session_date, "num. docs:", len(docs)

    i_speeches = []
    d_speeches = []
    speakers = []
    session_keywords = []
    
    for idr, doc in docs.iterrows():
        # print "\t", idr, doc['label'], doc['keywords']

        document = documents[idr]

        group_trend = get_group_trend(document['group'])
        if group_trend and group_trend in ['I','D']:
            cleaned_speaker = remove_spaces_and_newline(document['speaker'])
            cleaned_text = remove_spaces_and_newline(document['question'])
            if group_trend == 'I':
                i_speeches = add_speech(i_speeches, cleaned_speaker, cleaned_text)
            if group_trend == 'D':
                d_speeches = add_speech(d_speeches, cleaned_speaker, cleaned_text)

            speakers = add_speaker(speakers, cleaned_speaker, document['group'])

        for intervention in document['interventions']:
            group_trend = get_group_trend(intervention['who']['group'])
            if group_trend and group_trend in ['I','D']:
                cleaned_ns = remove_spaces_and_newline(intervention['who']['name'] + " " + intervention['who']['surname'])
                cleaned_text = remove_spaces_and_newline(intervention['text'])
                if group_trend == 'I':
                    i_speeches = add_speech(i_speeches, cleaned_ns.upper(), cleaned_text)
                if group_trend == 'D':
                    d_speeches = add_speech(d_speeches, cleaned_ns.upper(), cleaned_text)

                speakers = add_speaker(speakers, cleaned_ns, intervention['who']['group'])
                    
        session_keywords += doc['keywords']

    str_i_speeches = "\""
    for i, speech in enumerate(i_speeches):
        for s in speech[1]:
            str_i_speeches += ("\\n" + s)
        if i<len(i_speeches)-1:
            str_i_speeches += "\",\n\""
        else:
            str_i_speeches += "\""
    str_speech = speech_template.replace('{0}',str_i_speeches)
    str_d_speeches = "\""
    for i, speech in enumerate(d_speeches):
        for s in speech[1]:
            str_d_speeches += ("\\n" + s)
        if i<len(d_speeches)-1:
            str_d_speeches += "\",\n\""
        else:
            str_d_speeches += "\""
    str_speech = str_speech.replace('{1}',str_d_speeches)
    # print str_speech
    # print "-"*120

    str_speakers = ""
    for speaker in speakers:
        str_speaker = speaker_template.replace('{0}',speaker[0].upper())
        str_speaker = str_speaker.replace('{1}',speaker[0])
        str_speaker = str_speaker.replace('{2}',get_group_name(speaker[1]))
        if str_speakers:
            str_speakers = str_speakers + ", " + str_speaker
        else:
            str_speakers = str_speaker
    # print str_speakers
    # print "-"*120
    
    str_topics = ""
    set_session_keywords = set(session_keywords)
    print "len(set_session_keywords):", len(set_session_keywords)
    found_session_keywords = []
    max_key_oc = 0
    for keyword in set_session_keywords:
        if len(keyword) > 2:
            # Cerca equivalent a RegExp("\\b(" + d3.requote(name) + ")\\b", "gi")  --> gi : global search ignorecase
            keyword_oc_i = len(re.findall(r'\b(%s)\b' % re.escape(keyword),str_i_speeches,flags=re.IGNORECASE))
            keyword_oc_d = len(re.findall(r'\b(%s)\b' % re.escape(keyword),str_d_speeches,flags=re.IGNORECASE))
            keyword_oc = keyword_oc_i + keyword_oc_d
            if keyword_oc > 0 :
                if max_key_oc < keyword_oc_i + keyword_oc_d:
                    max_key_oc = keyword_oc_i + keyword_oc_d
                found_session_keywords.append([keyword,keyword_oc_i,keyword_oc_d, keyword_oc])
    
    # TODO: reducir la lista de keywords utilizando 'stem' (lematización) y pasando en lugar de la keyword algo como esto:
    #       {name: "Economy", re: /\b(econom[a-z]+)\b/gi, x: 410, y: 340}
    print "len(found_session_keywords):",len(found_session_keywords)
    if len(found_session_keywords) > 0:
        # ordenar
        found_session_keywords.sort(key=lambda col: col[3], reverse=True)
        if len(found_session_keywords) > 15:
            final_session_keywords = found_session_keywords[:15]

        width = 970
        height = 540
        padding = 4
        r_factor = 2.
        max_r = max_key_oc*r_factor
        max_rows = max(1,int(height / (2.*max_r)))
        print doc['label']
        print "width:", width, "height:", height, "max_r:", max_r, "max_rows:", max_rows, "r_factor:", r_factor, "padding:", padding
        x_c = 0
        row_c = 1
        for keyword in final_session_keywords:
            # {name: "American dream", re: /\b(American dream)\b/gi, x: 558, y: 181},
            str_topic = topic_template.replace('{0}', keyword[0])
            str_topic = str_topic.replace('{1}', keyword[0])

            # calculem x, y
            key_r = r_factor*(keyword[1]+keyword[2])
            if x_c + key_r < width:
                x=x_c + key_r
                y=row_c*key_r
                x_c += key_r + padding
            else:
                if row_c+1 <= max_rows:
                    row_c += 1
                else:
                    print "ERROR: not enought space for all topics."
                    break
                x=max_r
                y=row_c*key_r
                x_c = max_r + key_r + padding
            str_topic = str_topic.replace('{2}', str(x))
            str_topic = str_topic.replace('{3}', str(y))

            if str_topics:
                str_topics = str_topics + ", " + str_topic
            else:
                str_topics = str_topic    

    #
    js_file = js_file_template.replace('{0}',str_speech)
    js_file = js_file.replace('{1}',str_speakers)
    js_file = js_file.replace('{2}',str_topics)

    str_format_date = datetime.datetime.strftime(session_date, "%Y%m%d")
    file_js_name = "js_file_" + str_format_date + ".js"
    print "Saving... ", file_js_name
    save_text_file(js_file, file_path + file_js_name)
    
    #
    left_group_title, right_group_title = get_group_trend_titles()
    html_file = html_file_template.replace('{0}',left_group_title)
    html_file = html_file.replace('{1}',right_group_title)
    html_file = html_file.replace('{2}',file_js_name)
    file_html_name = "The Words They Used_" + str_format_date + ".html"
    print "Saving... ", file_html_name
    save_text_file(html_file, file_path + file_html_name)
            

------------------------------------------------------------------------------------------------------------------------
Date: 2000-09-13 00:00:00 num. docs: 24

    POPULAR EN ELCONGRESO not found

    POPULAR EN ELCONGRESO not found
len(set_session_keywords): 208
len(found_session_keywords): 74
economía
width: 970 height: 540 max_r: 48.0 max_rows: 5 r_factor: 2.0 padding: 4
Saving...  js_file_20000913.js
Saving...  The Words They Used_20000913.html
------------------------------------------------------------------------------------------------------------------------
Date: 2000-09-20 00:00:00 num. docs: 27
 CATALÁN
    (CONVERGÉNCIA I UNIÓ) not found

    POPULAR EN ELCONGRESO not found

    CATALÁN (CONVERGÉNCIA I UNIÓ) not found
len(set_session_keywords): 245
len(found_session_keywords): 106
economía
width: 970 height: 540 max_r: 160.0 max_rows: 1 r_factor: 2.0 padding: 4
Saving...  js_file_20000920.js
Saving...  The Words They Used_20000920.html
-----------------------------------

In [78]:
section = '''
"\nJOSÉ LUIS RODRÍGUEZ ZAPATERO:  Gracias, señor presidente. Sí, lo mantengo, señor Rajoy. (Aplausos.)\nJOSÉ LUIS RODRÍGUEZ ZAPATERO:  Muchas gracias, señor presidente. Mi opinión es de respeto a ese editorial, como a todos los editoriales, y en este caso un respeto que expresa, en mi opinión, un sentimiento ampliamente mayoritario en la sociedad catalana, que desea autogobierno, que desea ser respetada, que desea que nadie desde el resto de España utilice su voluntad de autogobierno para enfrentar territorios y que desea seguir aportando todo lo que ha aportado históricamente en términos económicos, culturales y sociales al proyecto común que es España, la España democrática, plural. Deseo una sentencia del Tribunal Constitucional que reciba el acatamiento, por supuesto, y el apoyo de la inmensa mayoría de los ciudadanos y de los catalanes. Deseo que la sentencia confirme lo que he votado aquí: la constitucionalidad del Estatuto. Deseo que el Tribunal Constitucional trabaje con libertad, con autonomía y con independencia, y espero que una vez más una sentencia y una institución como el Tribunal Constitucional consolide el proyecto de desarrollo territorial de España que tiene en el Estatuto de Cataluña una pieza importante, en mi opinión. Y deseo que esa sentencia sirva para la integración. Lo que no haré será juicios previos. Hablemos cuando la sentencia se pronuncie, y todos debemos estar dispuestos no solo a acatarla, por supuesto, sino a favorecer también con la sentencia esa tarea de integración. No recordaré por qué tenemos que estar esperando esta sentencia, a pesar de lo que algunos han dicho, pero le puedo asegurar que si tuviera que votar hoy otra vez el Estatuto de Cataluña lo votaría con el mismo convencimiento de su constitucionalidad. Muchas gracias. (Aplausos.)"
'''

#  RegExp("\\b(" + d3.requote(name) + ")\\b", "gi")
print re.findall(r'(?:\n|^)([A-ZÁÉÍÓÚÑ\.()\- ]+):',section)


['JOS\xc3\x89 LUIS RODR\xc3\x8dGUEZ ZAPATERO', 'JOS\xc3\x89 LUIS RODR\xc3\x8dGUEZ ZAPATERO']


In [17]:
question = '''
FELIPE JESÚS SICILIA ALFÉREZ EN SUSTITUCIÓN DEL 
    DIPUTADO DON ALEJANDRO ALONSO NÚÑEZ: CREE EL GOBIERNO QUE CON UNA REDUCCIÓN DEL 31,2% DEL PRESUPUESTO DEL MINISTERIO DE AGRICULTURA, ALIMENTACIÓN Y MEDIO AMBIENTE SE PUEDEN LLEVAR A CABO LAS POLÍTICAS QUE LE CORRESPONDEN
'''
print remove_spaces_and_newline(question)

FELIPE JESÚS SICILIA ALFÉREZ EN SUSTITUCIÓN DEL DIPUTADO DON ALEJANDRO ALONSO NÚÑEZ: CREE EL GOBIERNO QUE CON UNA REDUCCIÓN DEL 31,2% DEL PRESUPUESTO DEL MINISTERIO DE AGRICULTURA, ALIMENTACIÓN Y MEDIO AMBIENTE SE PUEDEN LLEVAR A CABO LAS POLÍTICAS QUE LE CORRESPONDEN
