# Paquetes

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd

# Funciones

In [2]:
def get_html(url):
    '''
    Requests url and parses text. Returns Beautiful Soup object.
    '''
    resp = requests.get(url).text
    return BeautifulSoup(resp,"lxml")

In [3]:
def get_speech(soup):
    '''
    Returns text of speech
    '''
    return str(soup.find_all('p'))

In [4]:
def remove_tags(text):
    '''
    Strips text from html and other tags
    '''
    return re.sub('<[^>]+>|\]|\[|\\u200b|\\n|\xa0', '', text)

In [5]:
def get_speech_elements(speech_url):
    '''
    Returns speech elements
    '''
    soup = get_html(speech_url)
    speech = remove_tags(get_speech(soup))
    return [speech]

# Procesamiento

## 2002

In [23]:
# Fuente inicial - Agosto 2002
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2002/agosto/agosto2002.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
links = links[:15]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2002/agosto/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Aug_2002 = pd.DataFrame(tuples)
Aug_2002=Aug_2002.rename(columns={0: "word", 1: "times_Aug_2002"})
Aug_2002

Unnamed: 0,word,times_Aug_2002
0,the,140
1,of,53
2,ustedes,52
3,and,52
4,social,49
...,...,...
3675,humana,1
3676,vanidad,1
3677,rectificar,1
3678,incurra,1


In [28]:
# Fuente inicial - Septiembre 2002
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2002/septiembre/septiembre2002.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
base_link = "http://historico.presidencia.gov.co/discursos/discbursos2002/septiembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Sep_2002 = pd.DataFrame(tuples)
Sep_2002=Sep_2002.rename(columns={0: "word", 1: "times_Sep_2002"})
Sep_2002

Unnamed: 0,word,times_Sep_2002
0,the,12
1,resource,12
2,you,12
3,are,12
4,looking,12
5,for,12
6,been,12
7,removed,12
8,had,12
9,its,12


In [30]:
# Fuente inicial - Octubre 2002
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2002/octubre/octubre2002.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2002/octubre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Oct_2002 = pd.DataFrame(tuples)
Oct_2002=Oct_2002.rename(columns={0: "word", 1: "times_Oct_2002"})
Oct_2002

Unnamed: 0,word,times_Oct_2002
0,ustedes,142
1,país,131
2,colombia,128
3,estado,119
4,gobierno,117
...,...,...
5945,efectivo,1
5946,contención,1
5947,cardenal,1
5948,pubiano,1


In [33]:
# Fuente inicial - Noviembre 2002
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2002/noviembre/noviembre2002.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2002/noviembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Nov_2002 = pd.DataFrame(tuples)
Nov_2002=Nov_2002.rename(columns={0: "word", 1: "times_Nov_2002"})
Nov_2002

Unnamed: 0,word,times_Nov_2002
0,colombia,121
1,patria,113
2,seguridad,112
3,fuerza,112
4,gobierno,111
...,...,...
6531,lejanos,1
6532,agobiados,1
6533,maltratado,1
6534,derrotarla,1


In [34]:
# Fuente inicial - Diciembre 2002
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2002/diciembre/diciembre2002.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2002/diciembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Dec_2002 = pd.DataFrame(tuples)
Dec_2002=Dec_2002.rename(columns={0: "word", 1: "times_Dec_2002"})
Dec_2002

Unnamed: 0,word,times_Dec_2002
0,ustedes,116
1,colombia,101
2,nación,95
3,patria,93
4,país,72
...,...,...
5669,comunicado,1
5670,proponernos,1
5671,caldas,1
5672,cobertura,1


In [35]:
speeches_2002=Aug_2002.merge(Sep_2002, how="outer", on="word")
speeches_2002=speeches_2002.merge(Oct_2002, how="outer", on="word")
speeches_2002=speeches_2002.merge(Nov_2002, how="outer", on="word")
speeches_2002=speeches_2002.merge(Dec_2002, how="outer", on="word")

speeches_2002 = speeches_2002.fillna(0)

speeches_2002["total_2002"]=speeches_2002["times_Aug_2002"] + speeches_2002["times_Sep_2002"] + speeches_2002["times_Oct_2002"] + speeches_2002["times_Nov_2002"] + speeches_2002["times_Dec_2002"]

speeches_2002

Unnamed: 0,word,times_Aug_2002,times_Sep_2002,times_Oct_2002,times_Nov_2002,times_Dec_2002,total_2002
0,the,140.0,12.0,3.0,1.0,1.0,157.0
1,of,53.0,0.0,0.0,1.0,0.0,54.0
2,ustedes,52.0,0.0,142.0,106.0,116.0,416.0
3,and,52.0,0.0,0.0,0.0,0.0,52.0
4,social,49.0,0.0,56.0,56.0,61.0,222.0
...,...,...,...,...,...,...,...
12767,sucedido,0.0,0.0,0.0,0.0,1.0,1.0
12768,regulares,0.0,0.0,0.0,0.0,1.0,1.0
12769,hiciéramos,0.0,0.0,0.0,0.0,1.0,1.0
12770,medir,0.0,0.0,0.0,0.0,1.0,1.0


In [36]:
speeches_2002.to_csv('Uribe_2002.csv') 

## 2003

In [12]:
# Fuente inicial - Enero 2003
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2003/enero/enero2003.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
links = links[:15]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/enero/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Jan_2003 = pd.DataFrame(tuples)
Jan_2003=Jan_2003.rename(columns={0: "word", 1: "times_Jan_2003"})
Jan_2003

Unnamed: 0,word,times_Jan_2003
0,gobierno,127
1,estado,88
2,año,86
3,va,86
4,mil,81
...,...,...
3690,apertura,1
3691,devolverle,1
3692,vigor,1
3693,ejemplar,1


In [13]:
# Fuente inicial - Febrero 2003
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2003/febrero/febrero2003.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
links = links[:15]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/febrero/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Feb_2003 = pd.DataFrame(tuples)
Feb_2003=Feb_2003.rename(columns={0: "word", 1: "times_Feb_2003"})
Feb_2003

Unnamed: 0,word,times_Feb_2003
0,estado,79
1,social,67
2,referendo,49
3,año,47
4,país,47
...,...,...
2675,conversara,1
2676,zarruk,1
2677,aplicamos,1
2678,especialmente,1


In [28]:
# Fuente inicial - Marzo 2003
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2003/marzo/marzo2003.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
links = links[:15]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/marzo/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Mar_2003 = pd.DataFrame(tuples)
Mar_2003=Mar_2003.rename(columns={0: "word", 1: "times_Mar_2003"})
Mar_2003

Unnamed: 0,word,times_Mar_2003
0,gobierno,52
1,terrorismo,34
2,terroristas,26
3,país,25
4,comunidad,24
...,...,...
2082,torio,1
2083,parece,1
2084,trato,1
2085,recoger,1


In [25]:
# Fuente inicial - Abril 2003
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2003/abril/abril2003.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
links = links[:15]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/abril/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Apr_2003 = pd.DataFrame(tuples)
Apr_2003= Apr_2003.rename(columns={0: "word", 1: "times_Apr_2003"})
Apr_2003

Unnamed: 0,word,times_Apr_2003
0,pública,75
1,colombia,74
2,gran,63
3,salud,58
4,universidad,55
...,...,...
3194,podría,1
3195,aprovecharse,1
3196,eléctrica,1
3197,alrededor,1


In [26]:
# Fuente inicial - Mayo 2003
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2003/mayo/mayo2003.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[15:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/mayo/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]

for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
May_2003 = pd.DataFrame(tuples)
May_2003=May_2003.rename(columns={0: "word", 1: "times_May_2003"})
May_2003

Unnamed: 0,word,times_May_2003
0,patria,68
1,colombia,67
2,ustedes,66
3,pública,53
4,derrotar,53
...,...,...
3056,autodefensas,1
3057,sometidos,1
3058,falsamente,1
3059,ilusionados,1


In [55]:
speeches_2003 = pd.DataFrame(columns = ['word'])
speeches_2003

Unnamed: 0,word


In [68]:
# Loop para cargar bases rápido

speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["Diciembre"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2003/" + mes + "/" + mes +  "2003.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.htm+?', str(principal))
    links = links[15:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2003/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2003 = pd.DataFrame(tuples)
    month = str(mes)
    df_2003=df_2003.rename(columns={0: "word", 1: month })
    speeches_2003=speeches_2003.merge(df_2003, how="outer", on="word")
speeches_2003  

Unnamed: 0,word,Junio,Julio,Agosto,Septiembre,Octubre,Noviembre,Diciembre
0,colombia,107.0,55.0,56.0,71.0,51.0,61.0,25.0
1,ustedes,68.0,27.0,76.0,82.0,49.0,70.0,44.0
2,país,50.0,32.0,24.0,33.0,45.0,29.0,25.0
3,social,49.0,25.0,38.0,45.0,48.0,33.0,21.0
4,mil,48.0,41.0,37.0,56.0,51.0,52.0,39.0
...,...,...,...,...,...,...,...,...
10535,electoraldiciembre,,,,,,,1.0
10536,ofensa,,,,,,,1.0
10537,difuntos,,,,,,,1.0
10538,cédula,,,,,,,1.0


In [69]:
speeches_2003_copy = speeches_2003
speeches_2003_copy

Unnamed: 0,word,Junio,Julio,Agosto,Septiembre,Octubre,Noviembre,Diciembre
0,colombia,107.0,55.0,56.0,71.0,51.0,61.0,25.0
1,ustedes,68.0,27.0,76.0,82.0,49.0,70.0,44.0
2,país,50.0,32.0,24.0,33.0,45.0,29.0,25.0
3,social,49.0,25.0,38.0,45.0,48.0,33.0,21.0
4,mil,48.0,41.0,37.0,56.0,51.0,52.0,39.0
...,...,...,...,...,...,...,...,...
10535,electoraldiciembre,,,,,,,1.0
10536,ofensa,,,,,,,1.0
10537,difuntos,,,,,,,1.0
10538,cédula,,,,,,,1.0


In [86]:
speeches_2003= speeches_2003_copy
speeches_2003=speeches_2003.merge(Jan_2003, how="outer", on="word")
speeches_2003=speeches_2003.merge(Feb_2003, how="outer", on="word")
speeches_2003=speeches_2003.merge(Mar_2003, how="outer", on="word")
speeches_2003=speeches_2003.merge(Apr_2003, how="outer", on="word")
speeches_2003=speeches_2003.merge(May_2003, how="outer", on="word")

speeches_2003 = speeches_2003.rename(columns={'Junio': 'times_Jun_2003', 'Julio': 'times_Jul_2003', 'Agosto': 'times_Aug_2003', 'Septiembre': 'times_Sep_2003', 'Octubre': 'times_Oct_2003', 'Noviembre': 'times_Nov_2003', 'Diciembre': 'times_Dec_2003'})

speeches_2003 = speeches_2003.fillna(0)

speeches_2003["total_2003"]=speeches_2003["times_Jan_2003"]  + speeches_2003["times_Feb_2003"]  + speeches_2003["times_Mar_2003"]  + speeches_2003["times_Apr_2003"]  + speeches_2003["times_May_2003"] + speeches_2003["times_Jun_2003"] + speeches_2003["times_Jul_2003"] + speeches_2003["times_Aug_2003"] + speeches_2003["times_Sep_2003"] + speeches_2003["times_Oct_2003"] + speeches_2003["times_Nov_2003"] + speeches_2003["times_Dec_2003"]

speeches_2003

Unnamed: 0,word,times_Jun_2003,times_Jul_2003,times_Aug_2003,times_Sep_2003,times_Oct_2003,times_Nov_2003,times_Dec_2003,times_Jan_2003,times_Feb_2003,times_Mar_2003,times_Apr_2003,times_May_2003,total_2003
0,colombia,107.0,55.0,56.0,71.0,51.0,61.0,25.0,67.0,32.0,22.0,74.0,67.0,688.0
1,ustedes,68.0,27.0,76.0,82.0,49.0,70.0,44.0,60.0,46.0,8.0,34.0,66.0,630.0
2,país,50.0,32.0,24.0,33.0,45.0,29.0,25.0,55.0,47.0,25.0,49.0,24.0,438.0
3,social,49.0,25.0,38.0,45.0,48.0,33.0,21.0,71.0,67.0,8.0,49.0,11.0,465.0
4,mil,48.0,41.0,37.0,56.0,51.0,52.0,39.0,81.0,45.0,0.0,42.0,12.0,504.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13751,lentitud,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13752,doblarse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13753,inaugurar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13754,cerros,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [124]:
speeches_2003.to_csv('Uribe_2003.csv') 

## 2004

In [91]:
speeches_2004 = pd.DataFrame(columns = ['word'])
speeches_2004

Unnamed: 0,word


In [112]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["sept"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2004/" + mes + "/" + mes +  "2004.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.htm+?', str(principal))
    links = links[15:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2004/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2004 = pd.DataFrame(tuples)
    month = str(mes)
    df_2004=df_2004.rename(columns={0: "word", 1: month })
    speeches_2004=speeches_2004.merge(df_2004, how="outer", on="word")
speeches_2004

Unnamed: 0,word,Enero,Febrero,Marzo,Abril,Junio,Julio,Agosto,Mayo,Octubre,Noviembre,Diciembre,sept
0,gobierno,68.0,99.0,56.0,35.0,99.0,76.0,57.0,26.0,93.0,42.0,18.0,73.0
1,ustedes,58.0,84.0,41.0,17.0,29.0,32.0,69.0,19.0,35.0,12.0,45.0,26.0
2,mil,49.0,67.0,34.0,40.0,66.0,64.0,31.0,15.0,61.0,14.0,34.0,34.0
3,año,44.0,66.0,49.0,49.0,82.0,76.0,42.0,23.0,59.0,36.0,34.0,76.0
4,nacional,44.0,58.0,21.0,13.0,15.0,13.0,13.0,1.0,27.0,18.0,5.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13109,ciclista,,,,,,,,,,,,1.0
13110,burrito,,,,,,,,,,,,1.0
13111,ciclorutas,,,,,,,,,,,,1.0
13112,ríonegro,,,,,,,,,,,,1.0


In [121]:
speeches_2004_copy = speeches_2004
speeches_2004_copy

Unnamed: 0,word,times_Jan_2004,times_Feb_2004,times_Mar_2004,times_Apr_2004,times_Jun_2004,times_Jul_2004,times_Aug_2004,times_May_2004,times_Oct_2004,times_Nov_2004,times_Dec_2004,times_Sep_2004
0,gobierno,68.0,99.0,56.0,35.0,99.0,76.0,57.0,26.0,93.0,42.0,18.0,73.0
1,ustedes,58.0,84.0,41.0,17.0,29.0,32.0,69.0,19.0,35.0,12.0,45.0,26.0
2,mil,49.0,67.0,34.0,40.0,66.0,64.0,31.0,15.0,61.0,14.0,34.0,34.0
3,año,44.0,66.0,49.0,49.0,82.0,76.0,42.0,23.0,59.0,36.0,34.0,76.0
4,nacional,44.0,58.0,21.0,13.0,15.0,13.0,13.0,1.0,27.0,18.0,5.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13109,ciclista,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13110,burrito,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13111,ciclorutas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
13112,ríonegro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [122]:
speeches_2004= speeches_2004_copy

speeches_2004 = speeches_2004.rename(columns={'Mayo': 'times_May_2004','Abril': 'times_Apr_2004','Marzo': 'times_Mar_2004', 'Febrero': 'times_Feb_2004','Enero': 'times_Jan_2004','Junio': 'times_Jun_2004', 'Julio': 'times_Jul_2004', 'Agosto': 'times_Aug_2004', 'sept': 'times_Sep_2004', 'Octubre': 'times_Oct_2004', 'Noviembre': 'times_Nov_2004', 'Diciembre': 'times_Dec_2004'})

speeches_2004 = speeches_2004.fillna(0)

speeches_2004["total_2004"]=speeches_2004["times_Jan_2004"]  + speeches_2004["times_Feb_2004"]  + speeches_2004["times_Mar_2004"]  + speeches_2004["times_Apr_2004"]  + speeches_2004["times_May_2004"] + speeches_2004["times_Jun_2004"] + speeches_2004["times_Jul_2004"] + speeches_2004["times_Aug_2004"] + speeches_2004["times_Sep_2004"] + speeches_2004["times_Oct_2004"] + speeches_2004["times_Nov_2004"] + speeches_2004["times_Dec_2004"]

speeches_2004

Unnamed: 0,word,times_Jan_2004,times_Feb_2004,times_Mar_2004,times_Apr_2004,times_Jun_2004,times_Jul_2004,times_Aug_2004,times_May_2004,times_Oct_2004,times_Nov_2004,times_Dec_2004,times_Sep_2004,total_2004
0,gobierno,68.0,99.0,56.0,35.0,99.0,76.0,57.0,26.0,93.0,42.0,18.0,73.0,742.0
1,ustedes,58.0,84.0,41.0,17.0,29.0,32.0,69.0,19.0,35.0,12.0,45.0,26.0,467.0
2,mil,49.0,67.0,34.0,40.0,66.0,64.0,31.0,15.0,61.0,14.0,34.0,34.0,509.0
3,año,44.0,66.0,49.0,49.0,82.0,76.0,42.0,23.0,59.0,36.0,34.0,76.0,636.0
4,nacional,44.0,58.0,21.0,13.0,15.0,13.0,13.0,1.0,27.0,18.0,5.0,25.0,253.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13109,ciclista,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13110,burrito,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13111,ciclorutas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
13112,ríonegro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [123]:
speeches_2004.to_csv('Uribe_2004.csv') 

# 2005

In [6]:
speeches_2005 = pd.DataFrame(columns = ['word'])
speeches_2005

Unnamed: 0,word


In [25]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["noviembre"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2005/" + mes + "/" + mes +  "2005.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.htm+?', str(principal))
    links = links[15:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2005/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2005 = pd.DataFrame(tuples)
    month = str(mes)
    df_2005=df_2005.rename(columns={0: "word", 1: month })
    speeches_2005=speeches_2005.merge(df_2005, how="outer", on="word")
speeches_2005

Unnamed: 0,word,enero,febrero,marzo,mayo,abril,agosto,septiembre,octubre,junio,diciembre,julio,noviembre
0,mil,73.0,106.0,23.0,66.0,92.0,89.0,32.0,41.0,28.0,28.0,52.0,127.0
1,año,72.0,113.0,29.0,47.0,54.0,106.0,60.0,68.0,41.0,48.0,49.0,97.0
2,gobierno,57.0,97.0,60.0,49.0,108.0,103.0,66.0,93.0,64.0,30.0,56.0,111.0
3,colombia,53.0,89.0,90.0,47.0,110.0,110.0,104.0,91.0,53.0,36.0,58.0,100.0
4,ustedes,51.0,66.0,15.0,30.0,43.0,73.0,54.0,80.0,35.0,21.0,91.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15767,debida,,,,,,,,,,,,1.0
15768,ornamentación,,,,,,,,,,,,1.0
15769,engalanamiento,,,,,,,,,,,,1.0
15770,asombre,,,,,,,,,,,,1.0


In [26]:
speeches_2005_copy = speeches_2005
speeches_2005_copy

Unnamed: 0,word,enero,febrero,marzo,mayo,abril,agosto,septiembre,octubre,junio,diciembre,julio,noviembre
0,mil,73.0,106.0,23.0,66.0,92.0,89.0,32.0,41.0,28.0,28.0,52.0,127.0
1,año,72.0,113.0,29.0,47.0,54.0,106.0,60.0,68.0,41.0,48.0,49.0,97.0
2,gobierno,57.0,97.0,60.0,49.0,108.0,103.0,66.0,93.0,64.0,30.0,56.0,111.0
3,colombia,53.0,89.0,90.0,47.0,110.0,110.0,104.0,91.0,53.0,36.0,58.0,100.0
4,ustedes,51.0,66.0,15.0,30.0,43.0,73.0,54.0,80.0,35.0,21.0,91.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15767,debida,,,,,,,,,,,,1.0
15768,ornamentación,,,,,,,,,,,,1.0
15769,engalanamiento,,,,,,,,,,,,1.0
15770,asombre,,,,,,,,,,,,1.0


In [28]:
speeches_2005= speeches_2005_copy

speeches_2005 = speeches_2005.rename(columns={'mayo': 'times_May_2005','abril': 'times_Apr_2005','marzo': 'times_Mar_2005', 'febrero': 'times_Feb_2005','enero': 'times_Jan_2005','junio': 'times_Jun_2005', 'julio': 'times_Jul_2005', 'agosto': 'times_Aug_2005', 'septiembre': 'times_Sep_2005', 'octubre': 'times_Oct_2005', 'noviembre': 'times_Nov_2005', 'diciembre': 'times_Dec_2005'})

speeches_2005 = speeches_2005.fillna(0)

speeches_2005["total_2005"]=speeches_2005["times_Jan_2005"]  + speeches_2005["times_Feb_2005"]  + speeches_2005["times_Mar_2005"]  + speeches_2005["times_Apr_2005"]  + speeches_2005["times_May_2005"] + speeches_2005["times_Jun_2005"] + speeches_2005["times_Jul_2005"] + speeches_2005["times_Aug_2005"] + speeches_2005["times_Sep_2005"] + speeches_2005["times_Oct_2005"] + speeches_2005["times_Nov_2005"] + speeches_2005["times_Dec_2005"]

speeches_2005

Unnamed: 0,word,times_Jan_2005,times_Feb_2005,times_Mar_2005,times_May_2005,times_Apr_2005,times_Aug_2005,times_Sep_2005,times_Oct_2005,times_Jun_2005,times_Dec_2005,times_Jul_2005,times_Nov_2005,total_2005
0,mil,73.0,106.0,23.0,66.0,92.0,89.0,32.0,41.0,28.0,28.0,52.0,127.0,757.0
1,año,72.0,113.0,29.0,47.0,54.0,106.0,60.0,68.0,41.0,48.0,49.0,97.0,784.0
2,gobierno,57.0,97.0,60.0,49.0,108.0,103.0,66.0,93.0,64.0,30.0,56.0,111.0,894.0
3,colombia,53.0,89.0,90.0,47.0,110.0,110.0,104.0,91.0,53.0,36.0,58.0,100.0,941.0
4,ustedes,51.0,66.0,15.0,30.0,43.0,73.0,54.0,80.0,35.0,21.0,91.0,56.0,615.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15767,debida,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
15768,ornamentación,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
15769,engalanamiento,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
15770,asombre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [29]:
speeches_2005.to_csv('Uribe_2005.csv') 

# 2006

In [30]:
speeches_2006 = pd.DataFrame(columns = ['word'])
speeches_2006

Unnamed: 0,word


In [49]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["septiembre"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2006/" + mes + "/" + mes +  "2006.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.htm+?', str(principal))
    links = links[15:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2006/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2006 = pd.DataFrame(tuples)
    month = str(mes)
    df_2006=df_2006.rename(columns={0: "word", 1: month })
    speeches_2006=speeches_2006.merge(df_2006, how="outer", on="word")
speeches_2006

Unnamed: 0,word,enero,febrero,marzo,abril,mayo,junio,julio,agosto,octubre,noviembre,diciembre,septiembre
0,gobierno,119.0,106.0,180.0,165.0,67.0,51.0,131.0,115.0,87.0,53.0,24.0,110.0
1,país,100.0,47.0,101.0,123.0,53.0,66.0,78.0,121.0,55.0,150.0,55.0,110.0
2,colombia,91.0,96.0,142.0,254.0,155.0,48.0,113.0,97.0,50.0,174.0,75.0,72.0
3,mil,91.0,37.0,70.0,107.0,48.0,17.0,50.0,29.0,77.0,120.0,57.0,81.0
4,año,88.0,35.0,64.0,79.0,50.0,21.0,50.0,48.0,62.0,115.0,91.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16398,vincularon,,,,,,,,,,,,1.0
16399,colocadores,,,,,,,,,,,,1.0
16400,respalda,,,,,,,,,,,,1.0
16401,facilitarles,,,,,,,,,,,,1.0


In [50]:
speeches_2006_copy = speeches_2006
speeches_2006_copy

Unnamed: 0,word,enero,febrero,marzo,abril,mayo,junio,julio,agosto,octubre,noviembre,diciembre,septiembre
0,gobierno,119.0,106.0,180.0,165.0,67.0,51.0,131.0,115.0,87.0,53.0,24.0,110.0
1,país,100.0,47.0,101.0,123.0,53.0,66.0,78.0,121.0,55.0,150.0,55.0,110.0
2,colombia,91.0,96.0,142.0,254.0,155.0,48.0,113.0,97.0,50.0,174.0,75.0,72.0
3,mil,91.0,37.0,70.0,107.0,48.0,17.0,50.0,29.0,77.0,120.0,57.0,81.0
4,año,88.0,35.0,64.0,79.0,50.0,21.0,50.0,48.0,62.0,115.0,91.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16398,vincularon,,,,,,,,,,,,1.0
16399,colocadores,,,,,,,,,,,,1.0
16400,respalda,,,,,,,,,,,,1.0
16401,facilitarles,,,,,,,,,,,,1.0


In [51]:
speeches_2006= speeches_2006_copy

speeches_2006 = speeches_2006.rename(columns={'mayo': 'times_May_2006','abril': 'times_Apr_2006','marzo': 'times_Mar_2006', 'febrero': 'times_Feb_2006','enero': 'times_Jan_2006','junio': 'times_Jun_2006', 'julio': 'times_Jul_2006', 'agosto': 'times_Aug_2006', 'septiembre': 'times_Sep_2006', 'octubre': 'times_Oct_2006', 'noviembre': 'times_Nov_2006', 'diciembre': 'times_Dec_2006'})

speeches_2006 = speeches_2006.fillna(0)

speeches_2006["total_2006"]=speeches_2006["times_Jan_2006"]  + speeches_2006["times_Feb_2006"]  + speeches_2006["times_Mar_2006"]  + speeches_2006["times_Apr_2006"]  + speeches_2006["times_May_2006"] + speeches_2006["times_Jun_2006"] + speeches_2006["times_Jul_2006"] + speeches_2006["times_Aug_2006"] + speeches_2006["times_Sep_2006"] + speeches_2006["times_Oct_2006"] + speeches_2006["times_Nov_2006"] + speeches_2006["times_Dec_2006"]

speeches_2006

Unnamed: 0,word,times_Jan_2006,times_Feb_2006,times_Mar_2006,times_Apr_2006,times_May_2006,times_Jun_2006,times_Jul_2006,times_Aug_2006,times_Oct_2006,times_Nov_2006,times_Dec_2006,times_Sep_2006,total_2006
0,gobierno,119.0,106.0,180.0,165.0,67.0,51.0,131.0,115.0,87.0,53.0,24.0,110.0,1208.0
1,país,100.0,47.0,101.0,123.0,53.0,66.0,78.0,121.0,55.0,150.0,55.0,110.0,1059.0
2,colombia,91.0,96.0,142.0,254.0,155.0,48.0,113.0,97.0,50.0,174.0,75.0,72.0,1367.0
3,mil,91.0,37.0,70.0,107.0,48.0,17.0,50.0,29.0,77.0,120.0,57.0,81.0,784.0
4,año,88.0,35.0,64.0,79.0,50.0,21.0,50.0,48.0,62.0,115.0,91.0,76.0,779.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16398,vincularon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
16399,colocadores,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
16400,respalda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
16401,facilitarles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [52]:
speeches_2006.to_csv('Uribe_2006.csv') 

# 2007 (Enero-Junio)

In [90]:
speeches_2007 = pd.DataFrame(columns = ['word'])
speeches_2007

Unnamed: 0,word


In [102]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["junio"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2007/" + mes + "/" + mes +  "2007.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.htm+?', str(principal))
    links = links[15:]
    links = links[:15]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2007 = pd.DataFrame(tuples)
    month = str(mes)
    df_2007=df_2007.rename(columns={0: "word", 1: month })
    speeches_2007=speeches_2007.merge(df_2007, how="outer", on="word")
speeches_2007

Unnamed: 0,word,julio,enero,febrero,mayo,marzo,abril,junio
0,gobierno,393.0,54.0,228.0,363.0,104.0,150.0,321.0
1,colombia,373.0,204.0,182.0,181.0,144.0,119.0,170.0
2,país,341.0,125.0,201.0,173.0,104.0,82.0,273.0
3,mil,287.0,65.0,77.0,157.0,99.0,64.0,158.0
4,año,256.0,84.0,124.0,173.0,92.0,69.0,166.0
...,...,...,...,...,...,...,...,...
18153,dialéctica,,,,,,,1.0
18154,disfrazan,,,,,,,1.0
18155,despachar,,,,,,,1.0
18156,solicita,,,,,,,1.0


In [103]:
speeches_2007_copy = speeches_2007
speeches_2007_copy

Unnamed: 0,word,julio,enero,febrero,mayo,marzo,abril,junio
0,gobierno,393.0,54.0,228.0,363.0,104.0,150.0,321.0
1,colombia,373.0,204.0,182.0,181.0,144.0,119.0,170.0
2,país,341.0,125.0,201.0,173.0,104.0,82.0,273.0
3,mil,287.0,65.0,77.0,157.0,99.0,64.0,158.0
4,año,256.0,84.0,124.0,173.0,92.0,69.0,166.0
...,...,...,...,...,...,...,...,...
18153,dialéctica,,,,,,,1.0
18154,disfrazan,,,,,,,1.0
18155,despachar,,,,,,,1.0
18156,solicita,,,,,,,1.0


In [104]:
speeches_2007= speeches_2007_copy

speeches_2007 = speeches_2007.rename(columns={'mayo': 'times_May_2007','abril': 'times_Apr_2007','marzo': 'times_Mar_2007', 'febrero': 'times_Feb_2007','enero': 'times_Jan_2007','junio': 'times_Jun_2007', 'julio': 'times_Jul_2007'})

speeches_2007 = speeches_2007.fillna(0)

speeches_2007["total_2007"]=speeches_2007["times_Jan_2007"]  + speeches_2007["times_Feb_2007"]  + speeches_2007["times_Mar_2007"]  + speeches_2007["times_Apr_2007"]  + speeches_2007["times_May_2007"] + speeches_2007["times_Jun_2007"] + speeches_2007["times_Jul_2007"]

speeches_2007

Unnamed: 0,word,times_Jul_2007,times_Jan_2007,times_Feb_2007,times_May_2007,times_Mar_2007,times_Apr_2007,times_Jun_2007,total_2007
0,gobierno,393.0,54.0,228.0,363.0,104.0,150.0,321.0,1613.0
1,colombia,373.0,204.0,182.0,181.0,144.0,119.0,170.0,1373.0
2,país,341.0,125.0,201.0,173.0,104.0,82.0,273.0,1299.0
3,mil,287.0,65.0,77.0,157.0,99.0,64.0,158.0,907.0
4,año,256.0,84.0,124.0,173.0,92.0,69.0,166.0,964.0
...,...,...,...,...,...,...,...,...,...
18153,dialéctica,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
18154,disfrazan,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
18155,despachar,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
18156,solicita,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [105]:
speeches_2007.to_csv('Uribe_2007_1.csv') 

# 2007 (agosto-diciembre)

In [6]:
speeches_2007 = pd.DataFrame(columns = ['word'])
speeches_2007

Unnamed: 0,word


In [9]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2007/agosto/index.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.htm+?', str(principal))
links = links[11:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/agosto/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]
for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Aug_2007 = pd.DataFrame(tuples)
Aug_2007=Aug_2007.rename(columns={0: "word", 1: "times_Aug_2007"})
Aug_2007

Unnamed: 0,word,times_Aug_2007
0,social,128
1,colombia,126
2,gobierno,109
3,seguridad,107
4,mãs,99
...,...,...
3946,cardenal,1
3947,entone,1
3948,nuestro,1
3949,pulso,1


In [8]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2007/septiembre/index.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.html+?', str(principal))
links = links[11:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/septiembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]
for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Sep_2007 = pd.DataFrame(tuples)
Sep_2007=Sep_2007.rename(columns={0: "word", 1: "times_Sep_2007"})
Sep_2007

Unnamed: 0,word,times_Sep_2007
0,mãs,277
1,gobierno,218
2,mil,216
3,social,212
4,colombia,209
...,...,...
5460,repusieron,1
5461,riohacha,1
5462,rezar,1
5463,pretendiente,1


In [13]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2007/octubre/index.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.html+?', str(principal))
links = links[3:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/octubre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]
for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Oct_2007 = pd.DataFrame(tuples)
Oct_2007=Oct_2007.rename(columns={0: "word", 1: "times_Oct_2007"})
Oct_2007

Unnamed: 0,word,times_Oct_2007
0,gobierno,596
1,mil,501
2,colombia,473
3,mãs,438
4,social,437
...,...,...
7975,abastecido,1
7976,caza,1
7977,lauforie,1
7978,bregamos,1


In [12]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2007/noviembre/index.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.html+?', str(principal))
links = links[3:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/noviembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]
for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Nov_2007 = pd.DataFrame(tuples)
Nov_2007=Nov_2007.rename(columns={0: "word", 1: "times_Nov_2007"})
Nov_2007

Unnamed: 0,word,times_Nov_2007
0,colombia,678
1,social,660
2,gobierno,626
3,seguridad,509
4,gran,487
...,...,...
8828,abrirnos,1
8829,renovarlo,1
8830,factibilidad,1
8831,comparãbamos,1


In [16]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')
link = "http://historico.presidencia.gov.co/discursos/discursos2007/diciembre/index.htm"
html = requests.get(link).text
soup = BeautifulSoup(html)           
principal = soup.find_all('a')
links = re.findall('\w+.html+?', str(principal))
links = links[3:]
base_link = "http://historico.presidencia.gov.co/discursos/discursos2007/diciembre/"
speeches = list(map(lambda orig_string: base_link + orig_string, links))
not_of_interest = ["el", "la", "y"]
for speech in speeches:
    elements = get_speech_elements(speech)
    text=elements[0]
    result = re.sub(r'\r', '', text)
    result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
    result = result.split()
    result = [w for w in result if w.isalpha()]
    result = [w for w in result if w not in stopwords_esp]
    result = [w.lower() for w in result]
    result = [w for w in result if w not in not_of_interest]
    speech_full_text.extend(result)

freqdist = nltk.FreqDist(speech_full_text)
tuples= freqdist.most_common()
Dec_2007 = pd.DataFrame(tuples)
Dec_2007=Dec_2007.rename(columns={0: "word", 1: "times_Dec_2007"})
Dec_2007

Unnamed: 0,word,times_Dec_2007
0,gobierno,346
1,farc,337
2,secuestrados,264
3,colombia,223
4,presidente,214
...,...,...
6115,posuda,1
6116,mandaba,1
6117,comunicados,1
6118,aprobaba,1


In [17]:
speeches_2007=Aug_2007.merge(Sep_2007, how="outer", on="word")
speeches_2007=speeches_2007.merge(Oct_2007, how="outer", on="word")
speeches_2007=speeches_2007.merge(Nov_2007, how="outer", on="word")
speeches_2007=speeches_2007.merge(Dec_2007, how="outer", on="word")

speeches_2007 = speeches_2007.fillna(0)

speeches_2007["total_2007"]=speeches_2007["times_Aug_2007"] + speeches_2007["times_Sep_2007"] + speeches_2007["times_Oct_2007"] + speeches_2007["times_Nov_2007"] + speeches_2007["times_Dec_2007"]

speeches_2007

Unnamed: 0,word,times_Aug_2007,times_Sep_2007,times_Oct_2007,times_Nov_2007,times_Dec_2007,total_2007
0,social,128.0,212.0,437.0,660.0,132.0,1569.0
1,colombia,126.0,209.0,473.0,678.0,223.0,1709.0
2,gobierno,109.0,218.0,596.0,626.0,346.0,1895.0
3,seguridad,107.0,162.0,261.0,509.0,151.0,1190.0
4,mãs,99.0,277.0,438.0,481.0,140.0,1435.0
...,...,...,...,...,...,...,...
15765,billetes,0.0,0.0,0.0,0.0,1.0,1.0
15766,reproduciendo,0.0,0.0,0.0,0.0,1.0,1.0
15767,bravucones,0.0,0.0,0.0,0.0,1.0,1.0
15768,complacientes,0.0,0.0,0.0,0.0,1.0,1.0


In [18]:
speeches_2007.to_csv('Uribe_2007_2.csv') 

# 2008

In [6]:
speeches_2008 = pd.DataFrame(columns = ['word'])
speeches_2008

Unnamed: 0,word


In [39]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["enero"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2008/" + mes + "/index.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.html+?', str(principal))
    links = links[14:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2008/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2008 = pd.DataFrame(tuples)
    month = str(mes)
    df_2008=df_2008.rename(columns={0: "word", 1: month })
    speeches_2008=speeches_2008.merge(df_2008, how="outer", on="word")
speeches_2008

Unnamed: 0,word,abril,febrero,junio,marzo,julio,mayo,octubre,noviembre,agosto,diciembre,septiembre,enero
0,colombia,363.0,140.0,322.0,290.0,131.0,345.0,653.0,397.0,320.0,298.0,332.0,184.0
1,gobierno,315.0,142.0,379.0,207.0,130.0,227.0,613.0,449.0,206.0,391.0,360.0,144.0
2,social,278.0,104.0,188.0,139.0,108.0,270.0,447.0,225.0,272.0,239.0,268.0,80.0
3,seguridad,235.0,78.0,255.0,147.0,93.0,230.0,340.0,256.0,191.0,196.0,250.0,80.0
4,mãs,225.0,111.0,278.0,185.0,152.0,221.0,572.0,372.0,320.0,332.0,315.0,91.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22957,renovemos,,,,,,,,,,,,1.0
22958,primavera,,,,,,,,,,,,1.0
22959,floreciente,,,,,,,,,,,,1.0
22960,insinuado,,,,,,,,,,,,1.0


In [40]:
speeches_2008_copy = speeches_2008
speeches_2008_copy

Unnamed: 0,word,abril,febrero,junio,marzo,julio,mayo,octubre,noviembre,agosto,diciembre,septiembre,enero
0,colombia,363.0,140.0,322.0,290.0,131.0,345.0,653.0,397.0,320.0,298.0,332.0,184.0
1,gobierno,315.0,142.0,379.0,207.0,130.0,227.0,613.0,449.0,206.0,391.0,360.0,144.0
2,social,278.0,104.0,188.0,139.0,108.0,270.0,447.0,225.0,272.0,239.0,268.0,80.0
3,seguridad,235.0,78.0,255.0,147.0,93.0,230.0,340.0,256.0,191.0,196.0,250.0,80.0
4,mãs,225.0,111.0,278.0,185.0,152.0,221.0,572.0,372.0,320.0,332.0,315.0,91.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22957,renovemos,,,,,,,,,,,,1.0
22958,primavera,,,,,,,,,,,,1.0
22959,floreciente,,,,,,,,,,,,1.0
22960,insinuado,,,,,,,,,,,,1.0


In [41]:
speeches_2008= speeches_2008_copy

speeches_2008 = speeches_2008.rename(columns={'mayo': 'times_May_2008','abril': 'times_Apr_2008','marzo': 'times_Mar_2008', 'febrero': 'times_Feb_2008','enero': 'times_Jan_2008','junio': 'times_Jun_2008', 'julio': 'times_Jul_2008', 'agosto': 'times_Aug_2008', 'septiembre': 'times_Sep_2008', 'octubre': 'times_Oct_2008', 'noviembre': 'times_Nov_2008', 'diciembre': 'times_Dec_2008'})

speeches_2008 = speeches_2008.fillna(0)

speeches_2008["total_2008"]=speeches_2008["times_Jan_2008"]  + speeches_2008["times_Feb_2008"]  + speeches_2008["times_Mar_2008"]  + speeches_2008["times_Apr_2008"]  + speeches_2008["times_May_2008"] + speeches_2008["times_Jun_2008"] + speeches_2008["times_Jul_2008"] + speeches_2008["times_Aug_2008"] + speeches_2008["times_Sep_2008"] + speeches_2008["times_Oct_2008"] + speeches_2008["times_Nov_2008"] + speeches_2008["times_Dec_2008"]

speeches_2008

Unnamed: 0,word,times_Apr_2008,times_Feb_2008,times_Jun_2008,times_Mar_2008,times_Jul_2008,times_May_2008,times_Oct_2008,times_Nov_2008,times_Aug_2008,times_Dec_2008,times_Sep_2008,times_Jan_2008,total_2008
0,colombia,363.0,140.0,322.0,290.0,131.0,345.0,653.0,397.0,320.0,298.0,332.0,184.0,3775.0
1,gobierno,315.0,142.0,379.0,207.0,130.0,227.0,613.0,449.0,206.0,391.0,360.0,144.0,3563.0
2,social,278.0,104.0,188.0,139.0,108.0,270.0,447.0,225.0,272.0,239.0,268.0,80.0,2618.0
3,seguridad,235.0,78.0,255.0,147.0,93.0,230.0,340.0,256.0,191.0,196.0,250.0,80.0,2351.0
4,mãs,225.0,111.0,278.0,185.0,152.0,221.0,572.0,372.0,320.0,332.0,315.0,91.0,3174.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22957,renovemos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22958,primavera,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22959,floreciente,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
22960,insinuado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [42]:
speeches_2008.to_csv('Uribe_2008.csv') 

# 2009

In [43]:
speeches_2009 = pd.DataFrame(columns = ['word'])
speeches_2009

Unnamed: 0,word


In [84]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["noviembre"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/discursos2009/" + mes + "/archivo.html"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.html+?', str(principal))
    links = links[15:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2009/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2009 = pd.DataFrame(tuples)
    month = str(mes)
    df_2009=df_2009.rename(columns={0: "word", 1: month })
    speeches_2009=speeches_2009.merge(df_2009, how="outer", on="word")
speeches_2009

Unnamed: 0,word,enero,abril,febrero,mayo,julio,marzo,agosto,junio,diciembre,septiembre,octubre,noviembre
0,mil,175.0,214.0,373.0,372.0,440.0,321.0,370.0,402.0,232.0,466.0,577.0,321.0
1,colombia,124.0,259.0,381.0,493.0,366.0,430.0,366.0,367.0,245.0,557.0,821.0,574.0
2,gran,114.0,153.0,316.0,362.0,328.0,221.0,269.0,350.0,189.0,333.0,430.0,356.0
3,millones,110.0,159.0,275.0,279.0,245.0,253.0,195.0,228.0,92.0,262.0,295.0,198.0
4,social,108.0,87.0,236.0,221.0,231.0,292.0,262.0,191.0,137.0,285.0,392.0,222.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23886,masificarlo,,,,,,,,,,,,1.0
23887,acertadamente,,,,,,,,,,,,1.0
23888,quiebran,,,,,,,,,,,,1.0
23889,rezado,,,,,,,,,,,,1.0


In [85]:
speeches_2009_copy = speeches_2009
speeches_2009_copy

Unnamed: 0,word,enero,abril,febrero,mayo,julio,marzo,agosto,junio,diciembre,septiembre,octubre,noviembre
0,mil,175.0,214.0,373.0,372.0,440.0,321.0,370.0,402.0,232.0,466.0,577.0,321.0
1,colombia,124.0,259.0,381.0,493.0,366.0,430.0,366.0,367.0,245.0,557.0,821.0,574.0
2,gran,114.0,153.0,316.0,362.0,328.0,221.0,269.0,350.0,189.0,333.0,430.0,356.0
3,millones,110.0,159.0,275.0,279.0,245.0,253.0,195.0,228.0,92.0,262.0,295.0,198.0
4,social,108.0,87.0,236.0,221.0,231.0,292.0,262.0,191.0,137.0,285.0,392.0,222.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23886,masificarlo,,,,,,,,,,,,1.0
23887,acertadamente,,,,,,,,,,,,1.0
23888,quiebran,,,,,,,,,,,,1.0
23889,rezado,,,,,,,,,,,,1.0


In [86]:
speeches_2009= speeches_2009_copy

speeches_2009 = speeches_2009.rename(columns={'mayo': 'times_May_2009','abril': 'times_Apr_2009','marzo': 'times_Mar_2009', 'febrero': 'times_Feb_2009','enero': 'times_Jan_2009','junio': 'times_Jun_2009', 'julio': 'times_Jul_2009', 'agosto': 'times_Aug_2009', 'septiembre': 'times_Sep_2009', 'octubre': 'times_Oct_2009', 'noviembre': 'times_Nov_2009', 'diciembre': 'times_Dec_2009'})

speeches_2009 = speeches_2009.fillna(0)

speeches_2009["total_2009"]=speeches_2009["times_Jan_2009"]  + speeches_2009["times_Feb_2009"]  + speeches_2009["times_Mar_2009"]  + speeches_2009["times_Apr_2009"]  + speeches_2009["times_May_2009"] + speeches_2009["times_Jun_2009"] + speeches_2009["times_Jul_2009"] + speeches_2009["times_Aug_2009"] + speeches_2009["times_Sep_2009"] + speeches_2009["times_Oct_2009"] + speeches_2009["times_Nov_2009"] + speeches_2009["times_Dec_2009"]

speeches_2009

Unnamed: 0,word,times_Jan_2009,times_Apr_2009,times_Feb_2009,times_May_2009,times_Jul_2009,times_Mar_2009,times_Aug_2009,times_Jun_2009,times_Dec_2009,times_Sep_2009,times_Oct_2009,times_Nov_2009,total_2009
0,mil,175.0,214.0,373.0,372.0,440.0,321.0,370.0,402.0,232.0,466.0,577.0,321.0,4263.0
1,colombia,124.0,259.0,381.0,493.0,366.0,430.0,366.0,367.0,245.0,557.0,821.0,574.0,4983.0
2,gran,114.0,153.0,316.0,362.0,328.0,221.0,269.0,350.0,189.0,333.0,430.0,356.0,3421.0
3,millones,110.0,159.0,275.0,279.0,245.0,253.0,195.0,228.0,92.0,262.0,295.0,198.0,2591.0
4,social,108.0,87.0,236.0,221.0,231.0,292.0,262.0,191.0,137.0,285.0,392.0,222.0,2664.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23886,masificarlo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
23887,acertadamente,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
23888,quiebran,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
23889,rezado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [87]:
speeches_2009.to_csv('Uribe_2009.csv') 

# 2010

In [88]:
speeches_2010 = pd.DataFrame(columns = ['word'])
speeches_2010

Unnamed: 0,word


In [131]:
speech_full_text=[]
stopwords_esp = stopwords.words('spanish')

meses = ["julio"]

for mes in meses:
    link = "http://historico.presidencia.gov.co/discursos/index.htm"
    html = requests.get(link).text
    soup = BeautifulSoup(html)           
    principal = soup.find_all('a')
    links = re.findall('\w+.html+?', str(principal))
    links = links[9:]
    links = links[50:]
    base_link = "http://historico.presidencia.gov.co/discursos/discursos2010/" + mes + "/"
    speeches = list(map(lambda orig_string: base_link + orig_string, links))
    not_of_interest = ["el", "la", "y"]

    for speech in speeches:
        elements = get_speech_elements(speech)
        text=elements[0]
        result = re.sub(r'\r', '', text)
        result = re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¡¿”“•\r´]+', '', result)
        result = result.split()
        result = [w for w in result if w.isalpha()]
        result = [w for w in result if w not in stopwords_esp]
        result = [w.lower() for w in result]
        result = [w for w in result if w not in not_of_interest]
        speech_full_text.extend(result)

    freqdist = nltk.FreqDist(speech_full_text)
    tuples= freqdist.most_common()
    df_2010 = pd.DataFrame(tuples)
    month = str(mes)
    df_2010=df_2010.rename(columns={0: "word", 1: "julio_3" })
    speeches_2010=speeches_2010.merge(df_2010, how="outer", on="word")
speeches_2010

Unnamed: 0,word,enero,marzo,abril,febrero,junio,mayo,julio_1,julio_2,julio_3
0,colombia,104.0,509.0,566.0,179.0,602.0,752.0,351.0,242.0,388.0
1,mil,87.0,305.0,338.0,158.0,257.0,411.0,144.0,124.0,137.0
2,gran,82.0,355.0,430.0,122.0,471.0,517.0,297.0,236.0,415.0
3,ciento,69.0,142.0,225.0,70.0,140.0,259.0,49.0,83.0,108.0
4,salud,69.0,117.0,67.0,180.0,84.0,95.0,27.0,44.0,62.0
...,...,...,...,...,...,...,...,...,...,...
21441,piano,,,,,,,,,1.0
21442,sonado,,,,,,,,,1.0
21443,maracas,,,,,,,,,1.0
21444,acordeones,,,,,,,,,1.0


In [132]:
speeches_2010_copy = speeches_2010
speeches_2010_copy

Unnamed: 0,word,enero,marzo,abril,febrero,junio,mayo,julio_1,julio_2,julio_3
0,colombia,104.0,509.0,566.0,179.0,602.0,752.0,351.0,242.0,388.0
1,mil,87.0,305.0,338.0,158.0,257.0,411.0,144.0,124.0,137.0
2,gran,82.0,355.0,430.0,122.0,471.0,517.0,297.0,236.0,415.0
3,ciento,69.0,142.0,225.0,70.0,140.0,259.0,49.0,83.0,108.0
4,salud,69.0,117.0,67.0,180.0,84.0,95.0,27.0,44.0,62.0
...,...,...,...,...,...,...,...,...,...,...
21441,piano,,,,,,,,,1.0
21442,sonado,,,,,,,,,1.0
21443,maracas,,,,,,,,,1.0
21444,acordeones,,,,,,,,,1.0


In [135]:
speeches_2010= speeches_2010_copy

speeches_2010["julio"]=speeches_2010["julio_1"]+speeches_2010["julio_2"]+speeches_2010["julio_3"]

speeches_2010=speeches_2010.drop(['julio_1', 'julio_2', 'julio_3'], axis=1)

speeches_2010 = speeches_2010.rename(columns={'mayo': 'times_May_2010','abril': 'times_Apr_2010','marzo': 'times_Mar_2010', 'febrero': 'times_Feb_2010','enero': 'times_Jan_2010','junio': 'times_Jun_2010', 'julio': 'times_Jul_2010', 'agosto': 'times_Aug_2010', 'septiembre': 'times_Sep_2010', 'octubre': 'times_Oct_2010', 'noviembre': 'times_Nov_2010', 'diciembre': 'times_Dec_2010'})

speeches_2010 = speeches_2010.fillna(0)

speeches_2010["total_2010"]=speeches_2010["times_Jan_2010"]  + speeches_2010["times_Feb_2010"]  + speeches_2010["times_Mar_2010"]  + speeches_2010["times_Apr_2010"]  + speeches_2010["times_May_2010"] + speeches_2010["times_Jun_2010"] + speeches_2010["times_Jul_2010"] 

speeches_2010

Unnamed: 0,word,times_Jan_2010,times_Mar_2010,times_Apr_2010,times_Feb_2010,times_Jun_2010,times_May_2010,julio_2010,times_Jul_2010,total_2010
0,colombia,104.0,509.0,566.0,179.0,602.0,752.0,981.0,981.0,3693.0
1,mil,87.0,305.0,338.0,158.0,257.0,411.0,405.0,405.0,1961.0
2,gran,82.0,355.0,430.0,122.0,471.0,517.0,948.0,948.0,2925.0
3,ciento,69.0,142.0,225.0,70.0,140.0,259.0,240.0,240.0,1145.0
4,salud,69.0,117.0,67.0,180.0,84.0,95.0,133.0,133.0,745.0
...,...,...,...,...,...,...,...,...,...,...
21441,piano,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21442,sonado,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21443,maracas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21444,acordeones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
speeches_2010.to_csv('Uribe_2010.csv') 