In [63]:
# import libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import pickle
import numpy as np
import unidecode
import os
# from tqdm import tqdm

In [35]:
os.path.abspath(os.getcwd()) # initial working directory
output_path = "../data/"

In [162]:
url = "http://www.generalisimofranco.com/Discursos/discursos/00000.HTM"
url_base = "http://www.generalisimofranco.com/Discursos/discursos/"

source = requests.get(url).text
soup = BeautifulSoup(source, 'html.parser')

In [163]:
# get list of all years in which Franco gave speeches (each year is itself a link)
all_years = soup.find_all('a', href = re.compile("0000"))
all_years = [i for i in all_years if i.string is not None]
all_years = [i for i in all_years if "Discursos" in i.string]

In [164]:
# create a list of (year, link) entries for each year
year_list = []
for one_year in all_years:
    
    year_string = one_year.string.replace(".", "")[-4:] # extract year
    year = int(year_string) # convert year from string to int
    
    extension = one_year['href']
    year_link = url_base + str(extension)
    
    year_list.append([year, year_link])

In [172]:
# for each year, get the links to each of the speeches
all_speeches = []
for i in year_list:

    year = i[0]
    yearly_url = i[1]
    yearly_source = requests.get(yearly_url).text
    yearly_soup = BeautifulSoup(yearly_source, 'html.parser')
    all_yearly_speeches = yearly_soup.find_all('a', href = re.compile('.*({}).*'.format(year))) # all 'a' tags containing key word 'year' in the corresponding href
    # all_yearly_speeches = [i for i in all_yearly_speeches if i.string is not None]
    # all_yearly_speeches = [i for i in all_yearly_speeches if i.string.startswith('(')] # see e.g. 1954: every link included twice, once with actual title, once with date in parentheses as title
    
    yearly_speeches = [] # list to be filled with links to all speeches of a single year
    count = 0
    for speech in all_yearly_speeches:
        
        extension = speech['href']
        speech_link = url_base + str(extension)

        yearly_speeches.append([year, speech_link, count])
        count += 1
    
    all_speeches.append(yearly_speeches)

In [175]:
# flatten list
all_speeches = [speech for yearly_speeches in all_speeches for speech in yearly_speeches]

In [176]:
# for each speech, concatenate all (sub-)headings and paragraphs into a single text string
for speech in all_speeches:
    speech_url = speech[1]
    speech_source = requests.get(speech_url).text
    speech_soup = BeautifulSoup(speech_source, 'html.parser')

    all_paragraphs = speech_soup.find('blockquote').find_all(['p', 'span'])

    text = str()
    for paragraph in all_paragraphs:
        content = paragraph.text.replace('\r', '').replace('\n', '')
        text += ' ' + content

    speech.append(text)

In [177]:
df_franco = pd.DataFrame(all_speeches, columns=['Year', 'Link', 'Count', 'Text']) # convert to df
df_franco['Name'] = 'Francisco Franco' # add name column
df_franco['doc_id'] = df_franco['Year'].map(str) + '_' + df_franco['Name'] + '_' + df_franco['Count'].map(str)  
df_franco.drop(['Link', 'Count'], axis = 1) # drop link and count
df_franco = df_franco[['Year', 'Name', 'doc_id', 'Text']] # reorder columns

In [178]:
# match Peronist documents
data_path =  '../data/peronist_speeches/txt/' # where Peronist speech text files are saved
peronistas = []
for filename in os.listdir(os.path.abspath(data_path)):
    
    year = int(filename[:4])
    name = filename.split('_')[1]
    count = filename.split('_')[2].split('.')[0]
    doc_id = str(year) + '_' + name + '_' + count
    with open(data_path + filename, 'r', encoding="utf-8") as handle:
        text = handle.read().replace('\n', '')
    
    peronistas.append([year, name, doc_id, text])

In [179]:
df_peron = pd.DataFrame(peronistas, columns=['Year', 'Name', 'doc_id', 'Text']) # convert to df

In [180]:
df = pd.concat([df_franco, df_peron], sort = False)

In [181]:
# save combined df
with open(output_path + 'df.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [183]:
df[380:410]

Unnamed: 0,Year,Name,doc_id,Text
380,1966,Francisco Franco,1966_Francisco Franco_9,Pronunciadas ante las cámaras de...
381,1966,Francisco Franco,1966_Francisco Franco_10,Dirigido desde el Palacio de El ...
382,1967,Francisco Franco,1967_Francisco Franco_0,"Palacio de El Pardo, 11 de enero..."
383,1967,Francisco Franco,1967_Francisco Franco_1,"Pronunciadas, 12 de abril de 196..."
384,1967,Francisco Franco,1967_Francisco Franco_2,Pronunciadas el 25 de abril de ...
385,1967,Francisco Franco,1967_Francisco Franco_3,Pronunciadas el 26 de abril de ...
386,1967,Francisco Franco,1967_Francisco Franco_4,Pronunciadas el 27 de abril de ...
387,1967,Francisco Franco,1967_Francisco Franco_5,Pronunciadas desde el balcón del...
388,1967,Francisco Franco,1967_Francisco Franco_6,"Palacio de Ayete, 18 de septiemb..."
389,1967,Francisco Franco,1967_Francisco Franco_7,Enviado desde el Palacio de El P...
