In [2]:
# import libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import pickle
import numpy as np
import unidecode
import os
# from tqdm import tqdm

In [24]:
# Absolute form of the kernel's starting directory; the value is computed but not stored.
os.path.abspath(os.curdir)
# Folder prefix for saved results, relative to the working directory.
# Keep the trailing slash: the file name is appended to this prefix when saving.
output_path = "../data/"

In [3]:
# Root folder that hosts the index page and every yearly / speech page.
url_base = "http://www.generalisimofranco.com/Discursos/discursos/"
# Index page listing one link per year; derived from url_base so the two cannot drift apart.
url = url_base + "00000.HTM"

# Download the index page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the run on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'html.parser')

In [4]:
# Collect the anchors of the index page that point to a yearly overview:
# the href has to contain "0000", and the anchor's .string has to be set
# and mention "Discursos".
year_href_pattern = re.compile("0000")
all_years = [
    anchor
    for anchor in soup.find_all('a', href=year_href_pattern)
    if anchor.string is not None and "Discursos" in anchor.string
]

In [5]:
# Turn every yearly anchor into a [year, absolute_url] pair.
# The year is the integer value of the last four characters of the anchor
# text once all dots have been removed; the URL is url_base plus the href.
year_list = [
    [int(anchor.string.replace(".", "")[-4:]), url_base + str(anchor['href'])]
    for anchor in all_years
]

In [6]:
# For each year, download the yearly overview page and collect the links to its speeches.
# all_speeches ends up with one inner list per year; every entry of an inner list is
# [year, speech_url, position], where position numbers the links of that year from 0.
all_speeches = []
for year, yearly_url in year_list:

    # The timeout prevents a stalled connection from blocking the loop indefinitely;
    # raise_for_status() aborts on an HTTP error rather than parsing an error page.
    yearly_response = requests.get(yearly_url, timeout=30)
    yearly_response.raise_for_status()
    yearly_soup = BeautifulSoup(yearly_response.text, 'html.parser')

    # all 'a' tags whose href contains the year
    speech_anchors = yearly_soup.find_all('a', href=re.compile('.*({}).*'.format(year)))

    # enumerate() provides the position of each link within its year
    yearly_speeches = [
        [year, url_base + str(anchor['href']), position]
        for position, anchor in enumerate(speech_anchors)
    ]

    all_speeches.append(yearly_speeches)

In [8]:
# Flatten the per-year groups into one list of speech rows.
# The result is bound to the same name it was read from, so the cell has to
# tolerate being run again: a per-year group is a list whose items are lists,
# whereas an already-flattened row starts with the year and is kept as it is.
# Empty groups (years without speech links) contribute nothing.
all_speeches = [
    speech
    for entry in all_speeches
    for speech in (entry if not entry or isinstance(entry[0], list) else [entry])
]

In [13]:
# for each speech, concatenate all (sub-)headings and paragraphs into a single text string
for speech in all_speeches:
    speech_url = speech[1]
    # Timeout so that a single stalled download cannot hang the whole loop;
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    speech_response = requests.get(speech_url, timeout=30)
    speech_response.raise_for_status()
    speech_soup = BeautifulSoup(speech_response.text, 'html.parser')

    blockquote = speech_soup.find('blockquote')
    if blockquote is None:
        # Name the offending page instead of failing with an opaque AttributeError.
        raise ValueError('no <blockquote> element found in ' + speech_url)

    candidates = blockquote.find_all(['p', 'span'])
    # find_all() searches recursively, so a <span> inside a <p> (or a <p> nested in
    # another <p>) is returned in addition to its ancestor, and the ancestor's .text
    # already contains the descendant's text. Keep only the outermost matches so
    # that no passage ends up in the text twice. id() is used for the membership
    # check to compare element identity rather than element content.
    candidate_ids = {id(tag) for tag in candidates}
    outermost = [
        tag for tag in candidates
        if not any(id(parent) in candidate_ids for parent in tag.parents)
    ]

    # Line breaks are removed from each piece and every piece is prefixed with one space.
    text = ''.join(
        ' ' + tag.text.replace('\r', '').replace('\n', '')
        for tag in outermost
    )

    # Store the text in the fourth slot (after year, link, count). Slice assignment
    # replaces a text left over from an earlier run of this cell instead of
    # appending a second one.
    speech[3:] = [text]

In [21]:
# Build the final table: one row per speech with its year, speaker name, id and text.
df = pd.DataFrame(all_speeches, columns=['Year', 'Link', 'Count', 'Text']) # convert to df
df['Name'] = 'Francisco Franco' # add name column
# id has the form "<Year>_<Name>_<Count>"
df['id'] = df['Year'].map(str) + '_' + df['Name'] + '_' + df['Count'].map(str)
# Selecting the four output columns both reorders them and leaves out Link and Count,
# so no separate drop() call is needed.
df = df[['Year', 'Name', 'id', 'Text']]

In [25]:
# save combined df
# Create the target folder first: open() does not create missing directories.
os.makedirs(output_path, exist_ok=True)
# os.path.join yields the same path whether or not output_path ends with a separator.
with open(os.path.join(output_path, 'df.pickle'), 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)