# UN Speeches – Scraper

## Python Setup

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.request
import os
import time
import subprocess
import glob
import re

In [2]:
from americanize import americanize

## Parameters

In [3]:
# Calendar year of the General Debate to scrape; every data path below is keyed on it.
year = 2017

In [4]:
# HTTP headers sent with every `requests.get` call in this notebook.
headers = {
    'User-Agent': 'Chrome/41.0.2228.0',
}

## Scrape list of all speakers, and links to their speeches:

In [5]:
# Create the per-year download folder. `makedirs` also creates any missing
# parent folders (e.g. on a fresh checkout) and `exist_ok=True` makes the
# cell safe to re-run.
os.makedirs('../data/raw_data/{}'.format(year), exist_ok=True)

In [6]:
# Session number used in the sessions-archive URL below (e.g. 2017 -> 72).
session = year - 1945

In [7]:
# Download the archive page listing every speaker of the session.
# NOTE: despite its name, `base_url` holds the HTTP response, not a URL string.
# A timeout is set so a stalled connection cannot hang the notebook forever.
base_url = requests.get(
    "https://gadebate.un.org/en/sessions-archive/{}".format(session),
    headers=headers,
    timeout=60,
)
print(base_url.status_code)
soup = BeautifulSoup(base_url.content, 'html.parser')

200


In [8]:
# From here on `soup` is the list of "media-body" elements (one per speaker).
# The list is rebuilt from the downloaded response rather than from `soup`
# itself, so re-running this cell does not fail on an already-reduced list.
soup = BeautifulSoup(base_url.content, 'html.parser').find_all(class_="media-body")

In [9]:
# Empty frame with one row per scraped speaker entry; the cells below fill it in.
columns = ['country', 'name', 'title', 'link', 'speech']
n_speakers = len(soup)
df = pd.DataFrame(columns=columns, index=np.arange(n_speakers))

In [10]:
# Copy country, speaker name, title and speech-page link from each entry into
# the frame. `df.loc[row, col]` is used instead of chained `df[col][row]`
# assignment, which may write to a temporary copy and leave `df` unchanged.
for i, entry in enumerate(soup):
    df.loc[i, 'country'] = entry.find(class_="media-heading").text
    df.loc[i, 'name'] = entry.find(class_="name").text
    df.loc[i, 'title'] = entry.find(class_="title").text
    # The href is appended to the site root to form an absolute URL.
    df.loc[i, 'link'] = 'https://gadebate.un.org' + entry.find('a').get('href')

In [11]:
# For every speaker: fetch the speech page (with retries), download the
# English PDF if one is linked, and save the page's summary text.
max_tries = 5      # attempts per speech page before giving up
retry_delay = 10   # seconds to wait after a failed attempt

for i in range(0, len(soup)):
    print("Scraping: {}/{}".format(i+1, len(soup)), end = '\r')
    country = df.loc[i, 'country']

    # Retry until the page loads AND contains the documents list. The page is
    # parsed once per attempt and the parsed result is reused below.
    temp_soup = None
    documents = None
    for attempt in range(max_tries):
        try:
            temp_url = requests.get(df.loc[i, 'link'], headers=headers, timeout=60)
        except requests.RequestException:
            # Network-level failure (timeout, connection reset, ...): retry.
            temp_url = None
        if temp_url is not None and temp_url.status_code == 200:
            temp_soup = BeautifulSoup(temp_url.content, 'html.parser')
            documents = temp_soup.find(class_="statement-documents-list")
            if documents is not None:
                break
        time.sleep(retry_delay)
    if documents is None:
        print("\nCould not scrape {}.\n".format(country))
        continue

    # Pick the first document link whose label mentions "English".
    link = None
    for anchor in documents.find_all('a'):
        if "English" in anchor.text:
            link = anchor.get('href')
            break
    if link is not None:
        try:
            urllib.request.urlretrieve(link, '../data/raw_data/{}/{}.pdf'.format(year, country))
        except (OSError, ValueError) as err:
            # One failed download should not abort the whole run; report it
            # and still save the summary for this country.
            print("\nCould not download speech for {}: {}\n".format(country, err))

    # Save the summary shown on the page; an empty file is written when the
    # page has no summary element.
    summary_node = temp_soup.find(class_="field-item even")
    summary = summary_node.text if summary_node is not None else ''
    # Written as UTF-8 because the cleaning step reads these files as UTF-8.
    with open('../data/raw_data/{}/{} SUMMARY.txt'.format(year, country), 'w', encoding='utf-8') as file:
        file.write(summary)

print("\nDone!")

Scraping: 199/199
Done!


## Converting to Text

Conversion to `.txt` is done using `pdftotext` from the `Xpdf` suite (https://www.xpdfreader.com/pdftotext-man.html).

In [12]:
# Run `pdftotext` on every downloaded PDF; with no output argument it writes
# the .txt file next to the PDF. Each file is converted in its own process
# (no shell), so odd file names need no quoting and a failed conversion is
# reported instead of being silently ignored.
# Raises FileNotFoundError if `pdftotext` is not installed / not on PATH.
pdf_files = sorted(glob.glob('../data/raw_data/{}/*.pdf'.format(year)))
failed_pdfs = []
for pdf_file in pdf_files:
    if subprocess.run(['pdftotext', pdf_file]).returncode != 0:
        failed_pdfs.append(pdf_file)
if failed_pdfs:
    print("pdftotext failed for:\n" + "\n".join(failed_pdfs))
# Number of failed conversions (0 means every PDF was converted).
len(failed_pdfs)

0

## Cleaning

In [13]:
# Create the per-year folder for cleaned text. `makedirs` also creates any
# missing parent folders and `exist_ok=True` makes the cell safe to re-run.
os.makedirs('../data/clean_data/{}'.format(year), exist_ok=True)

In [14]:
# Clean every raw .txt file (converted speeches and saved summaries) and
# write the result under the same file name in the clean_data folder.
txt_files = glob.glob("../data/raw_data/{}/*.txt".format(year))

for txt_file in txt_files:
    with open(txt_file, 'r', encoding='utf-8', errors='ignore') as file:
        txt = file.read()

    # Flatten line breaks and form feeds into spaces.
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\x0c', ' ')

    # Convert all English to American English
    txt = americanize(txt)

    # Get rid of page headers and footers. Matching is case-insensitive
    # because the text has not been lowercased anywhere in this cell.
    txt = re.sub(r'\b(please )?check against delivery\b', '', txt, flags=re.IGNORECASE)

    # RegEx Cleanup: collapse whitespace runs and trim the ends.
    txt = re.sub(r'\s+', ' ', txt)
    txt = txt.strip()

    # `with` guarantees the output file is closed even if the write fails.
    with open(txt_file.replace('/raw_data/', '/clean_data/'), 'w') as file:
        file.write(txt)

## Combine into Dataset

In [15]:
# Cleaned files available on disk, and the scraped country names in sorted order.
clean_pattern = "../data/clean_data/{}".format(year) + "/*.txt"
txt_files = glob.glob(clean_pattern)
countries = sorted(df['country'])

In [16]:
# Read each country's cleaned speech; countries with no cleaned speech file
# in `txt_files` get NaN.
available = set(txt_files)
speeches = []
for country in countries:
    speech_path = '../data/clean_data/{}/{}.txt'.format(year, country)
    if speech_path not in available:
        speeches.append(np.nan)
        continue
    with open(speech_path, 'r') as handle:
        speeches.append(handle.read())

In [17]:
# Read each country's cleaned summary. A country has no summary file when its
# page could not be scraped, so a missing file becomes NaN instead of raising
# FileNotFoundError and aborting the cell.
summaries = []
for country in countries:
    summary_path = '../data/clean_data/{}/{} SUMMARY.txt'.format(year, country)
    if os.path.isfile(summary_path):
        with open(summary_path, 'r') as myfile:
            summary = myfile.read()
    else:
        summary = np.nan
    summaries.append(summary)

In [18]:
# Final dataset: one row per country with its speech and summary text.
# NOTE: this replaces the scraping frame previously stored in `df`.
df = pd.DataFrame(dict(country=countries, speech=speeches, summary=summaries))

## Export

In [19]:
# Write the combined dataset to a per-year CSV, without the index column.
output_path = '../data/un_speeches_{}.csv'.format(year)
df.to_csv(output_path, index=False)

## Sandbox