## Collecting UN Global Compact Communication of Progress (COP) Annual Reports

Web scraping from [here](https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active) for all COP reports submitted up to 2021. We only take into account the reports submitted in English.

Please specify below the focus year of this analysis:

In [1]:
# Latest submission year to include (inclusive); kept as a string and converted with int() where it is compared.
focus_year = "2021"

Please select the focus language using one of the following values:
- en (English, default option)
- de (German)
- es (Spanish)
- fr (French)
- pt (Portuguese)

In [2]:
# Language code of the reports to analyse; used as a key into the language_ref table.
focus_language = 'en'

In [3]:
# Per-language settings keyed by language code: the language name as it is
# matched against a report's language, and a 'min_coocurrence' value.
language_ref = {
    code: {'name': name, 'min_coocurrence': min_coocurrence}
    for code, name, min_coocurrence in (
        ('en', 'English', 10),
        ('de', 'German', 2),
        ('es', 'Spanish', 2),
        ('fr', 'French', 2),
        ('pt', 'Portuguese', 2),
    )
}

## 1. Gather document index information about COP reports available from the website
The [UN Global Compact website](https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active) contains entries for each COP report, describing the sector of the company submitting the report, its country and year, as well as the language in which the report was written and a link to a PDF file with the full report.

**The results in this section give a general view of the available COPs; they are not yet restricted by focus_year and focus_language.**

In [4]:
# Install/upgrade the packaging tools and third-party packages used by this notebook.
# NOTE(review): bs4, lxml, pandas and nltk are imported or used below but are not
# installed here - presumably the runtime already provides them; verify.
!pip install --upgrade pip
!python -m pip install PyPDF2
!pip install --upgrade setuptools
!pip install requests
!pip install stop-words



In [5]:
import requests
import re
import pandas as pd
import PyPDF2
import shutil
import nltk
import os
import os.path
import requests
from urllib.request import urlretrieve
from bs4 import BeautifulSoup

In [6]:
# Display the total number of CoPs available.
# The count is read from the first <h2> heading of the listing page, where it
# follows a ": " separator.

gc_url = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page=1&per_page=250"
gc_base_url = "https://www.unglobalcompact.org"
gc_home = requests.get(gc_url, timeout=60)
# Stop here on an HTTP error rather than trying to parse an error page below.
gc_home.raise_for_status()

soup = BeautifulSoup(gc_home.content, 'lxml')
# get_text() also works when the heading contains nested tags, where .string is None.
header = soup.h2.get_text() if soup.h2 is not None else ""

count_match = re.search(r'(?<=: )[0-9]+', header)
if count_match is None:
    raise ValueError("Could not find the total number of COPs in the page heading: %r" % header)

# Kept as a string (the matched digits), as before.
total_num_cops = count_match[0]
print("Total number of COPs available: %s" % total_num_cops)

Total number of COPs available: 54594


In [None]:
# Download every listing page and gather them into one BeautifulSoup document.
full_gc_url_part1 = "https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active?page="
full_gc_url_part2 = "&per_page=250"

def get_link(page):
    """Fetch one page of the COP listing and return it parsed with BeautifulSoup.

    Args:
        page: page number inserted into the listing URL between
            full_gc_url_part1 and full_gc_url_part2.

    Returns:
        The BeautifulSoup document built from the response body.

    Raises:
        requests.RequestException: on a network failure, timeout or HTTP
            error status, so an error page is never mixed into the results.
    """
    r = requests.get(full_gc_url_part1 + str(page) + full_gc_url_part2, timeout=60)
    r.raise_for_status()
    return BeautifulSoup(r.content, 'lxml')

# Number of listing pages, derived from the total printed by the previous cell
# (total_num_cops) and the per_page value used in the URL: ceil(total / 250).
reports_per_page = 250
max_page = (int(total_num_cops) + reports_per_page - 1) // reports_per_page

# An explicit parser avoids bs4's "no parser was explicitly specified" warning.
gc_full_list_soup = BeautifulSoup("", "lxml")
# Listing pages are numbered from 1 (see gc_url) up to and including max_page.
for i in range(1, max_page + 1):
    gc_full_list_soup.append(get_link(i))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [None]:
# Look at the header cells ('th') currently in the combined soup.
# They are removed before the rows are read with gc_full_list_soup.find_all("tr").
element = gc_full_list_soup.find_all('th')

# Destroy every header cell
for header_cell in gc_full_list_soup.find_all('th'):
    header_cell.decompose()

# Detach every tag whose stripped text is empty
for tag in gc_full_list_soup.find_all():
    if not tag.get_text(strip=True):
        tag.extract()

# Show the cleaned document
gc_full_list_soup

In [None]:
#Check SDGs contribution from each company
def check_sdgs(profile):
    """Report which of the 17 SDGs are marked as selected on a profile page.

    The page is expected to contain exactly two ``ul.questionnaire`` lists.
    The SDG questionnaire is the first of them that has 18 ``li`` items
    (a header followed by one item per SDG); an item counts as selected when
    its ``class`` attribute contains ``selected_question``.

    Args:
        profile: parsed profile page; only ``find_all`` on it and ``get`` on
            the returned ``li`` items are used.

    Returns:
        A tuple of 17 strings, ``"yes"`` or ``"no"``, for SDG 1 to SDG 17.
        All entries are ``"no"`` when the page does not have two
        questionnaires or when neither of them has 18 items.
    """
    sdg_count = 17
    expected_items = sdg_count + 1  # 17 questions + header
    flags = ["no"] * sdg_count

    questions = profile.find_all("ul", class_='questionnaire')
    if len(questions) != 2:
        return tuple(flags)

    # Use the first questionnaire that has the expected size; if none has,
    # sdgs stays empty and every SDG is reported as "no" instead of raising.
    sdgs = []
    for questionnaire in questions:
        items = questionnaire.find_all("li")
        if len(items) == expected_items:
            sdgs = items
            break

    # Skip the header item; position n (1-based) after it belongs to SDG n.
    for index, item in enumerate(sdgs[1:]):
        # An item without a class attribute yields None, treated as "not selected".
        if 'selected_question' in (item.get('class') or []):
            flags[index] = "yes"

    return tuple(flags)

In [None]:
# Visit every listed report row, open the participant's profile page and
# record each PDF attachment found there in `pdfs`, keyed by its link.
participants = gc_full_list_soup.find_all("tr")
pdfs = {}

num_pdfs = 0
num_nonpdfs = 0
num_noreport = 0

# Matches the text inside a trailing "(...)" of an attachment's label;
# that text is stored as the report language.
langregex = re.compile(r'(?<=\()[^\)\(]+(?=\)$)')

print("Getting details of each report ...")
for participant in participants:
    cells = participant.find_all('td')
    profile_anchor = cells[0].a if cells else None
    # Rows lacking the four data cells or a profile link cannot be indexed;
    # skip them instead of failing with IndexError/AttributeError.
    if len(cells) < 4 or profile_anchor is None or not profile_anchor.get('href'):
        continue

    company = cells[0].get_text(strip=True)
    sector = cells[1].get_text(strip=True)
    country = cells[2].get_text(strip=True)
    year = cells[3].get_text(strip=True)

    participant_entry_url = gc_base_url + profile_anchor.get('href')
    participant_profile = requests.get(participant_entry_url)
    participant_profile_soup = BeautifulSoup(participant_profile.content, 'lxml')

    # Tuple of 17 "yes"/"no" flags, one per SDG, in SDG order.
    participant_sdgs = check_sdgs(participant_profile_soup)

    main_body = participant_profile_soup.find("section", class_='main-content-body')
    # A profile page without the main section is treated as having no report.
    list_items = main_body.find_all("li") if main_body is not None else []
    found_report = False
    for li in list_items:
        link = li.a.get('href') if li.a else None
        if not link or "/attachments/" not in link:
            continue
        found_report = True
        if ".pdf" not in link:
            num_nonpdfs += 1
            continue

        # Drop the query string so the link can be used as a stable key.
        link = link.split('?')[0]
        num_pdfs += 1
        language_match = langregex.search(li.get_text(strip=True))
        # A label without a trailing "(language)" no longer aborts the run.
        language = language_match[0] if language_match else "Unknown"

        entry = {"company": company, "sector": sector, "country": country, "year": year, "language": language}
        for sdg_number, flag in enumerate(participant_sdgs, start=1):
            entry["sdgs%d" % sdg_number] = flag
        pdfs[link] = entry
        print(".", end='')
    if not found_report:
        num_noreport += 1
print(" done.")
print("PDFs: %d, non-PDFs: %d, no-report: %d" % (num_pdfs, num_nonpdfs, num_noreport))

In [None]:
# Persist the report index as a tab-separated UTF-8 file so it can be reloaded later
reports_index_csv_filename = "reports_index.csv"

# One row per entry of `pdfs`; the dictionary keys become the index column
df_pdfs = pd.DataFrame.from_dict(data=pdfs, orient='index')
df_pdfs.to_csv(
    path_or_buf=reports_index_csv_filename,
    encoding='utf-8',
    sep='\t',
)


---
## Starting point if a reports_index.csv file already exists
This can be used when an index file is available (i.e. it has been saved previously). Only run the following cell if starting from this point; otherwise skip it.

In [None]:
# Reload a previously saved report index
reports_index_csv_filename = "reports_index.csv"

# The first column is used as the index; 'year' is read with dtype object
# rather than being inferred as a number.
df_pdfs = pd.read_csv(
    filepath_or_buffer=reports_index_csv_filename,
    encoding='utf-8',
    sep='\t',
    dtype={'year': object},
    index_col=0,
)
# Back to a dictionary keyed by the index values, one inner dict per row
pdfs = df_pdfs.to_dict(orient='index')
df_pdfs

In [None]:
# Count the indexed reports per company, country, sector, year and language
companies, countries, sectors, years, languages = {}, {}, {}, {}, {}

for pdf, details in pdfs.items():
    company = details["company"]
    language = details["language"]
    year = details["year"]
    country = details["country"]
    sector = details["sector"]

    # Bump the matching counter in each tally, starting from 0 for new keys
    for tally, value in (
        (companies, company),
        (sectors, sector),
        (countries, country),
        (years, year),
        (languages, language),
    ):
        tally[value] = tally.get(value, 0) + 1
 

## 2. Selecting COP reports that match required criteria (up to focus_year, written in focus_language)

In [None]:
# Keep only the reports written in the focus language and dated up to the
# focus year, counting them per company, sector, country, year and country/year.
selected_companies, selected_sectors, selected_countries = {}, {}, {}
selected_years, selected_countries_years = {}, {}

selected_pdfs = {}

for pdf, details in pdfs.items():
    company = details["company"]
    language = details["language"]
    year = details["year"]
    country = details["country"]
    sector = details["sector"]

    # Guard clause: skip reports in another language or later than the focus year
    if language != language_ref[focus_language]['name'] or int(year) > int(focus_year):
        continue

    selected_pdfs[pdf] = details

    for tally, value in (
        (selected_companies, company),
        (selected_sectors, sector),
        (selected_countries, country),
        (selected_years, year),
    ):
        tally[value] = tally.get(value, 0) + 1

    # Nested tally: country -> year -> number of reports
    per_year = selected_countries_years.setdefault(country, {})
    per_year[year] = per_year.get(year, 0) + 1

In [None]:
# Summarise how many reports passed the year/language selection
summary_values = (len(selected_pdfs), focus_year, language_ref[focus_language]['name'])
print("There are %d reports up to %s written in %s" % summary_values)

## 3. Downloading PDF file for each COP report that matches required criteria
At this time we've only considered reports written in the focus language and submitted up to end of the focus year.

A folder should be specified as the location where PDFs will be downloaded to ('pdfs_folder' variable below).

If this process has been run before and some files are already available in the specified folder, they won't be downloaded again.

In [None]:
# Folder the PDFs are downloaded to. Keep the trailing slash: file names are appended to this string directly.
pdfs_folder = "../data/pdf/"

In [None]:
# Download every selected PDF into pdfs_folder, skipping files already present.

# Captures the last path segment of a link, used as the local file name.
filenameregex = re.compile(r'(?<=/)[^$/]+(?=$)')

# Creates missing parent folders too and is a no-op when the folder exists.
os.makedirs(pdfs_folder, exist_ok=True)

for pdf in selected_pdfs.keys():
    name_match = filenameregex.search(pdf)
    if name_match is None:
        # No usable file name in the link (e.g. it ends with a slash).
        print("Could not save %s" % (pdf))
        continue
    filename = pdfs_folder + name_match[0]

    if os.path.isfile(filename):
        print("Skipping %s, PDF already available in folder" % (filename))
        continue

    print("Saving %s" % (filename))
    # Download into a temporary name first so that an interrupted transfer
    # is never mistaken for a complete PDF on the next run.
    partial_filename = filename + ".part"
    try:
        # The context manager releases the connection even when an error occurs.
        with requests.get('https:' + pdf, stream=True, timeout=60) as response:
            # Do not store an HTTP error page under a .pdf name.
            response.raise_for_status()
            with open(partial_filename, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=65536):
                    out_file.write(chunk)
        os.replace(partial_filename, filename)
    except (requests.RequestException, OSError):
        print("Could not save %s" % (filename))
        if os.path.isfile(partial_filename):
            os.remove(partial_filename)