## Collecting UN Global Compact Communication of Progress (COP) Annual Reports

Web scraping of all COP reports submitted up to 2021 from [here](https://www.unglobalcompact.org/participation/report/cop/create-and-submit/active). Only reports submitted in the focus language selected below (English by default) are taken into account.

Last updated on August 19, 2021

Please specify below the focus year of this analysis:

In [1]:
# Focus year of the analysis, stored as a string; it is converted with int()
# where reports are filtered by year.
focus_year = "2021"

Please select the focus language using one of the following values:
- en (English, default option)
- de (German)
- es (Spanish)
- fr (French)
- pt (Portuguese)

In [2]:
# Code of the focus language; it is used as a key into `language_ref`,
# so it must be one of the codes listed above.
focus_language = 'en'

In [3]:
# Per-language settings keyed by the two-letter language code.
#   name            -> language name, compared against the `language` field of each report
#   min_coocurrence -> presumably a minimum co-occurrence threshold used later
#                      in the analysis -- TODO confirm against its consumer
language_ref = {
    code: {'name': name, 'min_coocurrence': threshold}
    for code, name, threshold in (
        ('en', 'English', 10),
        ('de', 'German', 2),
        ('es', 'Spanish', 2),
        ('fr', 'French', 2),
        ('pt', 'Portuguese', 2),
    )
}

In [4]:
import requests
import re
import pandas as pd
import PyPDF2
import shutil
import nltk
import os
import os.path
import requests
from urllib.request import urlretrieve
from bs4 import BeautifulSoup


---
## Starting point if a reports_index.csv file already exists
This can be used when an index file is available (has been saved previously). Only run this cell if starting from this point, otherwise skip it. 

In [5]:
# Load the previously saved report index. The file is tab-separated even
# though it carries a .csv extension.
reports_index_csv_filename = "reports_index.csv"

df_pdfs = pd.read_csv(
    reports_index_csv_filename,
    sep='\t',
    encoding='utf-8',
    index_col=0,             # first column of the file becomes the index
    dtype={'year': object},  # do not let pandas infer a numeric dtype for `year`
)

# Nested mapping: {index value: {column name: cell value}}.
pdfs = df_pdfs.to_dict(orient='index')

df_pdfs

Unnamed: 0,ID,company,sector,country,year,published,language,sdgs1,sdgs2,sdgs3,...,sdgs8,sdgs9,sdgs10,sdgs11,sdgs12,sdgs13,sdgs14,sdgs15,sdgs16,sdgs17
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/502233/original/Gestion Responsable Stratego 2020.pdf,456892,STRATEGO,Support Services,Panama,2021,2021/09/02,Spanish,yes,no,yes,...,yes,no,yes,no,yes,no,no,no,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/502231/original/SR21_E_All.pdf,456891,Nissan Motor Co. Ltd.,Automobiles & Parts,Japan,2021,2021/09/02,English,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/502232/original/SR21_J_All.pdf,456891,Nissan Motor Co. Ltd.,Automobiles & Parts,Japan,2021,2021/09/02,Japanese,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/502229/original/Integrated report_2020.pdf,456890,Sapporo Holdings Limited,Beverages,Japan,2021,2021/09/02,Japanese,no,yes,yes,...,yes,no,yes,yes,yes,yes,yes,yes,no,yes
//ungc-production.s3.us-west-2.amazonaws.com/attachments/cop_2021/502230/original/sapporo_sustainability_book_2021.pdf,456890,Sapporo Holdings Limited,Beverages,Japan,2021,2021/09/02,Japanese,no,yes,yes,...,yes,no,yes,yes,yes,yes,yes,yes,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
//ungc-production.s3.us-west-2.amazonaws.com/attachments/9368/original/cop_association_cohub_2010.pdf,10835,Association COHUB,Not Applicable,Benin,2011,2011/02/23,French,no,no,no,...,no,no,no,no,no,no,no,no,no,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/9369/original/RAPPORT_COHUB_2010.pdf,10835,Association COHUB,Not Applicable,Benin,2011,2011/02/23,French,no,no,no,...,no,no,no,no,no,no,no,no,no,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/9370/original/PLAN_D_ACTIONS_2011_-_COHUB.pdf,10835,Association COHUB,Not Applicable,Benin,2011,2011/02/23,French,no,no,no,...,no,no,no,no,no,no,no,no,no,no
//ungc-production.s3.us-west-2.amazonaws.com/attachments/9365/original/RAPPORT_COHUB_2010.pdf,10833,Association COHUB,Not Applicable,Benin,2011,2011/02/23,French,no,no,no,...,no,no,no,no,no,no,no,no,no,no


In [23]:
# Tally, over every indexed report, how often each company, country, sector,
# year and language occurs (attribute value -> number of reports).
companies, countries, sectors, years, languages = {}, {}, {}, {}, {}

for pdf, _report in pdfs.items():
    company = _report["company"]
    language = _report["language"]
    year = _report["year"]
    country = _report["country"]
    sector = _report["sector"]

    # Add one to each tally under this report's attribute value.
    for _tally, _value in (
        (companies, company),
        (sectors, sector),
        (countries, country),
        (years, year),
        (languages, language),
    ):
        _tally[_value] = _tally.get(_value, 0) + 1
 

In [24]:
# Tallies restricted to the reports that pass the language/year filter below
# (attribute value -> number of selected reports).
selected_companies = {}
selected_sectors = {}
selected_countries = {}
selected_years = {}
selected_countries_years = {}  # country -> {year -> number of selected reports}

# Subset of `pdfs` that passes the filter, keyed exactly like `pdfs`.
selected_pdfs = {}

# Filter values do not change inside the loop, so resolve them once.
focus_language_name = language_ref[focus_language]['name']
focus_year_int = int(focus_year)

for pdf in pdfs.keys():
    company = pdfs[pdf]["company"]
    language = pdfs[pdf]["language"]
    year = pdfs[pdf]["year"]
    country = pdfs[pdf]["country"]
    sector = pdfs[pdf]["sector"]

    # Keep reports written in the focus language whose year is the focus year
    # or earlier (note the <=: this is "up to", not "only", the focus year).
    if language == focus_language_name and int(year) <= focus_year_int:
        selected_pdfs[pdf] = pdfs[pdf]

        selected_companies[company] = selected_companies.get(company, 0) + 1
        selected_sectors[sector] = selected_sectors.get(sector, 0) + 1
        selected_countries[country] = selected_countries.get(country, 0) + 1
        selected_years[year] = selected_years.get(year, 0) + 1

        # Create the per-country year tally on first sight, then bump the year.
        country_years = selected_countries_years.setdefault(country, {})
        country_years[year] = country_years.get(year, 0) + 1

In [19]:
# Relative path of the PDF folder -- presumably where the downloaded report
# PDFs are stored; verify against the download step.
pdfs_folder = "../data/pdf/"

In [20]:
# Relative path of the folder whose files are read as report text when the
# text DataFrame is built.
txts_folder = "../data/txt/"

### Analyzing the text of reports: DataFrame

In [21]:
# Show the stat result for the PDF folder. os.stat raises if the path does not
# exist, so this presumably doubles as an existence check -- confirm intent.
os.stat(pdfs_folder)

os.stat_result(st_mode=16895, st_ino=3659174697993395, st_dev=517161065, st_nlink=1, st_uid=0, st_gid=0, st_size=0, st_atime=1629353715, st_mtime=1629353715, st_ctime=1629353715)

In [33]:
from collections import defaultdict
from pathlib import Path
import pandas as pd

# Column name -> list of values, one entry per text file. Both columns are
# created up front so that an empty folder still produces a DataFrame with the
# `file_name` and `text` columns.
results = defaultdict(list, file_name=[], text=[])

# iterdir() yields entries in arbitrary order; sorting makes the row order
# reproducible from one run (or machine) to the next.
for file in sorted(Path(txts_folder).iterdir()):
    # Skip anything that is not a regular file (for example a
    # .ipynb_checkpoints sub-directory): open() on a directory raises.
    if not file.is_file():
        continue
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm it matches the encoding the text files were written with.
    with open(file, "r") as file_open:
        results["file_name"].append(file.name)
        results["text"].append(file_open.read())

# One row per text file: its file name and its full text content.
df = pd.DataFrame(results)

In [35]:
# Display the text DataFrame (columns `file_name` and `text`, one row per file read).
df