# **NDC analysis.** Keyword Search

In [1]:
##Import libraries
import os
import re
import pandas as pd
from pdfminer.high_level import extract_text

In [4]:
# Folder holding the updated NDC submissions. The search functions below read
# every file in it whose name ends in ".pdf" and use the file name (without
# ".pdf") as the country name.
ndcs_updated = "../data/NDCs"

Of note, Mali's and Nigeria's updated NDC submissions are not searchable. Mali's updated NDC contains pictures that, unlike the pictures in other NDC submissions, could not be converted to text using OCR. Nigeria's updated NDC is highlightable but not searchable, probably due to non-standard font encoding. As a result, the keyword search returns no results for these two NDCs.

## Definition of the keywords

In [10]:
# Keyword lists, one per theme. Keywords are written in lower case; entries
# within a list should be unique so that no keyword is counted twice.
kw_decent_work = ['decent work', 'decent job', 'decent jobs', 'good job', 'good jobs',
                  'quality job', 'quality jobs']

kw_green_jobs = ['green job', 'green jobs']

kw_employment = ['job', 'jobs', 'employment', 'livelihood', 'employment loss', 'job loss', 'job losses']


kw_justtransition = ['just transition', 'just energy transition',
                     'fair transition', 'inclusive transition']

kw_social_inclusion = ['social inclusion', 'social justice', 'social equity', 'non-discrimination',
                       'equal access opportunities', 'equal access opportunity', 'intersectionalities',
                       'intersectionality', 'discrimination', 'equitable distribution', 'equitable outcome']

# NOTE: every entry must be followed by a comma. Two adjacent string literals
# without a comma are silently concatenated by Python into a single keyword.
# The last two entries deliberately use the typographic apostrophe (’).
kw_organizations = ["employers' organizations", 'employers organizations', 'business membership organizations',
                    "workers' organizations", "workers organizations", 'trade unions', 'business organizations',
                    'business associations', 'trade associations', 'employers associations',  "workers associations",
                    'industry group', 'representative of workers', 'representatives of employers', 'workers unions',
                    'workers representatives', "employers’ representatives", "workers’ associations"]


kw_stakeholders_engagement = ['stakeholders engagement', 'public engagement', 'stakeholder consultation', 'public consultation',
                            'public participation', 'citizen participation', 'stakeholder involvement', 'involvement of stakeholders', 'involved stakeholders',
                            'consultative process', 'civil society consultation', 'civil society participation', 'participation of actors',
                            'participation of citizens']

kw_ILO = ['ilo', 'international labor organization', 'international labour organization', 'international labour organisation',
          'international labor organisation', 'international labour office']

## Keyword search function

This function searches through all the Nationally Determined Contributions in the specified folder and returns, for each country, which keywords were found, how many times, and the pages on which each keyword appears. Only whole-word matches are counted: if a keyword appears inside another word in the document, it is not counted.

The code is based on Dan Luca Fulger's work.

In [11]:
def search_keywords_in_pdf(pdf_file, keywords):
    """Search one PDF for each keyword and report where it occurs.

    The extracted text and the keywords are both lower-cased, so matching is
    case-insensitive. A keyword only counts when it is delimited by regex
    word boundaries on both sides.

    Args:
        pdf_file: Path of the PDF to search.
        keywords: Iterable of keyword strings.

    Returns:
        A list with one dict per keyword found at least once, with the keys
        'PDF Name' (file name without directory), 'Keyword',
        'Page Numbers' (comma-separated page number of every match, in
        ascending order) and 'Total Words Found' (number of matches).
        Keywords with no match are omitted.
    """
    document_text = extract_text(pdf_file).lower()
    hits = []

    for keyword in keywords:
        whole_word = re.compile(r'\b{}\b'.format(re.escape(keyword.lower())))

        # Each form feed before a match is treated as a page break, so the
        # page of a match is the number of preceding '\f' characters plus one.
        match_pages = [
            document_text.count('\f', 0, found.start()) + 1
            for found in whole_word.finditer(document_text)
        ]
        if not match_pages:
            continue

        hits.append({
            'PDF Name': os.path.basename(pdf_file),
            'Keyword': keyword,
            'Page Numbers': ', '.join(map(str, sorted(match_pages))),
            'Total Words Found': len(match_pages)
        })

    return hits

def search_keywords_in_multiple_pdfs(pdf_folder, keywords):
    """Run the keyword search on every PDF directly inside a folder.

    Only entries whose name ends in '.pdf' (case-sensitive) are searched;
    sub-folders are not descended into.

    Args:
        pdf_folder: Folder containing the PDF files.
        keywords: Iterable of keyword strings, passed on unchanged.

    Returns:
        One flat list with the result dicts of all PDFs, in the order the
        files are listed by the operating system.
    """
    combined = []
    for entry in os.listdir(pdf_folder):
        if not entry.endswith('.pdf'):
            continue
        full_path = os.path.join(pdf_folder, entry)
        combined += search_keywords_in_pdf(full_path, keywords)
    return combined

def _strip_pdf_suffix(file_name):
    """Return file_name without a trailing '.pdf' (unchanged if absent)."""
    suffix = '.pdf'
    return file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name


def analyze_pdfs(pdf_folder, keywords):
    """Summarise, per country, which keywords occur in the PDFs of a folder.

    Every file in ``pdf_folder`` whose name ends in '.pdf' is treated as one
    country, named after the file without the '.pdf' suffix.

    Args:
        pdf_folder: Folder containing one PDF per country.
        keywords: Iterable of keyword strings. Repeated entries are searched
            only once so that their hits are not counted twice.

    Returns:
        A DataFrame with one row per PDF and the columns
        'Country', 'unique_keyword_count', 'unique_keywords' (list of the
        keywords found), 'pages' (one list of page-number strings per found
        keyword, in the same order as 'unique_keywords') and 'total_words'
        (total number of matches). Countries with at least one hit come
        first, followed by the countries without any hit (zeros and empty
        lists), sorted by name. A DataFrame is always returned, including
        when no keyword is found anywhere or the folder holds no PDF.
    """
    output_columns = ['Country', 'unique_keyword_count', 'unique_keywords', 'pages', 'total_words']

    pdf_names = [name for name in os.listdir(pdf_folder) if name.endswith('.pdf')]
    if not pdf_names:
        # Nothing to search: return an empty table with the usual columns.
        return pd.DataFrame(columns=output_columns)
    all_countries = [_strip_pdf_suffix(name) for name in pdf_names]

    # Drop repeated keywords (keeping the first occurrence's position) so a
    # keyword listed twice does not have its matches summed twice below.
    distinct_keywords = list(dict.fromkeys(keywords))
    results = search_keywords_in_multiple_pdfs(pdf_folder, distinct_keywords)

    final_df = None
    if results:
        df = pd.DataFrame(results)
        # One row per (PDF, keyword); groupby also sorts by PDF and keyword.
        df = df.groupby(['PDF Name', 'Keyword'], as_index=False)[['Page Numbers', 'Total Words Found']].agg({'Page Numbers': ', '.join, 'Total Words Found': 'sum'})
        keyword_counts = df.groupby('PDF Name')['Keyword'].nunique().reset_index()
        keyword_counts.columns = ['PDF Name', 'unique_keyword_count']
        unique_keywords = df.groupby('PDF Name').agg({'Keyword': lambda x: list(x),
                                                      'Page Numbers': lambda x: [pages.split(', ') for pages in x],
                                                      'Total Words Found': 'sum'}).reset_index()
        unique_keywords.columns = ['PDF Name', 'Unique Keywords', 'Page Numbers', 'Total Words Found']
        final_df = pd.merge(keyword_counts, unique_keywords, on='PDF Name')
        # Strip the suffix literally; Series.str.replace('.pdf', '') would
        # interpret '.' as a regex wildcard on older pandas versions.
        final_df['PDF Name'] = final_df['PDF Name'].map(_strip_pdf_suffix)
        final_df = final_df.rename(columns={'PDF Name': 'Country', 'Total Words Found': 'total_words',
                                            'Page Numbers': 'pages', 'Unique Keywords': 'unique_keywords'})
        final_df = final_df[output_columns]

    countries_with_keywords = set() if final_df is None else set(final_df['Country'])
    # Sorted so that the row order does not depend on set iteration order.
    countries_without_keywords = sorted(set(all_countries) - countries_with_keywords)

    no_keywords_df = pd.DataFrame(
        {'Country': countries_without_keywords,
         'unique_keyword_count': [0] * len(countries_without_keywords),
         'unique_keywords': [[] for _ in countries_without_keywords],
         'pages': [[] for _ in countries_without_keywords],
         'total_words': [0] * len(countries_without_keywords)},
        columns=output_columns)

    if final_df is None:
        return no_keywords_df
    if no_keywords_df.empty:
        # Avoid concatenating an empty frame, which can alter column dtypes.
        return final_df
    return pd.concat([final_df, no_keywords_df], ignore_index=True)

## Excel file production

Remark: The following code takes a long time to execute (>30min). Avoid re-runs if possible.

In [12]:
# Keyword lists and the names of the dataframes that will hold their results.
# The two lists are paired by position.
keyword_lists = [kw_decent_work, kw_green_jobs, kw_employment, kw_stakeholders_engagement,
                 kw_ILO, kw_justtransition, kw_social_inclusion, kw_organizations]
dataframes_names = ['df_decent_work', 'df_green_jobs', 'df_employment', 'df_stakeholders_engagement',
                  'df_ILO', 'df_justtransition', 'df_social_inclusion', 'df_organizations']

# Index of the last keyword list, shown as the total in the progress output.
last_index = len(keyword_lists) - 1

for i, (df_name, keywords) in enumerate(zip(dataframes_names, keyword_lists)):
    # Progress indicator: "<current index> /<last index>"
    print(i, '/{}'.format(last_index))
    # Call the analyze_pdfs function with the current keyword list
    dataframe = analyze_pdfs(ndcs_updated, keywords)

    # Publish the result as a module-level variable named after the keyword
    # list. If analyze_pdfs returned None (nothing found), store None
    # instead of failing on .copy().
    globals()[df_name] = dataframe.copy() if dataframe is not None else None

0 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


1 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


2 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


3 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


4 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


5 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


6 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


7 /7


The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Argentina.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/atoure/Desktop/PASU/EPFL-ILO/NDC submissions/NDCs_1207/Ethiopia.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [13]:
dataframes_suffix = ['_green_jobs', '_employment', '_stakeholders_engagement',
                  '_ILO', '_justtransition', '_social_inclusion', '_organizations']

dataframes = [df_green_jobs, df_employment, df_stakeholders_engagement,
                  df_ILO, df_justtransition, df_social_inclusion, df_organizations]


def _suffix_columns(frame, suffix):
    """Return a copy of frame with suffix appended to every column but 'Country'."""
    return frame.rename(columns=lambda col: col if col == 'Country' else col + suffix)


# Start with one dataframe, its columns labelled with its own suffix
merged_df = _suffix_columns(df_decent_work, '_decent_work')

# Label each remaining dataframe with its suffix *before* merging. This gives
# the same column names and order as relying on pd.merge's collision suffixes
# and renaming the leftover columns afterwards, but stays correct when one of
# the dataframes is None and is skipped.
for suffix, frame in zip(dataframes_suffix, dataframes):
    if frame is None:
        continue
    merged_df = pd.merge(merged_df, _suffix_columns(frame, suffix), on='Country', how='outer')

In [14]:
# Export to Excel, leaving out the per-theme keyword counts and page lists
# (every column whose name starts with one of the prefixes below).
excluded_prefixes = ('unique_keyword_count', 'pages')

columns_to_include = []
for col in merged_df.columns:
    if col.startswith(excluded_prefixes):
        continue
    columns_to_include.append(col)

export_df = merged_df[columns_to_include]
export_df.to_excel('../proc/keywords_search_2808.xlsx', index=False)

In [7]:
# Preview the first rows of the merged table.
merged_df.head()

Unnamed: 0,Country,unique_keywords_decent_work,total_words_decent_work,unique_keywords_green_jobs,total_words_green_jobs,unique_keywords_employment,total_words_employment,unique_keywords_stakeholders_engagement,total_words_stakeholders_engagement,unique_keywords_ILO,total_words_ILO,unique_keywords_justtransition,total_words_justtransition,unique_keywords_social_inclusion,total_words_social_inclusion,unique_keywords_organizations,total_words_organizations
0,Afghanistan,[],0,[],0,[],0,[],0,[],0,[],0,[],0,[],0
1,Albania,[],0,[],0,"['employment', 'jobs', 'livelihood']",16,[],0,[],0,['just transition'],1,['discrimination'],1,[],0
2,Algeria,[],0,[],0,['employment'],1,[],0,[],0,[],0,[],0,[],0
3,Andorra,[],0,[],0,"['employment', 'jobs']",2,[],0,[],0,[],0,[],0,[],0
4,Angola,[],0,[],0,"['employment', 'jobs']",4,[],0,[],0,[],0,[],0,[],0
