In [131]:
import pandas as pd
import PyPDF2
import pdfplumber
import re

In [132]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        num_pages = len(pdf_reader.pages) # If required.
        
        for page_num in range(12,34): # selected specifically for the glossary
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    
    return text

# Usage
pdf_path = '9789240105485-eng.pdf'
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

    1Glossary terms in alphabetical order  
A
Accountability
Answerability or legal responsibility for 
identifying and removing obstacles and 
barriers to health  services. This should 
include responding to findings from 
monitoring and evaluation (1).
Accuracy
The degree to which a  measurement, test 
or procedure correctly reflect the true 
value or condition (2).
Administrative area
Delineated geographical areas within 
a particular territory created for the 
purpose of  administration. For  example: 
county, district, province, state, region, 
subnational or national (3).
Age groupings
The combining of ages into groups for 
the purpose of data  analysis. These 
groupings usually capture a time interval 
representing a developmental stage in 
the life course of a  human. The actual 
aggregation used depends on the purpose 
of the analysis and the sample size (4).
Age-specific mortality rate
A mortality rate of a particular age  group. 
The numerator is the number of deaths 
in tha

In [133]:
# Go line by line
lines = extracted_text.split('\n')
lines_to_keep = [line for line in lines if len(line) > 1]

exclude = "Glossary terms in alphabetical order"

lines_cleaned = [line for line in lines_to_keep if exclude not in line]

summary_dict = {}
lowercase_letters = tuple('abcdefghijklmnopqrstuvwxyz')
key = "NA"

for line in lines_cleaned:
    if len(line.split(" ")) > 5:
        # Ensure the key exists in the dictionary
        if key not in summary_dict:
            summary_dict[key] = []
        summary_dict[key].append(line)
    else:
        if line.startswith(lowercase_letters):
            # Ensure the key exists in the dictionary
            if key not in summary_dict:
                summary_dict[key] = []
            summary_dict[key].append(line)
        else:
            key = line

print(summary_dict)
# Convert to DataFrame
        

{'Accountability': ['Answerability or legal responsibility for ', 'identifying and removing obstacles and ', 'barriers to health  services. This should ', 'include responding to findings from ', 'monitoring and evaluation (1).'], 'Accuracy': ['The degree to which a  measurement, test ', 'or procedure correctly reflect the true ', 'value or condition (2).'], 'Delineated geographical areas within ': ['a particular territory created for the ', 'purpose of  administration. For  example: ', 'county, district, province, state, region, ', 'subnational or national (3).'], 'Age groupings': ['The combining of ages into groups for ', 'the purpose of data  analysis. These ', 'groupings usually capture a time interval ', 'representing a developmental stage in ', 'the life course of a  human. The actual ', 'aggregation used depends on the purpose ', 'of the analysis and the sample size (4).'], 'Age-specific mortality rate': ['A mortality rate of a particular age  group. ', 'The numerator is the numb

In [134]:
definitions_df = pd.DataFrame({
    'Term': summary_dict.keys(),
    'Description': [' '.join(values) for values in summary_dict.values()]
})
definitions_df.to_excel('definitions.xlsx', index=False,overwrite=False)

In [172]:
# Extract tables using pdfplumber

def extract_table_from_pdf(pdf_path):
    tables = []
    with open(pdf_path, 'rb') as file: 
        pdf = pdfplumber.open(file)
        for page in pdf.pages:
            if page.page_number> 33:
                for table in page.extract_tables():
                    tables.append(table)
    return tables

tables = extract_table_from_pdf(pdf_path)


In [175]:
for table in tables[:5]:
    print(table)

[['Thematic area', 'Relevant terms', None, None], ['Data source,\ncollection, and\ncollation', 'Data accessibility', 'Data type', 'Primary data'], [None, 'Data collection level', 'Digital health', 'Response rate'], [None, 'Data collection\nmethod', 'Geospatial data', 'Secondary data'], [None, 'Data element', 'Health data', 'Sex'], [None, 'Data information\npyramid', 'Home-based record', 'Survey'], [None, 'Data inputs', 'Measurement level', 'Survey data'], [None, 'Data integration', 'Microdata', 'Surveillance'], [None, 'Data life cycle', 'Population-based\nsurvey', 'Understandable/\nsimplicity'], [None, 'Data source', 'Population census', 'Unit of measure'], [None, 'Data standards', 'Preferred data sources', ''], ['Data security,\nprotection,\ngovernance and\nstandards', 'Compliance', 'Data steward', 'Mortality data'], [None, 'Confidentiality', 'Data stewardship', 'National health\nstrategic plan'], [None, 'Data access control', 'Focal point', 'National statistics\noffice'], [None, 'Dat

In [177]:
thematic_areas = []
for table in tables[:5]:
    for row in table:
        if row[0] not in ["",None, "Thematic area"]:
            thematic_areas.append(row[0])

# Remove \n from the thematic areas list
thematic_areas = [area.replace("\n"," ") for area in thematic_areas]
thematic_areas

['Data source, collection, and collation',
 'Data security, protection, governance and standards',
 'Disaggregation of public health data/ indicators',
 'Estimation methods',
 'Health Information systems',
 'Indicator Terminology',
 'Methods to assess data quality',
 'Monitoring and evaluation terms',
 'Quality of Care measurement',
 'Statistical and analytical terms',
 'For further information, please contact: Data Hub and Spoke Collaborative Secretariat World Health Organization Email: datagovernance@who.int']

In [186]:
# Extract the inner table from the nested list
inner_table = tables[0]  # Access the first element of the outer list

for inner_table in tables[:5]:
    # Initialize variables
    structured_data = {}
    current_thematic_area = None

    # Process each row in the inner table
    for row in inner_table:
        # Check if the first column has a valid string (not None) and is not "Thematic area"
        if isinstance(row[0], str) and row[0] != "Thematic area":
            # A new thematic area is found
            current_thematic_area = row[0]
            structured_data[current_thematic_area] = []
        elif row[0] is None and current_thematic_area:
            # Add themes to the current thematic area
            structured_data[current_thematic_area].append(row[1:])

    # Print the structured data
    for thematic_area, themes in structured_data.items():
        print(f"Thematic Area: {thematic_area}")
        for theme in themes:
            print(f"  - {theme}")

Thematic Area: Data source,
collection, and
collation
  - ['Data collection level', 'Digital health', 'Response rate']
  - ['Data collection\nmethod', 'Geospatial data', 'Secondary data']
  - ['Data element', 'Health data', 'Sex']
  - ['Data information\npyramid', 'Home-based record', 'Survey']
  - ['Data inputs', 'Measurement level', 'Survey data']
  - ['Data integration', 'Microdata', 'Surveillance']
  - ['Data life cycle', 'Population-based\nsurvey', 'Understandable/\nsimplicity']
  - ['Data source', 'Population census', 'Unit of measure']
  - ['Data standards', 'Preferred data sources', '']
Thematic Area: Data security,
protection,
governance and
standards
  - ['Confidentiality', 'Data stewardship', 'National health\nstrategic plan']
  - ['Data access control', 'Focal point', 'National statistics\noffice']
  - ['Data anonymization', 'International\nClassification of\nDiseases', 'Notifiable conditions']
  - ['Data consent', 'International Health\nRegulations', 'Publicly available']


In [185]:
# Extract the inner table from the nested list
inner_table = tables[0]  # Access the first element of the outer list

# Initialize variables
structured_data = {}
current_thematic_area = None

# Process each row in the inner table
for row in inner_table:
    # Check if the first column has a valid string (not None) and is not "Thematic area"
    if isinstance(row[0], str) and row[0] != "Thematic area":
        # A new thematic area is found
        current_thematic_area = row[0]
        structured_data[current_thematic_area] = []
    elif row[0] is None and current_thematic_area:
        # Add terms under the current thematic area
        structured_data[current_thematic_area].extend(row[1:])

# Flatten the structured data into a tidy format
data = []
for thematic_area, terms in structured_data.items():
    for term in terms:
        # Add only non-empty terms to the DataFrame
        if term:
            data.append([thematic_area, term])

# Create a DataFrame in tidy format
df = pd.DataFrame(data, columns=["Thematic Area", "Term"])

# Display the DataFrame
print(df)

                                        Thematic Area  \
0            Data source,\ncollection, and\ncollation   
1            Data source,\ncollection, and\ncollation   
2            Data source,\ncollection, and\ncollation   
3            Data source,\ncollection, and\ncollation   
4            Data source,\ncollection, and\ncollation   
5            Data source,\ncollection, and\ncollation   
6            Data source,\ncollection, and\ncollation   
7            Data source,\ncollection, and\ncollation   
8            Data source,\ncollection, and\ncollation   
9            Data source,\ncollection, and\ncollation   
10           Data source,\ncollection, and\ncollation   
11           Data source,\ncollection, and\ncollation   
12           Data source,\ncollection, and\ncollation   
13           Data source,\ncollection, and\ncollation   
14           Data source,\ncollection, and\ncollation   
15           Data source,\ncollection, and\ncollation   
16           Data source,\ncoll

In [188]:
# Initialize an empty DataFrame to store all data
all_data = []

# Loop through each table in the outer list
for inner_table in tables[:5]:
    # Initialize variables for each inner table
    structured_data = {}
    current_thematic_area = None

    # Process each row in the inner table
    for row in inner_table:
        # Check if the first column has a valid string (not None) and is not "Thematic area"
        if isinstance(row[0], str) and row[0] != "Thematic area":
            # A new thematic area is found
            current_thematic_area = row[0]
            structured_data[current_thematic_area] = []
        elif row[0] is None and current_thematic_area:
            # Add terms under the current thematic area
            structured_data[current_thematic_area].extend(row[1:])

    # Flatten the structured data from the current table
    for thematic_area, terms in structured_data.items():
        for term in terms:
            # Add only non-empty terms to the data list
            if term:
                all_data.append([thematic_area, term])

# Create a final DataFrame in tidy format
df = pd.DataFrame(all_data, columns=["Thematic Area", "Term"])

# Display the combined DataFrame
print(df)

# Optional: Save to a CSV file
df.to_csv("tidy_data_themes.csv", index=False)

                                Thematic Area                          Term
0    Data source,\ncollection, and\ncollation         Data collection level
1    Data source,\ncollection, and\ncollation                Digital health
2    Data source,\ncollection, and\ncollation                 Response rate
3    Data source,\ncollection, and\ncollation       Data collection\nmethod
4    Data source,\ncollection, and\ncollation               Geospatial data
..                                        ...                           ...
182         Statistical and\nanalytical terms  Years lived with\ndisability
183         Statistical and\nanalytical terms                   Interaction
184         Statistical and\nanalytical terms         Processed health data
185         Statistical and\nanalytical terms         Intervention coverage
186         Statistical and\nanalytical terms                    Proportion

[187 rows x 2 columns]
