In [6]:
!pip install PyPDF2
!pip install docx


import os
import PyPDF2
import pandas as pd
from pathlib import Path
import numpy as np
from docx import Document




ModuleNotFoundError: No module named 'exceptions'

### User Variables

In [7]:
"""
USER: Specify these variables
"""

folder_path = "tweede_pdfs2024"
file_year = 2024
csv_output_name = "tweede_2024"

"""
OPTION: Specify if you would like to reduce the dataset to a specific speech type. 
See the possible attributes here: https://opendata.tweedekamer.nl/documentatie/document
The code will search the first 10 lines of each document (header + a bit extra) and drop those that do not contain this string.
"""
speech_type = "tweeminutendebat"

### Functions

In [8]:
def list_files(folder_path):
    return [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"An error occurred: {e}")
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = Document(docx_path)
        for para in doc.paragraphs:
            if para.text:
                text += para.text + "\n"
    except Exception as e:
        print(f"An error occurred: {e}")
    return text

def get_first_ten_lines(text):
    lines = text.splitlines()
    first_ten = lines[:10]
    return "\n".join(first_ten)



### Code

In [9]:
# Create the list of file names and paths
file_names = list_files(folder_path)

# Create an empty pandas DataFrame with three columns: filename, year, text.
speech_info = pd.DataFrame(columns= ["filename", "year", "text"])

In [10]:
# Fill the df with info from the documents
"""
This code will take 15 or more minutes to run and will generate 227 'EOF marker not found' errors - which is fine
The documents that are generating errors are .docx files mislabled as PDFs.
"""
for name in file_names:
    temp_path = os.path.join(folder_path, name)

    extracted_text = extract_text_from_pdf(temp_path)
    text_ten = get_first_ten_lines (extracted_text)

    data = pd.Series({
        "filename": name,
        "year": file_year,
        "text": text_ten})
    
    new_row = data.to_frame().T
    speech_info = pd.concat([speech_info, new_row], ignore_index=True)

# Display the DataFrame
speech_info

An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found


Unnamed: 0,filename,year,text
0,af95ec7a-a922-46b3-81b4-4b568af2bd96.docx,2024,
1,b36f49f6-9588-4e60-b9af-04b382aa1f12.pdf,2024,33 \nFunder ingssc hade aan woning en \nVoorz...
2,a08b68b5-2be7-458c-a575-2583f979bed7.pdf,2024,13 \nWet drempelv erlaging omg ang groot- \no...
3,f77123d0-8453-46fb-ba79-26a2a46290bd.docx,2024,
4,bebe3645-1def-4cf7-999b-e0c9cfc97d09.pdf,2024,
...,...,...,...
1670,041c51b3-eba0-4688-8498-534ce53a088b.pdf,2024,
1671,23f6592d-c4cb-4d7b-b033-9e576287563a.pdf,2024,8 \nStemming en moties Artikel 100-brief \nvo...
1672,8d7e0aaa-fcf9-417d-a354-b87d2024a27d.docx,2024,
1673,91d7c755-5cf3-4793-996e-6ee4cd943d16.pdf,2024,18 \nStemming motie Wijziging van de \nWare...


In [12]:
# Change empty strings to nan
speech_info['text'] = speech_info['text'].replace(r'^\s*$', np.nan, regex=True)

# Move the EOF error files into their own df, removing them from the old one.
print("nulls in speech df before: ", speech_info['text'].isnull().sum())
print()

df_none = speech_info[speech_info['text'].isnull()].copy()
speech_info.dropna(subset=['text'], inplace=True)

print("nulls in speech df after: ", speech_info['text'].isnull().sum())
print("nulls in .docx df after: ", df_none['text'].isnull().sum())
print()
df_none

nulls in speech df before:  454

nulls in speech df after:  0
nulls in .docx df after:  454



Unnamed: 0,filename,year,text
0,af95ec7a-a922-46b3-81b4-4b568af2bd96.docx,2024,
3,f77123d0-8453-46fb-ba79-26a2a46290bd.docx,2024,
4,bebe3645-1def-4cf7-999b-e0c9cfc97d09.pdf,2024,
5,495a256c-69d9-40ed-9935-b88f642c9627.pdf,2024,
8,ac23f84c-1c6e-418b-bab5-6fdd3c313402.docx,2024,
...,...,...,...
1656,ab8b2bc7-660d-4b0f-a4c6-96bc3eae5403.docx,2024,
1666,a09bfe92-9511-4fab-9e0a-b932bf7a7dd9.docx,2024,
1670,041c51b3-eba0-4688-8498-534ce53a088b.pdf,2024,
1672,8d7e0aaa-fcf9-417d-a354-b87d2024a27d.docx,2024,


In [15]:
# Change the file names in the df_none to .docx
df_none.loc[df_none['text'].isnull(), 'filename'] = (
    df_none.loc[df_none['text'].isnull(), 'filename']
      .str.replace('.pdf', '.docx', regex=False))


# list the file names
files_to_change = df_none['filename'].tolist()

# build file paths (old and new file names) and rename the docx files
for file_name in files_to_change:

    base_name = os.path.splitext(file_name)[0]
    old_file_name = base_name + '.pdf'
    
    new_path = os.path.join(folder_path, file_name)
    old_path = os.path.join(folder_path, old_file_name)
    os.rename(old_path, new_path)

df_none

FileNotFoundError: [Errno 2] No such file or directory: 'tweede_pdfs2024/af95ec7a-a922-46b3-81b4-4b568af2bd96.pdf' -> 'tweede_pdfs2024/af95ec7a-a922-46b3-81b4-4b568af2bd96.docx'

In [14]:
# Create the list of docx file names and paths
docx_names = df_none["filename"].tolist()

# get the info on the docx, incl. first ten lines of the docx files, and
# add them back to the speech df as new rows
for name in docx_names:
    temp_path = os.path.join(folder_path, name)

    extracted_text = extract_text_from_docx(temp_path)
    text_ten = get_first_ten_lines (extracted_text)

    data = pd.Series({
        "filename": name,
        "year": file_year,
        "text": text_ten})
    
    new_row = data.to_frame().T
    speech_info = pd.concat([speech_info, new_row], ignore_index=True)

print("nulls in speech df w docx info: ", speech_info['text'].isnull().sum())
speech_info.tail(10)

An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined
An error occurred: name 'Document' is not defined


Unnamed: 0,filename,year,text
1665,08255f2e-3eae-43d6-aa28-4da5e5d6d193.docx,2024,
1666,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,
1667,f4c216cf-e27d-4859-a0ba-8379733f6c01.docx,2024,
1668,0e900c2c-f684-4403-82ed-d0df3a65e450.docx,2024,
1669,95d6315e-1aa1-4cb5-a26c-9038c90a1230.docx,2024,
1670,ab8b2bc7-660d-4b0f-a4c6-96bc3eae5403.docx,2024,
1671,a09bfe92-9511-4fab-9e0a-b932bf7a7dd9.docx,2024,
1672,041c51b3-eba0-4688-8498-534ce53a088b.docx,2024,
1673,8d7e0aaa-fcf9-417d-a354-b87d2024a27d.docx,2024,
1674,61cea4e8-dc8d-49ca-8863-d783bb1cd156.docx,2024,


In [19]:
# save the df to csv
speech_info.to_csv(csv_output_name+".csv", index=False)

# OPTION: Reduce dataset to just the specified speech type

In [15]:
# drop rows that do not contain the designated speech type (from top code cell) in the first 10 rows of the document
speech_info = speech_info[speech_info['text'].str.contains("tweeminutendebat", na=False)]
print(speech_info.shape)
speech_info

(247, 3)


Unnamed: 0,filename,year,text
7,00b786eb-afed-48d1-a170-fd385ec7942b.pdf,2024,"6 \nLandbouw , klimaat en voedsel \nLandbouw ..."
9,00e8a26a-ee73-427d-9a0c-09d3f58c8662.pdf,2024,4 \nNationaal Plan Ener giesyst eem \nNationa...
22,04318c33-4aee-4780-965a-703f179cbc09.pdf,2024,6 \nPersonen- en familier echt \nPersonen- e...
24,04d68bb3-e423-4f57-83ca-4322259c58e8.pdf,2024,27 \nGevangenisw ezen en tbs \nGevangeniswez e...
26,04f304bd-484b-4002-92b5-88a8fb96e09e.pdf,2024,6 \nWoningbouw opgave en koopsect or \nWoningb...
...,...,...,...
1439,f54b797d-ef74-4052-86e6-e5d20b5a2caa.docx,2024,"Stikstof, NPLG en natuur\nStikstof, NPLG en na..."
1443,fb376dc5-23b8-4cfd-b019-7f53bdd06b24.docx,2024,Carbon Capture & Storage (CCS)\nCarbon Capture...
1444,fbbdb407-23c8-4d04-a3b2-b3368b81457c.docx,2024,Exportkredietverzekeringen\nExportkredietverze...
1446,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,Regeling van werkzaamheden (stemmingen)\nRegel...


In [18]:
# save the df to csv
temp_name = csv_output_name + speech_type
speech_info.to_csv(temp_name+".csv", index=False)