In [3]:
import os
import csv
from PyPDF2 import PdfReader

# Folder scanned for PDFs and folder that receives the combined CSV.
INPUT_DIR = 'papers_adhd'
OUTPUT_DIR = 'papers_adhd_csvs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

output_csv = os.path.join(OUTPUT_DIR, 'all_papers.csv')

# Write one CSV row per PDF page: (page_text, source_file, page_number).
# newline='' lets the csv module control line endings itself.
with open(output_csv, mode='w', encoding='utf-8', newline='') as csvfile:
    # Plain, standard CSV: a field is quoted only when it contains the
    # delimiter, a quote or a line break, and embedded quotes are doubled
    # (doublequote defaults to True).
    # No escapechar is configured on purpose: with doublequote left on it is
    # never used for quotes, and (depending on the Python version) the writer
    # would double every backslash found in the page text, which a reader
    # using the default dialect does not undo.
    writer = csv.writer(
        csvfile,
        quoting=csv.QUOTE_MINIMAL,
        quotechar='"',
    )
    writer.writerow(['page_text', 'source_file', 'page_number'])

    # os.listdir() returns entries in arbitrary order; sort so the row order
    # of the CSV is reproducible from run to run.
    for fname in sorted(os.listdir(INPUT_DIR)):
        if not fname.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(INPUT_DIR, fname)
        try:
            reader = PdfReader(pdf_path)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f'[!] Could not open "{fname}": {e}')
            continue

        for page_num, page in enumerate(reader.pages, start=1):
            try:
                # extract_text() result may be falsy; normalise that to ''.
                text = page.extract_text() or ''
            except Exception as e:
                # Keep a row for the page (so page numbering stays complete)
                # but also surface the failure instead of hiding it in the data.
                print(f'[!] Could not extract page {page_num} of "{fname}": {e}')
                text = f'[Error on page {page_num}: {e}]'

            # remove any null bytes that can break the CSV writer
            text = text.replace('\x00', '')

            writer.writerow([text, fname, page_num])

        print(f'  → Processed "{fname}" ({len(reader.pages)} pages)')

print(f'All done! Combined CSV written to:\n  {output_csv}')


  → Processed "paper1.pdf" (28 pages)
  → Processed "paper10.pdf" (2 pages)
  → Processed "paper11.pdf" (3 pages)
  → Processed "paper12.pdf" (13 pages)
  → Processed "paper13.pdf" (28 pages)
  → Processed "paper14.pdf" (13 pages)
  → Processed "paper15.pdf" (13 pages)
  → Processed "paper16.pdf" (17 pages)
  → Processed "paper17.pdf" (15 pages)
  → Processed "paper18.pdf" (22 pages)
  → Processed "paper19.pdf" (11 pages)
  → Processed "paper2.pdf" (14 pages)
  → Processed "paper20.pdf" (8 pages)
  → Processed "paper21.pdf" (24 pages)
  → Processed "paper3.pdf" (13 pages)
  → Processed "paper4.pdf" (15 pages)
  → Processed "paper5.pdf" (11 pages)
  → Processed "paper6.pdf" (9 pages)
  → Processed "paper7.pdf" (6 pages)
  → Processed "paper8.pdf" (11 pages)
  → Processed "paper9.pdf" (4 pages)
All done! Combined CSV written to:
  papers_adhd_csvs\all_papers.csv


In [None]:
import pandas as pd

# Load the combined per-page CSV using pandas' default CSV dialect
# (quoted fields, doubled embedded quotes, no escape character).
# NOTE(review): if the file was produced by a writer configured with a
# backslash escapechar, pass escapechar='\\' here too so backslashes in the
# text are not left doubled — confirm against how the CSV was written.
df= pd.read_csv(r"papers_adhd_csvs/all_papers.csv", encoding='utf-8')

# Length (in characters) of the longest page.
# Empty page_text cells are parsed by read_csv as NaN; treat them as '' for
# this computation so the lengths stay integers (df itself is not modified).
max_length = df['page_text'].fillna('').str.len().max()

0    Exploring N-Back Cognitive Training for Childr...
1    prevalence of ADHD, the chances are high that ...
2    these interventions, teachers (behavioral clas...
3    inhibition and working memory-demanding execut...
4    learning or attention disorders. Based on our ...
5    consisted of 80 children ( mean age : 10.14 ye...
6    task presented participants with images at one...
7    training task, which was adaptive, the n-back ...
8    level. For example, if a child successfully co...
9    used to measure participants’ broad math skill...
Name: page_text, dtype: object