<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2006_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downl

In [3]:
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2, but some are my own code just turned into a function.

In [5]:
AFFILIATION_KEYWORDS = [
    "University", "College", "Department", "Center", "Institute",
    "Laboratory", "School", "Hospital", "UC", "Centre", "Research",
    "Corporation", "Defence", "Defense", "Université", "Universite",
    "Universiy"
]

AFFILIATION_RE = re.compile(
    r"|".join(map(re.escape, AFFILIATION_KEYWORDS)),
    re.IGNORECASE
)

def looks_like_authors(line: str) -> bool:
    return (
        re.search(r'[A-Z][a-z]+', line) is not None
        and not AFFILIATION_RE.search(line)
    )

In [6]:
def looks_like_affiliation(line: str) -> bool:
    line = " ".join(line.split())
    return bool(AFFILIATION_RE.search(line))

In [13]:
def extract_title(lines):
    title_lines = []
    i = 0

    if not lines[0].startswith("“"):
        return None, 0

    while i < len(lines):
        title_lines.append(lines[i])
        if lines[i].endswith("”"):
            break
        i += 1

    return " ".join(title_lines), i + 1

In [14]:
def parse_entry(entry):
    lines = [l.strip() for l in entry.split("\n") if l.strip()]

    title, i = extract_title(lines)
    authors, i = extract_authors(lines, i)
    affiliations, i = extract_affiliations(lines, i)
    abstract = extract_abstract(lines, i)

    if title:
      no_quote_title = title.strip(r'(?:“”)')
    else:
      no_quote_title = None

    if authors:
      if '&' in authors:
        if ',' in authors:
          no_amp_authors = authors.replace(', &', ',')
        else:
          no_amp_authors = authors.replace(' &', ',')
      else:
        no_amp_authors = authors
    else:
      no_amp_authors = None

    return {
        "title": no_quote_title,
        "author(s)": no_amp_authors,
        "affiliation(s)": affiliations,
        "abstract": abstract
    }

In [15]:
def extract_authors(lines, start):
    authors = []
    i = start

    while i < len(lines) and looks_like_authors(lines[i]):
        authors.append(lines[i])
        i += 1

    return " ".join(authors), i

In [16]:
def extract_affiliations(lines, start):
    affiliations = []
    i = start

    while i < len(lines) and looks_like_affiliation(lines[i]):
        affiliations.append(lines[i])
        i += 1

    return " ".join(affiliations), i

In [17]:
def extract_abstract(lines, start):
    abstract = lines[start:]

    # Stop if session metadata leaks in
    cutoff = []
    for line in abstract:
        if line.startswith(("Session:", "Room:", "Chaired by:")):
            break
        cutoff.append(line)

    return " ".join(cutoff)


In [19]:
def clean_text(text):
    if not text:
        return text

    text = re.sub(r'-\s+', '', text)      # fix broken hyphenation
    text = re.sub(r'\s{2,}', ' ', text)   # collapse spaces
    text = text.strip()

    return text

In [20]:
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Æ": 'ffi'
}

def fix_ligatures(text):
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Replace any private-use ligature (common in PDFs)
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        if "LIGATURE" in name.upper():
            # Try to break it apart: remove spaces and lowercase
            base = name.split("LIGATURE")[-1]
            base = base.replace(" ", "").lower()
            cleaned_chars.append(base)
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

# Program

108 total entries (18 posters, 90 talks). PDF has abstracts and also a table schedule of just titles and authors. Affiliations have ([author initials]) to show which author belongs to that institute.


## Grab text from the pdf

In [21]:
year = '2006'
program = pymupdf.open(pdfs_path + f'smp{year}_program.pdf')
table_pages = program[6:10]
abstract_pages = program[11:]

In [22]:
abstract_text = []
for page in abstract_pages:
  abstract_text.append(page.get_text('text'))

In [23]:
abstract_text[0]

'10\n2\nTALKS\n2\nTalks\n2.1\nSunday, July 30\nSession: Decision Processes\nRoom: York Room\nChaired by: Eric-Jan Wagenmakers\n8:00-8:25\n“Human and Ideal Sequential Decision Making: Local-\nizing the Cognitive Bottleneck”\nBrian Stankiewicz* & Kyler Eastman\nUniversity of Texas at Austin\nMost natural decision involve a sequence of decisions in which\nthe decision maker can continually gather more information be-\nfore ﬁnally declaring.\nThe current studies investigate human\nbehavior in a sequential decision making task involving varia-\ntions of a “seek & destroy” task. In this task, participants are\nattempting to localize and destroy his/her opponent with noisy\nobservations and artillery while maximizing their expected re-\nward. We compared the human performance to that of the ideal\ndecision maker using partially observable Markov decision pro-\ncesses (POMDP). Using the POMDP computed the expected\nreward for ideal performance and computed a ratio between\nthe human’s performa

## Clean up entries

In [24]:
split_abstract_text = [
    part
    for abstract in abstract_text
    for part in re.split(r"\d{1,2}:\d{2}-\d{1,2}:\d{2}\s*", abstract)
]

In [25]:
split_entries = []

for entry in split_abstract_text:
    entry = fix_ligatures(entry.strip())
    if not entry:
        continue

    lines = entry.split('\n')

    # Detect page-break continuation
    if re.match(r'^\d', entry):
        # Too short → junk
        if len(lines) < 3:
            continue

        # Skip cases where the continuation itself is another numbered item
        if re.match(r'^\d', lines[0]) and re.match(r'^\d', lines[-1]):
            continue

        # Extract meaningful continuation text
        continuation = '\n'.join(lines[3:]).strip()
        if not continuation:
            continue

        # Attach to previous entry if possible
        if split_entries:
            split_entries[-1] += '\n' + continuation
        else:
            # Edge case: continuation with no previous entry
            split_entries.append(continuation)

    else:
        # Normal entry → add as-is
        split_entries.append(entry)

In [43]:
split_entries[1:3]

['“Human and Ideal Sequential Decision Making: Local-\nizing the Cognitive Bottleneck”\nBrian Stankiewicz* & Kyler Eastman\nUniversity of Texas at Austin\nMost natural decision involve a sequence of decisions in which\nthe decision maker can continually gather more information be-\nfore finally declaring.\nThe current studies investigate human\nbehavior in a sequential decision making task involving varia-\ntions of a “seek & destroy” task. In this task, participants are\nattempting to localize and destroy his/her opponent with noisy\nobservations and artillery while maximizing their expected re-\nward. We compared the human performance to that of the ideal\ndecision maker using partially observable Markov decision pro-\ncesses (POMDP). Using the POMDP computed the expected\nreward for ideal performance and computed a ratio between\nthe human’s performance and the optimal performance (an ef-\nficiency measure). The goal of these studies was to understand\nthe cognitive limitations prev

In [27]:
parsed_entries = []

for entry in split_entries:
  parsed_entries.append(parse_entry(entry))

In [44]:
parsed_entries[1:3]

[{'title': 'Human and Ideal Sequential Decision Making: Local- izing the Cognitive Bottleneck',
  'author(s)': 'Brian Stankiewicz*, Kyler Eastman',
  'affiliation(s)': 'University of Texas at Austin',
  'abstract': 'Most natural decision involve a sequence of decisions in which the decision maker can continually gather more information be- fore finally declaring. The current studies investigate human behavior in a sequential decision making task involving varia- tions of a “seek & destroy” task. In this task, participants are attempting to localize and destroy his/her opponent with noisy observations and artillery while maximizing their expected re- ward. We compared the human performance to that of the ideal decision maker using partially observable Markov decision pro- cesses (POMDP). Using the POMDP computed the expected reward for ideal performance and computed a ratio between the human’s performance and the optimal performance (an ef- ficiency measure). The goal of these studies w

In [29]:
final_entries = []
for entry in parsed_entries:
  clean_entry = {}
  for k in ["author(s)", "affiliation(s)", "title", "abstract"]:
    clean_entry["year"] = year
    clean_entry[k] = clean_text(entry[k])
  final_entries.append(clean_entry)

In [45]:
final_entries[1:3]

[{'year': '2006',
  'author(s)': 'Brian Stankiewicz*, Kyler Eastman',
  'affiliation(s)': 'University of Texas at Austin',
  'title': 'Human and Ideal Sequential Decision Making: Localizing the Cognitive Bottleneck',
  'abstract': 'Most natural decision involve a sequence of decisions in which the decision maker can continually gather more information before finally declaring. The current studies investigate human behavior in a sequential decision making task involving variations of a “seek & destroy” task. In this task, participants are attempting to localize and destroy his/her opponent with noisy observations and artillery while maximizing their expected reward. We compared the human performance to that of the ideal decision maker using partially observable Markov decision processes (POMDP). Using the POMDP computed the expected reward for ideal performance and computed a ratio between the human’s performance and the optimal performance (an efficiency measure). The goal of these stu

# Create df and convert to csv

In [33]:
# Skip first entry because extraneous text
df = pd.DataFrame(final_entries, columns=["year", "author(s)", "affiliation(s)", "title", "abstract"])

In [46]:
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,abstract
0,2006,,,,"2 Talks 2.1 Sunday, July 30"
1,2006,"Brian Stankiewicz*, Kyler Eastman",University of Texas at Austin,Human and Ideal Sequential Decision Making: Lo...,Most natural decision involve a sequence of de...
2,2006,Richard Shiffrin,Indiana University,Paradoxes real and imagined,I use a variant of the ’Exchange Paradox’ to m...
3,2006,Eric-Jan Wagenmakers,University of Amsterdam,Modeling choice behavior in the Iowa gambling ...,"The purpose of the Iowa gambling task, develop..."
4,2006,Yingrui Yang,Rensselaer Polytechnic Institute,Toward a mental decision logic of the small-gr...,Human decision making is really a two-stage pr...


In [35]:
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)