<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2003_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m124.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downl

In [4]:
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2, but some are my own code just turned into a function.

In [200]:
AFFILIATION_KEYWORDS = [
    "University", "College", "Department", "Center", "Institute",
    "Laboratory", "School", "Hospital", "UC", "Centre", "Research",
    "Corporation", "Defence", "Université", "Universite", "Universiy"
]

def looks_like_affiliation(s: str) -> bool:
    return any(k.lower() in s.lower() for k in AFFILIATION_KEYWORDS)

In [201]:
def normalize_whitespace(s: str) -> str:
    return " ".join(s.replace("\n", " ").split())

In [202]:
def strip_leading_noise(s: str) -> str:
    """
    Removes leading sentence fragments that are not names.
    Keeps last sentence before names.
    """
    parts = re.split(r"\.\s+", s)
    if len(parts) > 1:
        return parts[-1]
    return s

In [213]:
def split_authors_affiliations(entry: str) -> tuple[str, str]:
    # Normalize
    entry = normalize_whitespace(entry)
    # entry = strip_leading_noise(entry)

    # Split on commas and 'and'
    chunks = re.split(r",|\band\b|&", entry)
    chunks = [c.strip() for c in chunks if c.strip()]

    authors = []
    affiliations = []

    for chunk in chunks:
        if looks_like_affiliation(chunk):
            affiliations.append(chunk)
        else:
            authors.append(chunk)
            # # reject obvious non-names
            # if re.search(r"[A-Z][a-z]", chunk):
            #     authors.append(chunk)

    return (
        ", ".join(authors),
        "; ".join(dict.fromkeys(affiliations)).rstrip('.')  # dedupe, keep order
    )

# Program

Only program pdf and no abstracts. 51 total entries about papers. When there are two "columns", the text still goes from left to right across the entire page and is just visually two columns (there are either new lines or multiple spaces to separate the two talk entries in each "column").



## Grab text from the pdf

In [14]:
year = '2003'
program = pymupdf.open(pdfs_path + f'smp{year}_program.pdf')

In [226]:
# Extracts text as one whole string
program_text = pymupdf4llm.to_text(program)
program_text[:1500]

'______________________________________________________________________________ \n\nThursday, July24 \n\n______________________________________________________________________________ \n\n7:00 pm : Welcome and Registration, University Village \n\n______________________________________________________________________________ \n\nFriday Morning, July 25 \n\n______________________________________________________________________________ \n\n7:30 Continental breakfast, Ballroom A, Student Union \n\n8:00+ Registration: Student Union\n\n______________________________________________________________________________ \n\nSession 1, Room 325: Sensation, Perception, and Psychophysics I Steve Link, Chair \n\nSession2, Ballroom B: Methodology and Statistics I \n\nMichel Reggenwetter, Chair \n\n8:30 The analytical form of the Daylight Locus. Geoffrey Iverson and Charlie Chubb, University of California at Irvine \n\nModel complexity and mimicry: A case study of Connectionist models of speech perceptio

## Clean up entries

In [87]:
# Split by whether it is visually one or "two" columns in the pdf
section_split = re.split(r"_+", program_text)
session_2col_split = [sect.strip() for sect in section_split if re.search(r"Session\s*[0-9],", sect)]
session_1col_split = [sect.strip() for sect in section_split if not re.search(r"Session\s*[0-9],", sect) and '.' in sect]

In [227]:
session_2col_split[0]

'Session 1, Room 325: Sensation, Perception, and Psychophysics I Steve Link, Chair \n\nSession2, Ballroom B: Methodology and Statistics I \n\nMichel Reggenwetter, Chair \n\n8:30 The analytical form of the Daylight Locus. Geoffrey Iverson and Charlie Chubb, University of California at Irvine \n\nModel complexity and mimicry: A case study of Connectionist models of speech perception. In Jae Myung, Woojae Kim, and Mark Pitt, Ohio State University \n\n9:00 The Café Illusion. Thaddeus Cowan, Kansas State University and Weber State University and Ben Smith , Weber State University and Mark Pitt, Ohio State University \n\nEfficient computation of Fisher Information in MDL-based model selection \n\nYong Su, In Jae Myung \n\n9:30 A theory of opponent processes. Steve Link, University of California and McMasters University \n\nRobust asymptotic statistical theory for knowledge digraph contribution analysis \n\nRichard M. Golden, University \n\nof Texas at Dallas'

In [230]:
session_1col_split[:2]

['10:30 Young Investigator Award Presentation and Talk, Ballroom B \n\nBaysian inference for testing axioms of measurement and decision. \n\nGeorge Karabatsos, University of Illinois at Chicago, YIA winner for 2002',
 '11:30-12:00+ Poster Session, (Hallway)\n\n   1.  Distinctive sequential statistical regularities in story summary and story recall data. Cynthia Jaynes and Richard Golden, University of Texas at Dallas.\n\n   2. Cortical oscillations during memory encoding predict subsequent memory. Per Sederberg, Brandeis University, Joseph Madsen, Children’s Hospital, Boston , and Michael Kahana , Brandeis University\n\n   3.  An exemplar-based random walk model of old recognition. Roger Stanton, Robert Nosofsky and Andrew Cohen, Indiana University\n\n   4. An exemplar similarity model of short-term perceptual recognition. Justin Kanter and Robert Nosofsky, Indiana Universiy']

### Split by talk entries

In [231]:
entry_strings = []

for entry in session_2col_split:
  # Splits by time with any whitespace after it
  time_entry = re.split(r"\d{1,2}:\d{2}\s*", entry)
  for i, e in enumerate(time_entry):
    if "Chair" in e:
      continue
    else:
      lines = re.split(r"\n\n", e, 1)

      for line in lines:
        if line.strip():
          entry_strings.append(line.strip())

for entry in session_1col_split:
  # Split by numbered list for list of posters
  if re.search(r"[1-4]\.", entry):
    poster_entry = re.split(r"[1-4]\.\s*", entry)
    if poster_entry:
      entry_strings.extend([post.strip() for post in poster_entry])
  # Splits by times
  elif re.search(r"\d{1,2}:\d{2}\s*", entry):
    time_entry = re.split(r"\d{1,2}:\d{2}\s*", entry)
    if time_entry:
      entry_strings.extend([time.strip() for time in time_entry])
  else:
    if entry.strip():
      entry_strings.append(entry.strip())

In [232]:
entry_strings[:3]

['The analytical form of the Daylight Locus. Geoffrey Iverson and Charlie Chubb, University of California at Irvine',
 'Model complexity and mimicry: A case study of Connectionist models of speech perception. In Jae Myung, Woojae Kim, and Mark Pitt, Ohio State University',
 'The Café Illusion. Thaddeus Cowan, Kansas State University and Weber State University and Ben Smith , Weber State University and Mark Pitt, Ohio State University']

### Split titles from authors and affiliations

In [193]:
titles = []
authors_and_affiliations = []
for string in sorted_session_strings:
  if re.search(r"[a-z]{2,}\.", string):
    split = string.split(".", 1)
  else:
    split = string.split("\n\n)", 1)
  if len(split) == 1:
    continue
  title, auth_aff = split
  titles.append(title.strip())
  authors_and_affiliations.append(auth_aff.strip())

In [234]:
titles[:7]

['The analytical form of the Daylight Locus',
 'Model complexity and mimicry: A case study of Connectionist models of speech perception',
 'The Café Illusion',
 'A theory of opponent processes',
 'Reasoning with rules that have rare The manifold of probability density exceptions: A Bayesian second order functions',
 'Thresholding nonparametric regression',
 'Techniques for hazard function analysis']

In [235]:
authors_and_affiliations[:7]

['Geoffrey Iverson and Charlie Chubb, University of California at Irvine',
 'In Jae Myung, Woojae Kim, and Mark Pitt, Ohio State University',
 'Thaddeus Cowan, Kansas State University and Weber State University and Ben Smith , Weber State University and Mark Pitt, Ohio State University',
 'Steve Link, University of California and McMasters University',
 'Jun Zhang, University of probability approach. Donald Bamber Michigan. and I. R. Goodman, Space and Naval Warfare Systems Center, and Hung T. Nguyen, New Mexico State University.',
 'Pyramid model of human problem Zhang Wen, National University of solving. Zheng Li and Zygmunt Pizlo,',
 'Richard Chechile, Tufts University.']

### Split authors from affiliations and create dictionary with year, title, author, affiliation

In [218]:
parsed_entries = []

for a, auth_and_aff in enumerate(authors_and_affiliations):
  auth, aff = split_authors_affiliations(auth_and_aff)
  parsed_entries.append({
        "year": year,
        "author(s)": auth,
        "affiliation(s)": aff,
        "title": titles[a]
  })

In [237]:
parsed_entries[:3]

[{'year': '2003',
  'author(s)': 'Geoffrey Iverson, Charlie Chubb',
  'affiliation(s)': 'University of California at Irvine',
  'title': 'The analytical form of the Daylight Locus'},
 {'year': '2003',
  'author(s)': 'In Jae Myung, Woojae Kim, Mark Pitt',
  'affiliation(s)': 'Ohio State University',
  'title': 'Model complexity and mimicry: A case study of Connectionist models of speech perception'},
 {'year': '2003',
  'author(s)': 'Thaddeus Cowan, Ben Smith, Mark Pitt',
  'affiliation(s)': 'Kansas State University; Weber State University; Ohio State University',
  'title': 'The Café Illusion'}]

# Create df and convert to csv

In [220]:
# Skip first entry because extraneous text
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title"])

In [221]:
df.head(5)

Unnamed: 0,year,author(s),affiliation(s),title
0,2003,"Geoffrey Iverson, Charlie Chubb",University of California at Irvine,The analytical form of the Daylight Locus
1,2003,"In Jae Myung, Woojae Kim, Mark Pitt",Ohio State University,Model complexity and mimicry: A case study of ...
2,2003,"Thaddeus Cowan, Ben Smith, Mark Pitt",Kansas State University; Weber State Universit...,The Café Illusion
3,2003,Steve Link,University of California; McMasters University,A theory of opponent processes
4,2003,"Jun Zhang, I. R. Goodman, Space, Hung T. Nguyen",University of probability approach. Donald Bam...,Reasoning with rules that have rare The manifo...


In [223]:
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)