<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2014_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
# Mount Google Drive at /content/drive; the input and output paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install wordfreq
# !pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
import wordfreq
# from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m123.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downl

In [3]:
# Drive folder holding the conference program PDFs. Keep the trailing '/':
# file names are appended to this string by plain concatenation.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2; some of these are my own code that I simply wrapped in a function.

In [4]:
# Substrings (case-sensitive) that mark an italic span as an affiliation.
# Several entries are misspellings or short forms kept on purpose so that the
# corresponding program entries are still recognised.
AFFILIATION_KEYWORDS = [
    "University",
    "College",
    "Department",
    "Center",
    "Institute",
    "Laboratory",
    "School",
    "Hospital",
    "UC",
    "Centre",
    "Research",
    "Corporation",
    "Defence",
    "Université",
    "Universite",
    "Universiy",
    "Universidad",
    "Univeristy",
    "KU",
    "Irvine",
    "Canada",
    "Universität",
]

# An underscore-delimited (Markdown italic) span whose closing underscore is
# followed by a whitespace character that is not itself followed by a comma.
# DOTALL lets the span run across line breaks.
AFF_SPLIT_RE = re.compile(r'_(?P<aff>.*?)_(?=\s(?!,))', re.DOTALL)


def split_after_last_affiliation(text: str):
    """Split an entry right after its last affiliation span.

    Every match of AFF_SPLIT_RE is inspected; a match counts as an affiliation
    when its inner text contains any of AFFILIATION_KEYWORDS.

    Returns:
        (info, abstract): the text up to and including the closing underscore
        of the last affiliation match, and everything after it, both stripped.
        When no match contains a keyword, returns (None, text.strip()).
    """
    affiliation_hits = [
        hit
        for hit in AFF_SPLIT_RE.finditer(text)
        if any(keyword in hit.group("aff") for keyword in AFFILIATION_KEYWORDS)
    ]

    if not affiliation_hits:
        return None, text.strip()

    cut = affiliation_hits[-1].end()
    return text[:cut].strip(), text[cut:].strip()

In [5]:
def normalize_whitespace(s: str) -> str:
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is removed. Newlines are treated like any
    other whitespace, so words on adjacent lines stay separate words
    ("Horace\\nBarlow" -> "Horace Barlow") instead of being fused together.
    """
    # str.split() with no argument already splits on newlines, so no
    # pre-processing of "\n" is needed.
    return " ".join(s.split())

In [6]:
def normalize_affiliations(entry: str) -> str:
    """Rewrite "University of California <campus>" as "University of California, <campus>".

    Any mix of whitespace and at most one comma between the university name and
    one of the listed campus names is replaced by exactly ", ". Text that does
    not match is returned unchanged.
    """
    uc_campus = re.compile(
        r'(University of California)\s*,?\s*'
        r'(Irvine|Davis|Berkeley|Los Angeles|San Diego|Santa Barbara|Santa Cruz|Riverside|Merced)'
    )
    return uc_campus.sub(r'\1, \2', entry)

In [7]:
# Place names that may trail an affiliation after a comma. Multi-part names are
# listed before their shorter parts (e.g. 'Berlin, Germany' before 'Germany')
# so the regex alternation tries the longer form first.
LOCATIONS = [
    'United States of America', 'Switzerland', 'Japan', 'Bremen',
    'Berlin, Germany', 'Germany', 'Berlin', 'Norway', 'Turkey',
    'Belgium', 'Italy', 'Israel', 'New Brunswick, New Jersey',
    'Australia', 'The Netherlands', 'USA', 'Netherlands, The', 'Netherlands',
    'United Kingdom', 'Singapore', 'France', 'Dayton OH', 'Dayton',
    'Taiwan, Republic of China', 'Austria', 'Canada',
    'Edmonton', 'Bloomington, Indiana', 'Indiana', 'Russian Federation',
    'University Park, Pennsylvania', 'California', 'San Francisco, California',
    'Taipei, Taiwan', 'Charlottesville, Virginia', 'New York, New York',
    'Toronto, Ontario', 'New Haven, Connecticut', 'Ann Arbor, Michigan', 'Ohio',
    'Ottawa, Ontario', 'Houston, Texas', 'UK', 'New Brunswick, Piscataway, NJ'
]

# compile once
# Matches ", <location>" case-insensitively. The trailing negative lookahead
# refuses a match when the location is directly followed by another word, so a
# place name that starts an institution name ("..., Indiana University",
# "..., Ohio State University") is left alone.
LOCATION_RE = re.compile(
    r',\s*(?:' + '|'.join(map(re.escape, LOCATIONS)) + r')\b(?!\s+\w)',
    re.I
)

def remove_locations(entry: str) -> str:
    """Strip trailing ", <location>" fragments from an affiliation string.

    Args:
        entry: one affiliation, or several joined with "; ".

    Returns:
        The string with every ", <location>" occurrence removed (unless the
        location is immediately followed by another word), stripped of leading
        and trailing whitespace.
    """
    return LOCATION_RE.sub('', entry).strip()

In [8]:
def split_authors_affiliations(entry: str) -> tuple[str, str]:
    """Separate a comma-delimited header into an author string and an affiliation string.

    The entry is first passed through normalize_whitespace and
    normalize_affiliations, then split on commas that have whitespace on both
    sides and are followed by a capital letter.

    Returns:
        (authors, affiliations): authors joined with ", "; affiliations joined
        with "; ", or the single affiliation itself when all collected
        affiliations are identical.
    """
    # NOTE(review): looks_like_affiliation is not defined in the visible part
    # of this notebook — confirm it exists before calling this function.
    normalized = normalize_affiliations(normalize_whitespace(entry))

    author_names = []
    affiliation_names = []

    # split only on commas WITH SPACES
    for raw_piece in re.split(r'\s+,\s+(?=[A-Z])', normalized):
        piece = raw_piece.strip()
        is_single_word = len(piece.split()) == 1

        if is_single_word and affiliation_names and piece[0].isupper():
            # A lone capitalised word after an affiliation is treated as its
            # continuation and glued back on.
            affiliation_names[-1] = affiliation_names[-1] + ", " + piece
        elif looks_like_affiliation(piece):
            affiliation_names.append(piece)
        else:
            author_names.append(piece)

    if len(set(affiliation_names)) == 1:
        affiliation_field = affiliation_names[0]
    else:
        affiliation_field = "; ".join(affiliation_names)

    return ", ".join(author_names), affiliation_field

In [9]:
def remove_trailing_text(text):
  """Drop everything from the last period onward.

  Returns the part of `text` before its final '.', so both the trailing
  fragment and that final period are removed. Returns '' when `text` contains
  no period at all.
  """
  # The body previously read an undefined name (`entry`) instead of the
  # `text` parameter, which raised NameError on every call.
  return text.rpartition(".")[0]

In [10]:
def clean_text(text, words_to_hyphenate=None):
    """Flatten one Markdown entry into a single cleaned-up line of text.

    Args:
        text: raw entry text; falsy values are returned unchanged.
        words_to_hyphenate: optional iterable of (word, hyphenated_word)
            pairs; every occurrence of `word` is replaced by
            `hyphenated_word` before any other clean-up.

    Returns:
        The text after the regex clean-up steps below, stripped, and passed
        through fix_ligatures.
    """
    if not text:
        return text

    for joined, hyphenated in (words_to_hyphenate or ()):
        text = text.replace(joined, hyphenated)

    # Applied strictly in this order: (pattern, replacement, flags)
    regex_steps = (
        # Remove page breaks with page number
        (r' \n\n\d{1,3} \n\n', ' ', 0),
        # Replace newlines (and whitespace around them) with a single space
        (r'\s*\n\s*', ' ', 0),
        # Get rid of "-" plus the whitespace after it, unless the next word
        # is "and" or "or"
        (r'-\s+(?!\b(?:and|or)\b)', '', 0),
        # Collapse each pair of adjacent whitespace characters into one space
        (r'\s{2}', ' ', 0),
        # Cut everything from a period followed by "##" to the end of the
        # text, keeping the period
        (r'\.\s*##.*$', '.', re.DOTALL),
    )
    for pattern, replacement, flags in regex_steps:
        text = re.sub(pattern, replacement, text, flags=flags)

    return fix_ligatures(text.strip())

In [11]:
# Literal replacements applied, in this order, by fix_ligatures.
LIGATURE_MAP = {
    # Typographic ligatures
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    # NOTE(review): presumably a font-specific extraction artifact — confirm
    "Æ": "ffi",
    # Detached accent + base letter -> single accented letter
    "¨u": "ü",
    "¨a": "ä",
    "´e": "é",
    "`e": "è",
    "`a": "à",
    "¨o": "ö",
    "˚a": "å",
    "c¸": "ç",
    # Curly quotes -> straight quotes
    '“': '"',
    '”': '"',
    "’": "'",
    # More detached accents
    '˜n': 'ñ',
    'ˇs': 'š'
}


def fix_ligatures(text):
    """Replace ligatures and detached-accent sequences with plain characters.

    First every key of LIGATURE_MAP is replaced by its value. Then any
    remaining character whose Unicode name contains "LIGATURE" is replaced by
    the part of that name after the last "LIGATURE", with spaces removed and
    lowercased (e.g. "LATIN SMALL LIGATURE OE" -> "oe").
    """
    for broken, fixed in LIGATURE_MAP.items():
        text = text.replace(broken, fixed)

    def expand(ch):
        # Characters without a Unicode name yield "" and pass through untouched.
        uni_name = unicodedata.name(ch, "")
        if "LIGATURE" not in uni_name.upper():
            return ch
        return uni_name.split("LIGATURE")[-1].replace(" ", "").lower()

    return "".join(expand(ch) for ch in text)

In [12]:
def is_probably_valid(word, threshold=2.5):
    """Decide whether `word` is a common enough English word, by Zipf frequency.

    Returns True when wordfreq's English Zipf frequency for `word` is strictly
    greater than `threshold`. A higher threshold therefore rejects more words;
    a lower threshold accepts more.
    """
    zipf_score = wordfreq.zipf_frequency(word, "en")
    return zipf_score > threshold

# Program

127 entries total (3 keynote talks, 2 plenary talks, 20 symposium talks, 70 talks, and 32 posters)

The Markdown conversion marks bold text as `**...**` and italic text as `_..._`; the parsing below relies on these markers.


## Grab text from the pdf

In [13]:
# Conference year: selects which PDF is opened here and is reused further
# down for the 'year' column and the output CSV file name.
year = '2014'
program = pymupdf.open(pdfs_path + f'smp{year}_program.pdf')

In [14]:
# Convert the opened PDF to Markdown text with pymupdf4llm.
program_text = pymupdf4llm.to_markdown(program)

In [17]:
# Peek at a 1,500-character window of the Markdown to see how an entry is laid out.
program_text[13_500:15_000]

's** \n\n**Saturday, 9:00** Palais Chair: Zigmunt Pizlo \n\n**Symmetry and the computational goals that underlie perception** Horace Barlow, _University of Cambridge, United Kingdom_ Although I am (or was) a neurophysiologist, I do not think records of impulse trains from neurons in perceptual systems can be interpreted properly until we answer the question ”What are the goals of the computations these systems and their neurons are performing?” This is simply because you cannot test whether a system does the job you think it may do unless you have ideas about what that job is. The proposition I like the sound of, and shall argue for here, is that the two main computations in early vision are cross-correlation of patches of the image with fixed templates, and auto-correlations of pairs of image patches related by some specified transformations. One definition of symmetry is ”invariance under transformation”, so is symmetry detection the main computational goal of early vision? This is t

## Split text into presentation entries

In [18]:
# Take the segment right after the first '**Abstracts For Keynote Talks**' heading.
all_abstracts = program_text.split('**Abstracts For Keynote Talks**')[1] # this is where abstracts start
# Cut at every blank line that is followed by bold markup ('**').
split_abstracts = re.split(r'\n\n\*\*', all_abstracts)
# Keep chunks longer than 200 characters, put back the '**' consumed by the
# split, and drop the last 14 of the remaining chunks.
# NOTE(review): the 200-character cutoff and the 14 dropped chunks look
# hand-tuned to this PDF — confirm before reusing for another year.
abstract_entries = ['**' + entry.strip() for entry in split_abstracts if len(entry) > 200][:-14]
# Truncate the final entry at the first '\n\n77' (presumably a page number — confirm).
abstract_entries[-1] = abstract_entries[-1].split('\n\n77', 1)[0].strip()

In [19]:
# Show the first two entries to check the split.
abstract_entries[:2]

['**Symmetry and the computational goals that underlie perception** Horace Barlow, _University of Cambridge, United Kingdom_ Although I am (or was) a neurophysiologist, I do not think records of impulse trains from neurons in perceptual systems can be interpreted properly until we answer the question ”What are the goals of the computations these systems and their neurons are performing?” This is simply because you cannot test whether a system does the job you think it may do unless you have ideas about what that job is. The proposition I like the sound of, and shall argue for here, is that the two main computations in early vision are cross-correlation of patches of the image with fixed templates, and auto-correlations of pairs of image patches related by some specified transformations. One definition of symmetry is ”invariance under transformation”, so is symmetry detection the main computational goal of early vision? This is the first point to be discussed, and I think it turns out t

## Find words to re-hyphenate

In [22]:
# A word fragment, a hyphen at the end of a line, then the fragment that
# starts the next line.
pattern = re.compile(r'([A-Za-z]+)-\n\s*([A-Za-z]+)')
possible_hyphenated_words = []

counter = 0
# these are the pages with abstracts only; the printed label starts at 7 for
# the first page of the slice (presumably the page number printed in the
# program — confirm)
for page_label, page in enumerate(program[10:104], start=7):
  page_text = fix_ligatures(page.get_text('text'))

  for left, right in pattern.findall(page_text):
    joined = left + right
    hyphenated = left + "-" + right

    # Fragments that already join into a known word were just wrapped
    # across lines, so they are not candidates.
    if is_probably_valid(joined, threshold=1.6):
      continue

    possible_hyphenated_words.append([joined, hyphenated])
    print(f"{counter:>3}: Page {page_label:<3} {hyphenated:<30} {joined}")
    counter += 1

  0: Page 8   neuro-physiologically          neurophysiologically
  1: Page 9   hypochon-driacal               hypochondriacal
  2: Page 14  human-made                     humanmade
  3: Page 14  Smith-Kettlewell               SmithKettlewell
  4: Page 15  group-theoretic                grouptheoretic
  5: Page 15  one-dimensional                onedimensional
  6: Page 15  out-there                      outthere
  7: Page 16  multi-class                    multiclass
  8: Page 16  skeleton-based                 skeletonbased
  9: Page 19  Dzhafarov-Kujala               DzhafarovKujala
 10: Page 19  Fischer-Hilbert                FischerHilbert
 11: Page 19  Abramsky-Brandenburger         AbramskyBrandenburger
 12: Page 19  by-default                     bydefault
 13: Page 19  con-textuality                 contextuality
 14: Page 20  Neuro-Engineering              NeuroEngineering
 15: Page 20  post-iconic                    posticonic
 16: Page 21  Breit-meyer                    Bre

In [23]:
# Pick indices of words to rehyphenate
# Each item is either one index into possible_hyphenated_words or a
# (start, stop) pair that is expanded with range(start, stop), so `stop`
# itself is excluded.
# NOTE(review): these indices appear to be hand-picked from the candidate list
# printed by the previous cell — re-check them whenever that list changes.
indices = [(2, 12), 14, 15, 17, 18, 21, 22, (24, 26), 29, 30, 32, 35, 37, 38, 40, (47, 49), (52, 54), 56, (59, 61), (66, 68), 70, 71, 74, (78, 81), 83, (85, 90), 92, (94, 98)]
words_to_hyphenate = [
    possible_hyphenated_words[i]
    for item in indices
    for i in ([item] if isinstance(item, int) else range(*item))
]

In [24]:
# List the selected pairs (joined form, then hyphenated form) for a visual check.
for word, hyphenated_word in words_to_hyphenate:
  print(f'{word:<30} {hyphenated_word}')

humanmade                      human-made
SmithKettlewell                Smith-Kettlewell
grouptheoretic                 group-theoretic
onedimensional                 one-dimensional
outthere                       out-there
multiclass                     multi-class
skeletonbased                  skeleton-based
DzhafarovKujala                Dzhafarov-Kujala
FischerHilbert                 Fischer-Hilbert
AbramskyBrandenburger          Abramsky-Brandenburger
NeuroEngineering               Neuro-Engineering
posticonic                     post-iconic
wellknown                      well-known
samedifferent                  same-different
fastsame                       fast-same
functionalform                 functional-form
DiscreteState                  Discrete-State
Continuousstrength             Continuous-strength
parietaloccipital              parietal-occipital
nondecision                    non-decision
activelyA                      actively-A
wellknown                      well-

## Sort authors, affiliations, title, and abstract

In [25]:
# One dict per program entry, in the order the entries appear in the program.
parsed_entries = []

for entry in abstract_entries:
  cleaned_entry = clean_text(entry, words_to_hyphenate)

  # Affiliations are italic (_..._) in the Markdown, so only entries that
  # contain an underscore can be split into header info and abstract.
  info_text = None
  if '_' in cleaned_entry:
    info_text, abstract = split_after_last_affiliation(cleaned_entry)

  if info_text is None:
    # Either there is no italic markup at all, or no italic span contained an
    # affiliation keyword (split_after_last_affiliation returned None as the
    # info text). Store the entry unparsed instead of failing on None + '_'.
    parsed_entries.append({
        'year': '',
        'author(s)': '',
        'affiliation(s)': '',
        'title': '',
        'type': '',
        'abstract': cleaned_entry
    })
    continue

  # Appends one extra '_' after the header text; when it is left over in the
  # author text it is filtered out of the author list below.
  info_text = info_text + '_'

  # Extracts title: all bold (**...**) spans joined with spaces
  # ('' when the header has no bold span)
  title_parts = re.findall(r'\*\*(.*?)\*\*', info_text)
  title = ' '.join(a.strip() for a in title_parts)

  # Extracts all affiliations in entry: the italic (_..._) spans
  affiliation_parts = re.findall(r'_(.*?)_', info_text)

  # Removes title and affiliation from info_text to get authors
  authors_text = info_text

  for t in title_parts:
    authors_text = authors_text.replace(f'**{t}**', '')

  for a in affiliation_parts:
    authors_text = authors_text.replace(f'_{a}_', '')

  # Cleans up punctuation & whitespace
  authors = authors_text.strip().split(',')
  list_authors = [a.strip() for a in authors if a.strip() and a.strip() != '_']
  cleaned_authors = ', '.join(list_authors)

  # When every author shares the same affiliation, list it once
  if len(set(affiliation_parts)) == 1:
    affiliations = affiliation_parts[0]
  else:
    affiliations = '; '.join(affiliation_parts)

  affiliations = remove_locations(affiliations)

  parsed_entries.append({
    'year': year,
    'author(s)': cleaned_authors,
    'affiliation(s)': affiliations,
    'title': title.strip('.'),
    'type': '',
    'abstract': abstract.strip()
  })

In [26]:
# Show the first two parsed records to check the field extraction.
parsed_entries[:2]

[{'year': '2014',
  'author(s)': 'Horace Barlow',
  'affiliation(s)': 'University of Cambridge',
  'title': 'Symmetry and the computational goals that underlie perception',
  'type': '',
  'abstract': 'Although I am (or was) a neurophysiologist, I do not think records of impulse trains from neurons in perceptual systems can be interpreted properly until we answer the question "What are the goals of the computations these systems and their neurons are performing?" This is simply because you cannot test whether a system does the job you think it may do unless you have ideas about what that job is. The proposition I like the sound of, and shall argue for here, is that the two main computations in early vision are cross-correlation of patches of the image with fixed templates, and auto-correlations of pairs of image patches related by some specified transformations. One definition of symmetry is "invariance under transformation", so is symmetry detection the main computational goal of earl

# Create df and convert to csv

In [27]:
# Build the table from the parsed records with an explicit column order.
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [28]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2014,Horace Barlow,University of Cambridge,Symmetry and the computational goals that unde...,,"Although I am (or was) a neurophysiologist, I ..."
1,2014,Richard S. Sutton,University Of Alberta,Reinforcement Learning and Psychology: A Perso...,,The modern field of reinforcement learning (RL...
2,2014,Wolf Vanpaemel,KU Leuven,Five routes to better models of cognition,,An important goal in cognitive science is to b...
3,2014,Joachim Vandekerckhove,"University of California, Irvine",A crowd-sourced scheduling system for academic...,,I will present preliminary results of a crowd-...
4,2014,Richard M. Shiffrin,Indiana University,Moving past BMS and MDL: Making model evaluati...,,I present a generalization of Bayesian Model S...


In [29]:
# Write the table to the Drive csv folder, named after `year`, without the
# DataFrame index column.
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)