<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2013_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [2]:
# Mount Google Drive at /content/drive; the PDF folder and CSV output paths
# used in this notebook both live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install wordfreq
# !pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
import wordfreq
# from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downlo

In [4]:
# Folder on the mounted Drive holding the program PDFs. The trailing slash
# matters: the file name is appended to this string by plain concatenation.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
These helpers were written with help from GPT 5.2; some of them are my own code that I refactored into functions.

In [67]:
# Words that mark a text chunk as an institution rather than a person's name.
# The list includes a few misspelled variants of "University".
_AFFILIATION_WORDS = (
    "University", "College", "Department", "Center", "Institute",
    "Laboratory", "School", "Hospital", "UC", "Centre", "Research",
    "Corporation", "Defence", "Université", "Universite", "Universiy",
    "Universidad", "Univeristy",
)

# Case-insensitive alternation over the words above, matched as whole words.
AFFILIATION_KEYWORDS = re.compile(
    r'\b(' + '|'.join(_AFFILIATION_WORDS) + r')\b',
    re.I,
)


def looks_like_affiliation(chunk):
    """Return True if ``chunk`` contains any affiliation keyword as a whole word.

    Matching ignores case, so "uc irvine" matches the "UC" keyword, while
    "Lucy" or "Researcher" do not (the keyword must stand alone as a word).
    """
    return AFFILIATION_KEYWORDS.search(chunk) is not None

In [68]:
def normalize_whitespace(s: str) -> str:
    """Return ``s`` with newlines deleted and all other whitespace runs collapsed.

    Newline characters are removed outright (not turned into spaces), so the
    text on either side of a newline is joined directly. Every remaining run
    of whitespace becomes a single space and the ends are trimmed.
    """
    without_newlines = s.replace("\n", "")
    words = without_newlines.split()
    return " ".join(words)

In [69]:
def normalize_affiliations(entry: str) -> str:
    """Rewrite "University of California <campus>" as "University of California, <campus>".

    Whatever sits between the university name and the campus (optional
    whitespace, an optional comma, optional whitespace) is replaced by exactly
    ", ". Matching is case-sensitive and limited to the campuses listed in the
    pattern; all other text is returned unchanged.
    """
    uc_campus = re.compile(
        r'(University of California)\s*,?\s*'
        r'(Irvine|Davis|Berkeley|Los Angeles|San Diego|Santa Barbara|Santa Cruz|Riverside|Merced)'
    )
    return uc_campus.sub(lambda m: f"{m.group(1)}, {m.group(2)}", entry)

In [70]:
# Place names to strip when they trail an affiliation after a comma.
LOCATIONS = [
    'United States of America', 'Switzerland', 'Japan',
    'Germany', 'Berlin', 'Belgium', 'Italy', 'Israel',
    'Australia', 'The Netherlands', 'USA', 'Netherlands',
    'United Kingdom', 'Singapore', 'France',
    'Taiwan, Republic of China', 'Austria', 'Canada'
]

# Built once at definition time: a comma, optional whitespace, then one of the
# (regex-escaped) place names ending at a word boundary; case-insensitive.
_location_alternatives = '|'.join(re.escape(place) for place in LOCATIONS)
LOCATION_RE = re.compile(r',\s*(?:' + _location_alternatives + r')\b', re.I)


def remove_locations(entry: str) -> str:
    """Delete every ", <location>" occurrence from ``entry`` and trim the result.

    Only place names preceded by a comma are removed; a name that appears
    without a leading comma is left in place.
    """
    without_places = LOCATION_RE.sub('', entry)
    return without_places.strip()

In [71]:
def split_authors_affiliations(entry: str) -> tuple[str, str]:
    """Separate a mixed "authors , affiliations" string into two strings.

    The entry is first passed through ``normalize_whitespace`` and
    ``normalize_affiliations``. It is then split on commas that have
    whitespace on both sides and are followed by an uppercase letter. Each
    piece is classified as follows:

    * a single capitalised word that comes after at least one affiliation is
      appended to the most recent affiliation (joined with ", ");
    * a piece for which ``looks_like_affiliation`` is true is an affiliation;
    * anything else is an author.

    Returns:
        ``(authors, affiliations)`` where authors are joined with ", ".
        Affiliations collapse to a single value when they are all identical;
        otherwise they are joined with "; " (an empty string if there are none).
    """
    entry = normalize_affiliations(normalize_whitespace(entry))

    authors, affiliations = [], []

    # Commas without surrounding whitespace (e.g. inside an affiliation name)
    # are deliberately not split points.
    for raw_token in re.split(r'\s+,\s+(?=[A-Z])', entry):
        token = raw_token.strip()
        is_single_word = len(token.split()) == 1

        if is_single_word and affiliations and token[0].isupper():
            # Lone capitalised word after an affiliation: treat it as part of it.
            affiliations[-1] = f"{affiliations[-1]}, {token}"
        elif looks_like_affiliation(token):
            affiliations.append(token)
        else:
            authors.append(token)

    if len(set(affiliations)) == 1:
        affiliation_text = affiliations[0]
    else:
        affiliation_text = "; ".join(affiliations)

    return ", ".join(authors), affiliation_text

In [72]:
def remove_trailing_text(text):
  """Return ``text`` cut off just before its last period.

  Everything from the final "." onward (the period itself and whatever
  follows it) is discarded. If ``text`` contains no period, the result is an
  empty string.

  The function now operates on its ``text`` argument; it previously read a
  global named ``entry`` and ignored the value passed in.
  """
  before_last_period, _period, _trailing = text.rpartition(".")
  return before_last_period

In [73]:
def clean_text(text, words_to_hyphenate=None):
    """Flatten one extracted entry into a single cleaned-up line of text.

    Args:
        text: Raw entry text. Falsy values (``None``, ``""``) are returned as is.
        words_to_hyphenate: Optional iterable of ``(word, hyphenated_word)``
            pairs; every occurrence of ``word`` is replaced by
            ``hyphenated_word`` before any other cleaning (plain substring
            replacement).

    Returns:
        The cleaned text, stripped of leading/trailing whitespace and passed
        through ``fix_ligatures``.
    """
    if not text:
        return text

    if words_to_hyphenate:
        for word, hyphenated_word in words_to_hyphenate:
            text = text.replace(word, hyphenated_word)

    # Remove page breaks with page number (a 1-3 digit number on its own line)
    text = re.sub(r' \n\n\d{1,3} \n\n', ' ', text)
    # Replace newlines (and the whitespace around them) with a single space
    text = re.sub(r'\s*\n\s*', ' ', text)

    # Get rid of a hyphen and the whitespace after it, re-joining the two word
    # halves, unless the word after is "and" or "or"
    text = re.sub(r'-\s+(?!\b(?:and|or)\b)', '', text)

    # Collapse every run of two OR MORE whitespace characters into one space.
    # (Matching exactly two at a time left a double space behind for runs of
    # three or more.)
    text = re.sub(r'\s{2,}', ' ', text)

    # Gets rid of extraneous text after the last sentence: everything from the
    # first period that is followed by a "##" heading marker to the end.
    text = re.sub(r'\.\s*##.*$', '.', text, flags=re.DOTALL)

    return fix_ligatures(text.strip())

In [74]:
# Explicit replacements, applied in this order.
LIGATURE_MAP = {
    # single-glyph ligatures -> their component letters
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Æ": "ffi",
    # standalone accent mark next to a letter -> one accented letter
    "¨u": "ü",
    "¨a": "ä",
    "´e": "é",
    "`e": "è",
    "`a": "à",
    "¨o": "ö",
    "˚a": "å",
    "c¸": "ç"
}


def fix_ligatures(text):
    """Replace ligature glyphs and detached accents in ``text`` with plain letters.

    Two passes are made:

    1. every key of ``LIGATURE_MAP`` is replaced by its value;
    2. any remaining character whose Unicode name contains "LIGATURE" is
       replaced by the part of that name after the word "LIGATURE", with
       spaces removed and lowercased (e.g. "LATIN SMALL LIGATURE OE" -> "oe").

    Characters without a Unicode name are left untouched.
    """
    for glyph, replacement in LIGATURE_MAP.items():
        text = text.replace(glyph, replacement)

    def _expand(ch):
        # Unnamed characters yield "" here and therefore pass through unchanged.
        name = unicodedata.name(ch, "")
        if "LIGATURE" not in name.upper():
            return ch
        letters = name.split("LIGATURE")[-1]
        return letters.replace(" ", "").lower()

    return "".join(_expand(ch) for ch in text)

In [80]:
def is_probably_valid(word, threshold=2.5):
    """Return True if ``word`` is frequent enough in English to count as a real word.

    The decision compares ``wordfreq.zipf_frequency(word, "en")`` with
    ``threshold`` and requires the frequency to be strictly greater. A higher
    threshold therefore accepts fewer words; a lower threshold accepts more.
    """
    zipf_score = wordfreq.zipf_frequency(word, "en")
    return zipf_score > threshold

# Program

177 entries total (2 keynote talks, 27 symposium talks, 106 talks, and 42 posters)

In the Markdown extracted from the PDF, presentation titles are marked in bold (`**…**`) and affiliations in italics (`_…_`).


## Grab text from the pdf

In [14]:
# Conference year as a string; used in the PDF file name here and reused later
# for the 'year' column and the CSV file name.
year = '2013'
# Open this year's program PDF from the Drive folder.
program = pymupdf.open(pdfs_path + f'smp{year}_program.pdf')

In [15]:
# Convert the opened PDF to Markdown. The result is used below as one string
# in which titles appear in bold and affiliations in italics.
program_text = pymupdf4llm.to_markdown(program)

In [19]:
# Peek at a slice of the Markdown around the start of the abstracts section.
program_text[16000:17500]

'ing will be held in seminar room S13 (1st floor, building 6, Campus Griebnitzsee). \n\n6 \n\n## **Abstracts For Keynote Talks** \n\nChair: Hans Colonius \n\n## **Monday, 9:00** \n\n## Helmholtz \n\n**Machine learning methods for system identification in sensory psychology.** Felix A. Wichmann, _University of T¨ubingen, Germany_ . As a prerequisite to quantitative psychophysical models of sensory processing it is necessary to know to what extent decisions in behavioral tasks depend on specific stimulus features, the perceptual cues: Given the highdimensional input, which are the features the sensory systems base their computations on? Over the last years we have developed inverse machine learning methods for (potentially nonlinear) system identification, and have applied them to identify regions of visual saliency (Kienzle et al., 2009), to gender discrimination of human faces (Wichmann et al., 2005; Macke & Wichmann, 2010), and to the identification of auditory tones in noise (Sch¨onf

## Split text into presentation entries

In [20]:
# Keep the text that follows the first 'Chair: Hans Colonius' marker. Note that
# index [1] stops at a second occurrence of the marker if there is one.
all_abstracts = program_text.split('Chair: Hans Colonius')[1] # this is where abstracts start
# Break the text wherever a blank line is followed by bold markup ('**'); the
# '**' itself is consumed by the split.
split_abstracts = re.split(r'\n\n\*\*', all_abstracts)
# Keep fragments longer than 120 characters, then drop the last 18 of them.
# NOTE(review): the reasons for 120 and 18 are not shown here — presumably short
# headings and trailing non-abstract material; confirm against the PDF.
entries = [entry.strip() for entry in split_abstracts if len(entry) > 120][:-18]

In [24]:
# Names used as split markers inside an entry (presumably the session rooms).
ROOM_KEYWORDS = ['Helmholtz', 'Bayes', 'Euler', 'Fechner', 'Lobby']
# Matches one of those names, optional whitespace, then '**' directly followed
# by a word character, i.e. a name immediately before the next bold span.
# Case-insensitive.
room_re = re.compile(
    r'\b\s*(?:' + '|'.join(ROOM_KEYWORDS) + r')\s*\*\*\b',
    re.I
)
abstract_entries = []

for entry in entries:
  # An entry may hold several presentations separated by such a marker.
  split_entry = re.split(room_re, entry)
  abstract_entries.extend([entry.strip() for entry in split_entry if entry.strip()])

# Drop fragments of 50 characters or fewer and put back the leading '**' that
# the splits consumed, so each title is again wrapped in '**...**'.
abstract_entries = ['**' + entry for entry in abstract_entries if len(entry) > 50]

In [25]:
# Show the first two entries to check the split.
abstract_entries[:2]

['**Machine learning methods for system identification in sensory psychology.** Felix A. Wichmann, _University of T¨ubingen, Germany_ . As a prerequisite to quantitative psychophysical models of sensory processing it is necessary to know to what extent decisions in behavioral tasks depend on specific stimulus features, the perceptual cues: Given the highdimensional input, which are the features the sensory systems base their computations on? Over the last years we have developed inverse machine learning methods for (potentially nonlinear) system identification, and have applied them to identify regions of visual saliency (Kienzle et al., 2009), to gender discrimination of human faces (Wichmann et al., 2005; Macke & Wichmann, 2010), and to the identification of auditory tones in noise (Sch¨onfelder & Wichmann, 2012; 2013). In my talk I will concentrate on how stimulus-response data can be analyzed relying on _L_ 1-regularized multiple logistic regression. This method prevents both over-

## Find words to re-hyphenate

In [85]:
# Matches a word broken across a line end: letters, a hyphen directly before
# the newline, optional whitespace, then letters. Captures the two halves.
pattern = re.compile(r'([A-Za-z]+)-\n\s*([A-Za-z]+)')
# Collected [joined_word, hyphenated_word] candidates, in printed order.
possible_hyphenated_words = []

# Row number shown in the listing; it equals the candidate's list index.
counter = 0
for p, page in enumerate(program[10:104]):  # these are the pages with abstracts only
  text = fix_ligatures(page.get_text('text'))
  matches = pattern.findall(text)

  for left, right in matches:
    word = f"{left}{right}"
    hyphenated_word = f"{left}-{right}"
    # If the joined form is not a sufficiently common English word, the hyphen
    # may belong to the word; list it so it can be reviewed by hand.
    if not is_probably_valid(word, threshold=1.6):
      possible_hyphenated_words.append([word, hyphenated_word])

      # NOTE(review): p+7 presumably converts the loop index into the printed
      # page number of the program — confirm the offset.
      print(f"{counter:>3}: Page {p+7:<3} {hyphenated_word:<30} {word}")
      counter += 1

  0: Page 7   high-dimensional               highdimensional
  1: Page 7   non-compensatory               noncompensatory
  2: Page 9   low-frequency                  lowfrequency
  3: Page 9   fix-ational                    fixational
  4: Page 9   Bet-tenb                       Bettenb
  5: Page 9   Holschnei-der                  Holschneider
  6: Page 10  Philipps-University            PhilippsUniversity
  7: Page 11  longest-fixation               longestfixation
  8: Page 11  Truken-brod                    Trukenbrod
  9: Page 11  Uni-versit                     Universit
 10: Page 12  first-order                    firstorder
 11: Page 13  interval-ordered               intervalordered
 12: Page 13  interval-order                 intervalorder
 13: Page 15  speed-accuracy                 speedaccuracy
 14: Page 15  speed-accuracy                 speedaccuracy
 15: Page 15  Oll-man                        Ollman
 16: Page 17  identifi-ability               identifiability
 17: Page 

In [86]:
# Pick indices of words to rehyphenate
indices = [(0,2), (6,7), (10,14), (20,23), (25,32), 36, (38,40), (42,45), 47, 50, (55,56), 60, (62,67), (69,72), 74, (76,77), (82,83), (85,87), (89,90), (92,95), (97,103), (106,107), (109,110), (114,123), 127, (130,131), 134]

words_to_hyphenate = [
    possible_hyphenated_words[i]
    for item in indices
    for i in ([item] if isinstance(item, int) else range(*item))
]

In [84]:
# for word, hyphenated_word in words_to_hyphenate:
#   print(f'{word:<30} {hyphenated_word}')

## Sort authors, affiliations, title, and abstract

In [87]:
# One dict per presentation, in program order.
parsed_entries = []

for entry in abstract_entries:
  cleaned_entry = clean_text(entry, words_to_hyphenate)

  # The header ("**title** authors, _affiliation_") is separated from the
  # abstract by the first ' . '. Without that separator the entry cannot be
  # split, so the whole text is stored under 'title' with every other field
  # blank (presumably so the row can be spotted and fixed by hand).
  if ' . ' not in cleaned_entry:
    parsed_entries.append({
      'year': '',
      'author(s)': '',
      'affiliation(s)': '',
      'title': cleaned_entry,
      'type': '',
      'abstract': ''
    })
    continue

  info_text, abstract = cleaned_entry.split(' . ', 1)

  # Extracts title: every bold (**...**) span in the header, joined by spaces.
  # With no bold span the title is '' rather than None, so the .strip('.')
  # below cannot raise AttributeError.
  title_parts = re.findall(r'\*\*(.*?)\*\*', info_text)
  title = ' '.join(part.strip() for part in title_parts)

  # Extracts all affiliations in entry: every italic (_..._) span in the header.
  affiliation_parts = re.findall(r'_(.*?)_', info_text)

  # Removes title and affiliation markup from info_text to get authors
  authors_text = info_text

  for t in title_parts:
    authors_text = authors_text.replace(f'**{t}**', '')

  for a in affiliation_parts:
    authors_text = authors_text.replace(f'_{a}_', '')

  # Cleans up punctuation & whitespace: split on commas, drop empty pieces
  authors = authors_text.strip().split(',')
  list_authors = [a.strip() for a in authors if a.strip()]
  cleaned_authors = ', '.join(list_authors)

  # Trim each affiliation before comparing, so copies that differ only in
  # surrounding whitespace count as the same. If all are identical keep a
  # single copy; otherwise list them separated by '; '.
  affiliation_names = [a.strip() for a in affiliation_parts]
  if len(set(affiliation_names)) == 1:
    affiliations = affiliation_names[0]
  else:
    affiliations = '; '.join(affiliation_names)

  affiliations = remove_locations(affiliations)

  parsed_entries.append({
    'year': year,
    'author(s)': cleaned_authors,
    'affiliation(s)': affiliations,
    'title': title.strip('.'),
    'type': '',
    'abstract': abstract
  })

In [88]:
# Show the first two parsed records to check the field extraction.
parsed_entries[:2]

[{'year': '2013',
  'author(s)': 'Felix A. Wichmann',
  'affiliation(s)': 'University of Tübingen',
  'title': 'Machine learning methods for system identification in sensory psychology',
  'type': '',
  'abstract': 'As a prerequisite to quantitative psychophysical models of sensory processing it is necessary to know to what extent decisions in behavioral tasks depend on specific stimulus features, the perceptual cues: Given the high-dimensional input, which are the features the sensory systems base their computations on? Over the last years we have developed inverse machine learning methods for (potentially nonlinear) system identification, and have applied them to identify regions of visual saliency (Kienzle et al., 2009), to gender discrimination of human faces (Wichmann et al., 2005; Macke & Wichmann, 2010), and to the identification of auditory tones in noise (Schönfelder & Wichmann, 2012; 2013). In my talk I will concentrate on how stimulus-response data can be analyzed relying on

# Create df and convert to csv

In [89]:
# Build the table from the parsed records; `columns` fixes the column order.
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [90]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2013,Felix A. Wichmann,University of Tübingen,Machine learning methods for system identifica...,,As a prerequisite to quantitative psychophysic...
1,2013,Clintin P. Davis-Stober,University of Missouri,A new perspective on non-compensatory decision...,,Lexicographic semiorders are mathematical stru...
2,2013,"Michele Rucci, Jonathan D. Victor, Xutao Kuang",Boston University; Weill Cornell Medical Colle...,Effects of microscopic eye movements on contra...,,The response characteristics of neurons in the...
3,2013,"Natallia Makarava, Mario Bettenbühl, Ralf Engb...",University of Potsdam,Bayesian estimation of the scaling parameter o...,,In this study we re-evaluate the estimation of...
4,2013,"Wolfgang Einhauser¨, Bernard Marius ’t Hart",Philipps-University Marburg; Center for Interd...,Gaze in real-world scenarios: interaction of t...,,Under natural conditions gaze is a good proxy ...


In [91]:
# Write the table to the Drive csv folder, named by year; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)