<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2002_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pymupdf
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import re
import pandas as pd
import unicodedata
from rapidfuzz import process, fuzz



In [3]:
# Drive folder prefix for the conference PDFs; file names are appended directly, so keep the trailing slash
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
These helper functions were created with help from GPT 5.1; some are my own code that I refactored into functions.

In [4]:
# Substrings that mark a chunk as a title/section line rather than running abstract text
TITLE_SECTION_MARKERS = ['   ', '---']
TALK_TITLE_MARKERS = [r"\s{7,}", r"[1-9]\."]  # at least 7 consecutive whitespace or numbered entry


def group_abstract_entries(text):
    """Split page texts from the abstracts PDF into one string per abstract.

    Each page is split at blank-line boundaries that are followed by a
    capitalised word or a numbered item. A chunk that matches any regex in
    TALK_TITLE_MARKERS starts a new entry, unless it contains '---' (a section
    header), in which case it is dropped. Any other chunk is treated as the
    continuation of the previous entry (e.g. text cut by a page break) and is
    appended to it.

    Args:
        text: iterable of page strings.

    Returns:
        list[str]: one stripped string per abstract entry.
    """
    entries = []

    for page_text in text:
        chunks = re.split(r"\n\s*\n(?=\s*(?:[A-Z][\sA-Za-z]|[0-9]\.))", page_text)
        for entry in chunks:
            chunk = entry.strip()
            if not chunk:
                continue

            # Markers are searched in the raw chunk so that surrounding
            # whitespace still counts towards the "7+ whitespace" rule.
            if any(re.search(marker, entry) for marker in TALK_TITLE_MARKERS):
                if '---' not in entry:
                    entries.append(chunk)
                # A chunk that starts (or is skipped as) an entry must not also
                # be treated as a continuation, otherwise it could be appended
                # onto itself.
                continue

            # Continuation text needs an existing entry to attach to.
            if not entries:
                continue

            first_char = chunk[0]
            if first_char.islower():
                entries[-1] += ' ' + chunk
            elif first_char.isupper():
                if not any(marker in entry for marker in TITLE_SECTION_MARKERS):
                    # Keep a line break after an entry ending in "University" so the
                    # affiliation line stays separate from the text that follows it.
                    if entries[-1].split(' ')[-1] == 'University':
                        join_string = ' \n'
                    else:
                        join_string = ' '
                    entries[-1] += join_string + chunk

    return entries

In [5]:
def group_program_entries(text):
    """Break program page texts into individual entry strings.

    Each page is split, in order, on time stamps ("8:30" followed by a newline),
    on a closing parenthesis at the end of a line, and on numbered list markers
    ("1." followed by a newline). Blank pieces and pieces containing
    "COFFEE BREAK" are discarded.

    Args:
        text: iterable of page strings.

    Returns:
        list[str]: the remaining pieces, in document order.
    """
    # Double the line-ending ')' so that one ')' survives the split on ')\n',
    # and drop the period in " vs. ".
    pages = [
        page.replace(')\n', '))\n').replace(' vs. ', ' vs ')
        for page in text
    ]

    pieces = []
    for page in pages:
        for timed_block in re.split(r"\d{1,2}:\d{2}\n", page):
            for closed_block in timed_block.split(')\n'):
                pieces.extend(re.split(r"[1-9]\.\n", closed_block))

    return [
        piece
        for piece in pieces
        if piece.strip() and "COFFEE BREAK" not in piece
    ]

In [6]:
# Substrings whose presence marks a piece of text as an affiliation rather than a name
AFFILIATION_KEYWORDS = [
    "University", "College", "Department", "Center", "Institute",
    "Laboratory", "School", "Hospital", "UC", "Centre", "Research",
    "Corporation", "Defence", "Université"
]

def parse_program_entry(year, entry_text):
    """
    Parse a program entry into title, authors, affiliations.

    The title is everything up to the first '.' or '?'; a terminating '?' is
    kept as part of the title, a terminating '.' is dropped. Affiliations are
    the parenthesised groups after the title that contain one of
    AFFILIATION_KEYWORDS. Authors are the remaining comma/'&'/'and'-separated
    pieces that have at least two capitalised tokens and no affiliation keyword.

    Args:
        year: value stored unchanged under the "year" key.
        entry_text: raw text of one program entry.

    Returns:
        None if the entry contains neither '.' nor '?', otherwise a dict with
        string fields:
          - "year"
          - "author(s)" (comma-separated)
          - "affiliation(s)" (semicolon-separated)
          - "title"
    """

    # ---------- 1. Normalize ----------
    text = entry_text.strip()

    # ---------- 2. Extract title ----------
    m = re.search(r"[.?]", text)
    if not m:
        return None

    # A question mark belongs to the title; a period is only a separator.
    title_end = m.end() if m.group() == "?" else m.start()
    title_raw = text[:title_end]
    remainder = text[m.end():]

    title = " ".join(title_raw.split())

    # ---------- 3. Extract affiliations ----------
    affiliations = re.findall(r"\(([^()]*)\)", remainder)

    affiliations = [
        " ".join(a.replace("\n", " ").split())
        for a in affiliations
        if any(k in a for k in AFFILIATION_KEYWORDS)
    ]

    affiliations_str = "; ".join(affiliations)

    # Remove every parenthesised group (affiliation or not) from the remainder
    remainder = re.sub(r"\([^()]*\)", "", remainder)

    # ---------- 4. Extract authors ----------
    remainder = remainder.replace("\n", " ")
    remainder = remainder.replace("&", ",")
    remainder = re.sub(r"\s+", " ", remainder)

    author_candidates = re.split(r",|\band\b", remainder)

    authors = []
    for a in author_candidates:
        a = a.strip()
        if not a:
            continue
        if any(k in a for k in AFFILIATION_KEYWORDS):
            continue

        # A name needs at least two capitalised tokens (e.g. first + last name)
        tokens = a.split()
        cap_tokens = sum(t[0].isupper() for t in tokens if t)

        if cap_tokens >= 2:
            authors.append(a)

    authors_str = ", ".join(authors)

    return {
        "year": year,
        "author(s)": authors_str,
        "affiliation(s)": affiliations_str,
        "title": title
    }

In [7]:
def normalize(s):
    """Lower-case `s`, collapse each whitespace run to one space, and trim the ends."""
    lowered = s.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()

In [8]:
def is_author_line(line: str) -> bool:
    """Return True when `line` has a comma and contains any AFFILIATION_KEYWORDS substring."""
    if ',' not in line:
        return False
    for keyword in AFFILIATION_KEYWORDS:
        if keyword in line:
            return True
    return False

In [9]:
def is_abstract_line(line: str) -> bool:
    """Return True for a line of at least six words of which fewer than half are capitalised."""
    if len(line.split()) < 6:
        return False
    return proportion_titlecase(line) < 0.5

In [45]:
def parse_abstract_entry(text: str) -> dict:
    """Split one abstract entry into title, author/affiliation text, and abstract body.

    The entry is broken into non-empty, stripped lines. The first line that
    passes `is_author_line` and has no ':' marks the end of the title. From
    that line on, lines are collected as authors/affiliations until the first
    line that passes `is_abstract_line`; everything from there on is the abstract.

    Returns:
        {} when no author line is found, otherwise a dict with the keys
        "title", "authors_affiliations" and "abstract" (each a
        space-joined string).
    """
    stripped = (raw.strip() for raw in text.split('\n'))
    lines = [ln for ln in stripped if ln]

    # Index of the first author line, or None if there is none
    author_idx = next(
        (idx for idx, ln in enumerate(lines) if ':' not in ln and is_author_line(ln)),
        None,
    )
    if author_idx is None:
        return {}

    # Advance to the first line that looks like abstract prose
    body_start = author_idx
    while body_start < len(lines) and not is_abstract_line(lines[body_start]):
        body_start += 1

    return {
        "title": ' '.join(lines[:author_idx]),
        "authors_affiliations": ' '.join(lines[author_idx:body_start]),
        "abstract": ' '.join(lines[body_start:]),
    }

In [11]:
def proportion_titlecase(s: str) -> float:
    """Fraction of alphabetic words in `s` whose first letter is uppercase (0 if there are no words)."""
    # Words start with a letter; digits and punctuation-only tokens are not counted
    tokens = re.findall(r"[A-Za-z][A-Za-z'-]*", s)
    total = len(tokens)
    if total == 0:
        return 0

    capitalised = [tok for tok in tokens if tok[0].isupper()]
    return len(capitalised) / total

In [12]:
# Explicit replacements applied before the generic Unicode-name pass.
# NOTE(review): "Æ" -> "ffi" looks like a workaround for a font in these PDFs
# that maps the ffi glyph to Æ — confirm before reusing on other documents.
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Æ": 'ffi'
}

def fix_ligatures(text):
    """Expand typographic ligature characters in `text` into plain letters.

    Characters listed in LIGATURE_MAP are replaced first. After that, any
    remaining character whose Unicode name is a LATIN ... LIGATURE is replaced
    by the lower-cased letters that follow the word "LIGATURE" in its name
    (e.g. "LATIN SMALL LIGATURE OE" -> "oe"). Non-Latin characters whose name
    merely contains "LIGATURE" (Arabic, Hebrew, combining marks, ...) are left
    untouched, because expanding their names would insert meaningless text.

    Args:
        text: the string to clean.

    Returns:
        str: the cleaned string.
    """
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Expand any other Latin ligature via its Unicode name
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        if name.startswith("LATIN ") and "LIGATURE" in name:
            # Take the letters after "LIGATURE", remove spaces and lowercase
            base = name.split("LIGATURE")[-1]
            cleaned_chars.append(base.replace(" ", "").lower())
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

# Program

There are 61 entries in total: 59 abstracts (50 talks and 9 posters) and 2 plenary talks.



## Grab text from the pdfs

In [13]:
year = '2002'

# Each list holds one plain-text string per PDF page: text[page_num] is that page's text.
# The documents are opened as context managers so the files are closed after extraction.
with pymupdf.open(pdfs_path + f'smp{year}_abstracts.pdf') as abstracts:
    abstract_text = [page.get_text('text') for page in abstracts]

with pymupdf.open(pdfs_path + f'smp{year}_program.pdf') as program:
    # first page is just title page / some info
    program_text = [page.get_text('text') for page in program[1:]]

In [14]:
program_text[0]

"3/11/2021\nProgram for the 35th Annual Meeting of the Society for Mathematical Psychology\nweb.archive.org/web/20070708163316/http://www.users.muohio.edu/thomasrd/program.html\n2/5\nMeeting Site \nMarcum Conference Center \nEvents \nThursday Night Reception at the Tavern at the Inn, 7 pm - 10pm.\nPoster Session and Reception (Cash Bar), Friday Evening 5:30-7pm, Heritage Room at Shriver Center.\nBanquet 7pm (following Poster Session), Heritage Room at Shriver Center. \nFriday, July 26, 2002\n_\nSensory Processing and Perception\nAnalysis of Response Time and Accuracy Models\nof Psychophysics and Decision\n8:30\nSpike Timing Variability Limits Motion\nDiscrimination. Joe Lappin*, Bart Borghuis**,\nDuje Tadin*, Martin Lankheet**, and Wim van\nde Grind**\n(*Vanderbilt Vision Research Center, Vanderbilt\nUniversity and ** Helmholtz Institute, Utrecht\nUniversity)\nAdaptive Techniques for Response Latencies.\nRagnar Steingrimsson (UC- Irvine)\n8:55\nDominance times for binocular rivalry and

In [15]:
abstract_text[0]

" \n \nConference Abstracts \n35th Annual Meeting of the Society for Mathematical Psychology  \nJuly 25-28, 2002, Miami University, Oxford, OH \n \n \nSensory Processing and Perception---------------------------------------------------------- \n \nSpike Timing Variability Limits Motion Discrimination                           Friday 8:30 a.m. \n Joe Lappin, Duje Tadin, Vanderbilt University, Bart Borghuis, Martin Lankheet, Wim van de Grind, \nUtrecht University  \nThe visible structure of moving stimulus patterns must be fully represented by responses of retinal \nneurons.  (Entropy cannot be reduced by later cortical mechanisms and cognitive processes.)  Because \nmotion perception depends on correlations between multiple input signals, discriminations of moving \nimages should be limited by the temporal reliability of spike trains at the first stages of vision.  To study the \ntemporal limits of early visual information, we have (a) developed a method to describe the timing \nvariabi

## Clean up entries

### Abstract

In [16]:
removed_lines = []

# Expand ligature characters ('fi', 'ffi', 'fl', etc.) on every page and drop pages with no text
abstract_strings = [
    cleaned_page
    for cleaned_page in map(fix_ligatures, abstract_text)
    if cleaned_page
]

In [17]:
abstract_strings[0]

" \n \nConference Abstracts \n35th Annual Meeting of the Society for Mathematical Psychology  \nJuly 25-28, 2002, Miami University, Oxford, OH \n \n \nSensory Processing and Perception---------------------------------------------------------- \n \nSpike Timing Variability Limits Motion Discrimination                           Friday 8:30 a.m. \n Joe Lappin, Duje Tadin, Vanderbilt University, Bart Borghuis, Martin Lankheet, Wim van de Grind, \nUtrecht University  \nThe visible structure of moving stimulus patterns must be fully represented by responses of retinal \nneurons.  (Entropy cannot be reduced by later cortical mechanisms and cognitive processes.)  Because \nmotion perception depends on correlations between multiple input signals, discriminations of moving \nimages should be limited by the temporal reliability of spike trains at the first stages of vision.  To study the \ntemporal limits of early visual information, we have (a) developed a method to describe the timing \nvariabi

In [18]:
# Splits each page into per-abstract entries and re-attaches text that was cut off by a page break
fixed_abstract_strings = group_abstract_entries(abstract_strings)

In [19]:
fixed_abstract_strings[0]

"Spike Timing Variability Limits Motion Discrimination                           Friday 8:30 a.m. \n Joe Lappin, Duje Tadin, Vanderbilt University, Bart Borghuis, Martin Lankheet, Wim van de Grind, \nUtrecht University  \nThe visible structure of moving stimulus patterns must be fully represented by responses of retinal \nneurons.  (Entropy cannot be reduced by later cortical mechanisms and cognitive processes.)  Because \nmotion perception depends on correlations between multiple input signals, discriminations of moving \nimages should be limited by the temporal reliability of spike trains at the first stages of vision.  To study the \ntemporal limits of early visual information, we have (a) developed a method to describe the timing \nvariability of spike trains, (b) evaluated the spike timing variability of cat retinal ganglion and LGN cells in \nresponse to moving gratings of varied contrast and temporal frequency, (c) analyzed these temporal effects \non a simple correlational mode

### Program

In [20]:
# Expand ligature characters on every program page, skip pages with no text,
# and drop the first 4 lines of each page (the print header: date, document
# title, URL, and page counter) before re-joining the page into one string.
program_strings = [
    '\n'.join(cleaned_page.split('\n')[4:])
    for cleaned_page in map(fix_ligatures, program_text)
    if cleaned_page
]

In [21]:
program_strings[0]

"Meeting Site \nMarcum Conference Center \nEvents \nThursday Night Reception at the Tavern at the Inn, 7 pm - 10pm.\nPoster Session and Reception (Cash Bar), Friday Evening 5:30-7pm, Heritage Room at Shriver Center.\nBanquet 7pm (following Poster Session), Heritage Room at Shriver Center. \nFriday, July 26, 2002\n_\nSensory Processing and Perception\nAnalysis of Response Time and Accuracy Models\nof Psychophysics and Decision\n8:30\nSpike Timing Variability Limits Motion\nDiscrimination. Joe Lappin*, Bart Borghuis**,\nDuje Tadin*, Martin Lankheet**, and Wim van\nde Grind**\n(*Vanderbilt Vision Research Center, Vanderbilt\nUniversity and ** Helmholtz Institute, Utrecht\nUniversity)\nAdaptive Techniques for Response Latencies.\nRagnar Steingrimsson (UC- Irvine)\n8:55\nDominance times for binocular rivalry and\nambiguous motion rivalry share individual\ndiffferences. Keith D. White (University of\nFlorida), John D. Pettigrew (University of\nQueensland)\nTesting Capacity Hypotheses: Applyi

In [22]:
# Split the program pages into individual entries (on time stamps, line-ending ')' and numbered items)
fixed_program_strings = group_program_entries(program_strings)

In [23]:
fixed_program_strings[:5]

['Meeting Site \nMarcum Conference Center \nEvents \nThursday Night Reception at the Tavern at the Inn, 7 pm - 10pm.\nPoster Session and Reception (Cash Bar), Friday Evening 5:30-7pm, Heritage Room at Shriver Center.\nBanquet 7pm (following Poster Session), Heritage Room at Shriver Center. \nFriday, July 26, 2002\n_\nSensory Processing and Perception\nAnalysis of Response Time and Accuracy Models\nof Psychophysics and Decision\n',
 'Spike Timing Variability Limits Motion\nDiscrimination. Joe Lappin*, Bart Borghuis**,\nDuje Tadin*, Martin Lankheet**, and Wim van\nde Grind**\n(*Vanderbilt Vision Research Center, Vanderbilt\nUniversity and ** Helmholtz Institute, Utrecht\nUniversity)',
 'Adaptive Techniques for Response Latencies.\nRagnar Steingrimsson (UC- Irvine)',
 'Dominance times for binocular rivalry and\nambiguous motion rivalry share individual\ndiffferences. Keith D. White (University of\nFlorida), John D. Pettigrew (University of\nQueensland)',
 'Testing Capacity Hypotheses: App

## Collect title, authors, and affiliations from program pdf

In [24]:
# Parse every program entry; entries the parser rejects (it returns None) are left out
parsed_entries = [
    parsed_entry
    for parsed_entry in (
        parse_program_entry(year, raw_entry) for raw_entry in fixed_program_strings
    )
    if parsed_entry
]

In [25]:
parsed_entries[:5]

[{'year': '2002',
  'author(s)': 'Poster Session, Friday Evening 5:30-7pm, 2002 _ Sensory Processing, Perception Analysis of Response Time, Accuracy Models of Psychophysics',
  'affiliation(s)': '',
  'title': 'Meeting Site Marcum Conference Center Events Thursday Night Reception at the Tavern at the Inn, 7 pm - 10pm'},
 {'year': '2002',
  'author(s)': 'Joe Lappin*, Bart Borghuis**, Duje Tadin*, Martin Lankheet**, Wim van de Grind**',
  'affiliation(s)': '*Vanderbilt Vision Research Center, Vanderbilt University and ** Helmholtz Institute, Utrecht University',
  'title': 'Spike Timing Variability Limits Motion Discrimination'},
 {'year': '2002',
  'author(s)': 'Ragnar Steingrimsson',
  'affiliation(s)': 'UC- Irvine',
  'title': 'Adaptive Techniques for Response Latencies'},
 {'year': '2002',
  'author(s)': 'Keith D. White, John D. Pettigrew',
  'affiliation(s)': 'University of Florida; University of Queensland',
  'title': 'Dominance times for binocular rivalry and ambiguous motion riv

## Match abstracts to program entries

In [46]:
# Index the parsed program entries by normalized title. The dicts are shared
# with parsed_entries, so adding an "abstract" key here updates that list too.
lookup = {normalize(e["title"]): e for e in parsed_entries}
titles = list(lookup.keys())

SIM_THRESHOLD = 70  # minimum similarity score for accepting a fuzzy title match

for entry in fixed_abstract_strings:
    parsed = parse_abstract_entry(entry)

    title = parsed.get("title")
    if not title:
        continue

    key = normalize(title)

    # Prefer an exact title match; otherwise fall back to the best fuzzy match
    target_key = key if key in lookup else None
    if target_key is None:
        match = process.extractOne(
            key,
            titles,
            scorer=fuzz.token_sort_ratio
        )
        # match is (matched title, score, index) or None
        if match and match[1] >= SIM_THRESHOLD:
            target_key = match[0]

    if target_key is not None:
        lookup[target_key]["abstract"] = parsed["abstract"]

In [47]:
parsed_entries[1:3]

[{'year': '2002',
  'author(s)': 'Joe Lappin*, Bart Borghuis**, Duje Tadin*, Martin Lankheet**, Wim van de Grind**',
  'affiliation(s)': '*Vanderbilt Vision Research Center, Vanderbilt University and ** Helmholtz Institute, Utrecht University',
  'title': 'Spike Timing Variability Limits Motion Discrimination',
  'abstract': "The visible structure of moving stimulus patterns must be fully represented by responses of retinal neurons.  (Entropy cannot be reduced by later cortical mechanisms and cognitive processes.)  Because motion perception depends on correlations between multiple input signals, discriminations of moving images should be limited by the temporal reliability of spike trains at the first stages of vision.  To study the temporal limits of early visual information, we have (a) developed a method to describe the timing variability of spike trains, (b) evaluated the spike timing variability of cat retinal ganglion and LGN cells in response to moving gratings of varied contras

## Create df and convert to csv

In [48]:
# Skip the first parsed entry: it is the page preamble (meeting site / events text), not a talk.
# Entries that never received an "abstract" key get a missing value in that column.
df = pd.DataFrame(parsed_entries[1:], columns=["year", "author(s)", "affiliation(s)", "title", "abstract"])

In [50]:
df.head(5)

Unnamed: 0,year,author(s),affiliation(s),title,abstract
0,2002,"Joe Lappin*, Bart Borghuis**, Duje Tadin*, Mar...","*Vanderbilt Vision Research Center, Vanderbilt...",Spike Timing Variability Limits Motion Discrim...,The visible structure of moving stimulus patte...
1,2002,Ragnar Steingrimsson,UC- Irvine,Adaptive Techniques for Response Latencies,Using adaptive procedures to estimate selectiv...
2,2002,"Keith D. White, John D. Pettigrew",University of Florida; University of Queensland,Dominance times for binocular rivalry and ambi...,Binocular rivalry can be produced by stimulati...
3,2002,"Michael J. Wenger, Christof Schuster, James T....",University of Notre Dame; Indiana University,Testing Capacity Hypotheses: Applying Proporti...,Proportional hazards (Cox) regression models w...
4,2002,Keith K. Niall,"Defence R&D Canada, Toronto",Perspicuity without depth: 'mental rotation' a...,In some typical judgments on perspective pictu...


In [51]:
# Save the table to Drive as smp<year>_program.csv; index=False leaves the row index out of the file
df.to_csv(f"/content/drive/MyDrive/math_psych_work/smp{year}_program.csv", index=False)