<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp1998_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
!pip install pymupdf
import glob
import os
import pathlib
import pymupdf
import re
import pandas as pd
import unicodedata

In [4]:
# Drive folder holding the conference program PDFs. Keep the trailing slash:
# file names are appended to this string by plain concatenation.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.1

In [26]:
# Explicit replacements, applied before the Unicode-name fallback in
# fix_ligatures().
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    # U+FB05 is "LATIN SMALL LIGATURE LONG S T" (long s + t), i.e. "st", not "ft"
    "ﬅ": "st",
    "ﬆ": "st",
    # NOTE(review): presumably a workaround for how this PDF's font encodes the
    # "ffi" ligature; it also rewrites any genuine "Æ" in the text -- confirm.
    "Æ": 'ffi'
}

def fix_ligatures(text):
    """Expand typographic ligature characters in ``text`` into plain letters.

    Characters listed in ``LIGATURE_MAP`` are replaced first. Any remaining
    character whose Unicode name is ``LATIN SMALL LIGATURE <letters>`` or
    ``LATIN CAPITAL LIGATURE <letters>`` is then replaced by those letters,
    lowercase or uppercase respectively (e.g. "œ" -> "oe", "Œ" -> "OE").

    Ligatures from other scripts (Arabic, Armenian, ...) and characters
    without a Unicode name (such as private-use code points) are left
    unchanged: their names do not spell out a usable Latin expansion.

    Args:
        text: The string to clean.

    Returns:
        A new string with the ligatures expanded.
    """
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    small_prefix = "LATIN SMALL LIGATURE "
    capital_prefix = "LATIN CAPITAL LIGATURE "

    cleaned_chars = []
    for ch in text:
        # Unnamed characters yield "" and therefore fall through untouched
        name = unicodedata.name(ch, "")
        if name.startswith(small_prefix):
            # The rest of the name is the component letters, e.g. "F F I"
            letters = name[len(small_prefix):].replace(" ", "")
            cleaned_chars.append(letters.lower())
        elif name.startswith(capital_prefix):
            # Unicode names are uppercase already, which is the case we want
            letters = name[len(capital_prefix):].replace(" ", "")
            cleaned_chars.append(letters)
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

In [5]:
def is_talk_entry(text):
    """
    Decide heuristically whether a program entry describes a talk.

    An entry counts as a talk when it resembles
    "Title. Authors, Affiliation. Abstract", i.e. when all three checks
    below hold. Returns True or False.
    """
    candidate = text.strip()

    # Title and affiliation each end with a period, so expect at least two
    enough_periods = candidate.count(".") >= 2

    # The authors part is separated from the affiliation by a comma
    has_comma = "," in candidate

    # A period followed by whitespace and a capital marks the start of the
    # authors section
    starts_new_section = re.search(r"\.\s+[A-Z]", candidate) is not None

    return enough_periods and has_comma and starts_new_section

In [6]:
def _talk_row(year, authors="", affiliation="", title="", abstract=""):
    """Build one output record; fields that were not parsed stay empty."""
    return {
        "year": year,
        "author(s)": authors,
        "affiliation(s)": affiliation,
        "title": title,
        "type": "",
        "abstract": abstract
    }


def parse_talk(entry, year=1998):
    """Split one program entry into title, authors, affiliation and abstract.

    The expected shape is "[time] Title. Authors, Affiliation. Abstract".
    Parsing is heuristic: the title ends at the first period that is followed
    by whitespace and a capital letter, the authors are everything up to the
    first comma after that, and the affiliation runs to the next period. For
    entries with several author/affiliation pairs, the later pairs therefore
    end up inside the affiliation field.

    Args:
        entry: The entry text (one whitespace-normalized string).
        year: Value stored in the "year" field of the returned row.

    Returns:
        A dict with keys "year", "author(s)", "affiliation(s)", "title",
        "type" and "abstract". "type" is always "". If the entry does not
        look like a talk every text field is ""; if only part of the pattern
        matches, the unmatched fields are "".
    """
    raw = entry.strip()

    # 0. If not a talk, return empty row
    if not is_talk_entry(raw):
        return _talk_row(year)

    # 1. Remove time prefix (8:00, 10:30-12:00, etc.)
    cleaned = re.sub(r"^\s*\d{1,2}:\d{2}(?:-\d{1,2}:\d{2})?\s+", "", raw)

    # 2. Separate TITLE from REST (authors + affiliation + abstract)
    title_match = re.match(r"^(.*?)\.\s+([A-Z].+)$", cleaned)
    if not title_match:
        # fallback case: keep the whole text as the title
        return _talk_row(year, title=cleaned)

    title = title_match.group(1).strip()
    remainder = title_match.group(2).strip()

    # 3. Separate authors, affiliation, abstract
    parts_match = re.match(r"^(.*?)\s*,\s*(.*?)\.\s*(.*)$", remainder)
    if not parts_match:
        return _talk_row(year, title=title, abstract=remainder)

    authors = parts_match.group(1).strip()
    affiliation = parts_match.group(2).strip()
    abstract = parts_match.group(3).strip()

    # "A and B" -> "A, B". The word boundary keeps a name that merely starts
    # with "and" from being rewritten.
    authors = re.sub(r"\s+and\b", ",", authors)

    return _talk_row(year, authors=authors, affiliation=affiliation,
                     title=title, abstract=abstract)

# Program

There are 66 talks (2 plenary) and 13 posters, so 79 total.

## Grab text from SMP 1998 program

In [16]:
year = '1998'

# Opening the PDF in a `with` block releases the file handle even if text
# extraction raises part-way through.
with pymupdf.open(pdfs_path + f'smp{year}_program.pdf') as program:
  # Grabs text from each page in the doc.
  # Indices are program_text[page_num][piece_of_text]; each piece is a block
  # tuple whose text is at index 4.
  program_text = [page.get_text('blocks') for page in program]

In [17]:
# Peek at the first three blocks of the third page to check the tuple layout
# (the block's text is the element at index 4).
program_text[2][:3]

[(76.89540100097656,
  74.44743347167969,
  534.818603515625,
  96.96625518798828,
  'accurately recovered from the distribution function than from the density functions, and that for certain models, it is \ndifﬁcult to recover accurate parameters under any circumstances.\n',
  0,
  0),
 (76.89540100097656,
  111.16289520263672,
  536.7532348632812,
  301.59381103515625,
  '8:50\xa0Statistical Models for Single-Process Response Time.\xa0Richard A. Chechile,\xa0Tufts \nUniversity.\nIt is important to distinguish between purely statistical models of data and psychological models. For example, with \nmultinomial process-tree research, it is possible to have a number of different psychological models for the same \nstatistical model of the categorical data (i.e., the ultinomial likelihood distribution). Also, there is uniform \nagreement among researchers on the speciﬁcation of the multinomial likelihood function, and this function plays a \ncrucial role in estimating the parameters for al

## Clean up entries

In [34]:
# Word lists for every block on every page, in document order.
# Index 4 of a block tuple is its text; fix_ligatures breaks up characters
# like 'fi', 'ffi', 'fl', etc. before the text is split into words.
tokenized_blocks = (
    fix_ligatures(block_tuple[4]).split()
    for page_blocks in program_text
    for block_tuple in page_blocks
)

# Blocks without any words are skipped; the rest are joined back into one
# string with single spaces between the words.
program_strings = [' '.join(tokens) for tokens in tokenized_blocks if tokens]

## Collect authors, affiliations, and title

In [35]:
program_list = []
to_fix_list = []

# Filters out a good number of entries that aren't about the talks.
# A threshold of 5 words gets rid of some non-talk entries without removing
# abstract text split by a page break.
long_entries = [entry for entry in program_strings if len(entry.split()) > 5]

# Fixes abstract entries separated by new page.
# A new list is built instead of deleting from the list being iterated:
# deleting while enumerating shifts the remaining items, so the entry right
# after each merge would never be examined.
merged_entries = []
separated_text = []  # the fragments that were merged, for checking if all were caught
for entry in long_entries:
  is_continuation = False
  if merged_entries:  # the very first entry has nothing to be appended to
    first_word = entry.split()[0]
    prev_last_word = merged_entries[-1].split()[-1]

    if entry[0].islower():
      # Text that starts with a lowercase letter continues the previous entry
      is_continuation = True
    elif first_word[0].isupper() and (prev_last_word[0].isupper()
                                      or prev_last_word in ('and', '&')):
      # Previous entry was cut off mid-phrase: it ends in a capitalized word
      # or a conjunction, and this one starts with a capitalized word
      is_continuation = True

  if is_continuation:
    merged_entries[-1] += ' ' + entry
    separated_text.append(entry)
  else:
    merged_entries.append(entry)

filtered_program_strings = merged_entries

In [36]:
# One row dict per entry; entries that are not talks come back with blank fields
parsed_entries = list(map(parse_talk, filtered_program_strings))

In [37]:
all_rows = pd.DataFrame(
    parsed_entries,
    columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"],
)

# A row is kept only when both the author and the affiliation fields are
# present and longer than 3 characters
author_ok = all_rows["author(s)"].notna() & (all_rows["author(s)"].str.len() > 3)
affiliation_ok = (
    all_rows["affiliation(s)"].notna()
    & (all_rows["affiliation(s)"].str.len() > 3)
)

df = all_rows[author_ok & affiliation_ok]

In [38]:
# Show the first ten remaining rows as a sanity check
df.head(10)

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
5,1998,"Barbara Bruhns Frey, Keith Clayton",Vanderbilt University,The Application of Nonlinear Dynamical Time Se...,,Nonlinear dynamical time series measures can a...
6,1998,"Trisha Van Zandt, Steven Yantis",The Johns Hopkins University,Parameter Estimation by Fits to Density and Di...,,There are a number of ways to estimate a model...
7,1998,Richard A. Chechile,Tufts University,Statistical Models for Single-Process Response...,,It is important to distinguish between purely ...
8,1998,Stephen W. Link,McMaster University,Wave Theory and Reaction Time,,New relations between RTs provide insights int...
9,1998,Roger Ratcliff,"Northwestern University, and Anjali Thapar, Wi...","Aging, Reaction Time, and a Diffusion Model",,Forty old and forty young adults were tested o...
11,1998,Elke U. Weber,The Ohio State University,Effects of Decision Content on Decision Processes,,This talk reviews existing evidence that the c...
12,1998,Yuri Tada,The Ohio State University,Psychological Space of Decision Similarity,,In exploring factors that determine the choice...
13,1998,Ann-Renee Blais,The Ohio State University,"Women, Decision Content and Other Dangerous Th...",,Research in Judgment and Decision Making shows...
14,1998,Eric Fein,The Ohio State University,Content-Specific Methods of Decision Making,,Most theories of decision making either disreg...
16,1998,David N. Levin,University of Chicago,A Differential Geometric Description of Percep...,,"Previous reports (Levin, J. Math. Psych., acce..."


## Convert to CSV file

In [39]:
# Write the table to Drive; index=False leaves the row index out of the file
csv_path = f"/content/drive/MyDrive/math_psych_work/smp{year}_program.csv"
df.to_csv(csv_path, index=False)