<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2001_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pymupdf
import glob
import os
import pathlib
import pymupdf
import re
import pandas as pd
import unicodedata

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m119.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7


In [4]:
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.1, but some are my own code just turned into a function.

In [5]:
SECTION_MARKERS = {"Poster", "Fri", "Sat", "Sun"}

def group_entries(lines):
    entries = []
    current = []

    for ln in lines:
        stripped = ln.strip()
        if not stripped:
            continue  # skip blank lines

        # Check if this starts a new section
        if any(stripped.startswith(marker) for marker in SECTION_MARKERS):
            # Save the previous entry if it exists
            if current:
                entries.append(" ".join(current).strip())
            current = [stripped]
        else:
            current.append(stripped)

    # Add the final entry
    if current:
        entries.append(" ".join(current).strip())

    return entries

In [6]:
def proportion_titlecase(s: str) -> float:
    """Return proportion of words starting with uppercase letters."""
    words = re.findall(r"[A-Za-z][A-Za-z'-]*", s)  # ignore numbers/punctuation
    if not words:
        return 0

    upper = sum(1 for w in words if w[0].isupper())
    return upper / len(words)

In [7]:
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Æ": 'ffi'
}

def fix_ligatures(text):
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Replace any private-use ligature (common in PDFs)
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        if "LIGATURE" in name.upper():
            # Try to break it apart: remove spaces and lowercase
            base = name.split("LIGATURE")[-1]
            base = base.replace(" ", "").lower()
            cleaned_chars.append(base)
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

In [37]:
LOC_MARKERS = ['H2', 'Carmichael']

def clean_title(title):
    cleaned = []
    for line in title.split('\n'):
        if not line:
            continue

        words = line.split(' ')
        first = words[0]

        if first[0].isdigit() or any(first.startswith(m) for m in LOC_MARKERS):
            cleaned.append(' '.join(words[1:]))
        else:
            cleaned.append(line)

    return ' '.join(cleaned)

In [38]:
def split_author_affiliation(auth_aff):
    authors = []
    affiliations = []

    lines = auth_aff.split('\n')

    for i, line in enumerate(lines):
        if not line:
            continue

        # Case 1: presenter explicitly marked
        if '(presenter)' in line:
            name, aff = line.split('(presenter)', 1)
            authors.append(name.strip())
            affiliations.append(aff.strip())
            continue

        words = line.split(' ')

        # Case 2: single-word continuation line → attach to previous affiliation
        if ' ' not in line:
            affiliations[-1] += ' ' + line
            continue

        # Case 3: initials present (e.g., "Albert J. Ahumada")
        if '.' in words[1]:
            last_name_index = None
            for j, word in enumerate(words):
                if j == 0 or '.' in word:
                    continue
                last_name_index = j + 1
                break

            authors.append(' '.join(words[:last_name_index]))
            affiliations.append(' '.join(words[last_name_index:]))

        # Case 4: normal First Last pattern
        else:
            authors.append(' '.join(words[:2]))
            affiliations.append(' '.join(words[2:]))

    return authors, affiliations

# Program
There are 57 abstracts total.



## Grab text from the abstract pdf

In [8]:
year = '2001'
abstracts = pymupdf.open(pdfs_path + f'smp{year}_abstracts.pdf')

# For these, indices are text[page_num][piece_of_text]
program_text = []
abstract_text = []

# Grabs text from each page in the doc
for page in abstracts:
  abstract_text.append(page.get_text('blocks'))

In [29]:
abstract_text[0][8:]

[(43.01347351074219,
  292.98785400390625,
  564.9149169921875,
  507.6805725097656,
  "Poster\nDecomposing the Learning Curve in Serial Recall: Theory and Data \nKelly Addis (presenter) Volen Center for Complex Systems, Brandeis University\nMichael Kahana Volen Center for Complex Systems, Brandeis University\nOur current understanding of serial learning relies on the form of the learning curve and on\nthe changes in the serial position curve over repeated study-test trials (Ward, 1937). Modern\nvariants of classical chaining theory provide an adequate ﬁt to these data (e.g.,\nLewandowsky and Murdock, 1989). The averaging of data that produces these functions\nobscures the detailed history of individual items over the course of study-test trials.\nExtending Tulving's (1964) analysis of free recall learning, we present a new analysis of\nserial learning that tracks the acquisition and forgetting of item and order information, at the\nlevel of individual items. Applying this analysis to 

## Clean up entries

In [20]:
abstract_strings = []
removed_lines = []
for block in abstract_text:
  for entry_tuple in block:
    # Extract the text string from the tuple entry
    entry = entry_tuple[4]
    # Break up characters like 'fi', 'ffi', 'fl', etc.
    cleaned_entry = fix_ligatures(entry)

    if not entry:  # if no words in entry
      continue

    if cleaned_entry == '3/11/2021\nMathematical Psychology 2001 Brown University\n'\
    or 'web.archive.org/web/' in cleaned_entry:
      removed_lines.append(cleaned_entry)
      continue

    abstract_strings.append(cleaned_entry)  # turns the word list back into one string

In [21]:
# Groups text in between lines that start w/ words from left column ("Poster", "Fri", etc.)
fixed_abstract_strings = group_entries(abstract_strings)

In [24]:
fixed_abstract_strings[1:3]

["Poster\nDecomposing the Learning Curve in Serial Recall: Theory and Data \nKelly Addis (presenter) Volen Center for Complex Systems, Brandeis University\nMichael Kahana Volen Center for Complex Systems, Brandeis University\nOur current understanding of serial learning relies on the form of the learning curve and on\nthe changes in the serial position curve over repeated study-test trials (Ward, 1937). Modern\nvariants of classical chaining theory provide an adequate fit to these data (e.g.,\nLewandowsky and Murdock, 1989). The averaging of data that produces these functions\nobscures the detailed history of individual items over the course of study-test trials.\nExtending Tulving's (1964) analysis of free recall learning, we present a new analysis of\nserial learning that tracks the acquisition and forgetting of item and order information, at the\nlevel of individual items. Applying this analysis to a large data set on serial list learning we\nshow that several variants of chaining t

## Sort abstracts from title, author, affiliation
Based on proportion of capitalized/titlecase words in each line of text. Also separates title from author and affiliation.

In [25]:
# Sorts text based on proportion of capitalized/titlecase words
first_pass_list = []

for entry in fixed_abstract_strings:
  line_list = entry.split('\n')

  temp_info_list = []
  temp_abstract_list = []
  for i, line in enumerate(line_list):
    line_words = line.split(' ')

    # Skip lines that are mostly titlecase (likely titles/authors)
    if (proportion_titlecase(line) >= 0.5) and (i < 10):  # also makes sure don't get
                                                          # last line of abstract if
                                                          # it is just a capitalized word
      if line_words:
        line = ' '.join(line_words) + '\n'
      temp_info_list.append(line)
    # Rest is likely abstract text
    else:
      if line_words and ':' in line_words[0]: # remove time from beginning of abstract
        line_words = line_words[1:]
        line = ' '.join(line_words)

      # Skip empty lines
      if line.strip():
        temp_abstract_list.append(line)

  entry_dict = {
      # 'title_text':
      'info_text': ' '.join(temp_info_list).strip(),
      'abstract_text': ' '.join(temp_abstract_list).strip()
  }
  first_pass_list.append(entry_dict)

In [28]:
first_pass_list[1:3]

[{'info_text': 'Poster\n Decomposing the Learning Curve in Serial Recall: Theory and Data \n Kelly Addis (presenter) Volen Center for Complex Systems, Brandeis University\n Michael Kahana Volen Center for Complex Systems, Brandeis University',
  'abstract_text': "Our current understanding of serial learning relies on the form of the learning curve and on the changes in the serial position curve over repeated study-test trials (Ward, 1937). Modern variants of classical chaining theory provide an adequate fit to these data (e.g., Lewandowsky and Murdock, 1989). The averaging of data that produces these functions obscures the detailed history of individual items over the course of study-test trials. Extending Tulving's (1964) analysis of free recall learning, we present a new analysis of serial learning that tracks the acquisition and forgetting of item and order information, at the level of individual items. Applying this analysis to a large data set on serial list learning we show that 

In [30]:
# Sorts out title from authors and affiliations
aff_words = ['University', 'College', 'Department', 'Center', '(presenter)', 'School']
# unsorted = []
title_and_auth_aff_list = []

for entry in first_pass_list[1:]: # first entry is not a talk
  info_lines = entry['info_text'].split('\n')
  no_date_entry = '\n'.join(info_lines[1:]).strip()

  temp_title = []
  temp_auth_aff = []
  for i, line in enumerate(no_date_entry.split('\n')):
    # Adds first line as title
    if i == 0:
      temp_title.append(line)
    elif (',' in line) or (any(word in line for word in aff_words)):
      temp_auth_aff.append(line.strip())
    else:
      if i == 1:
        temp_title.append(line.strip())
      elif i > 1:
        line_words = line.split(' ')
        if len(line_words) == 1:
          temp_auth_aff[i-1] += ' ' + line
        else:
          temp_auth_aff.append(line.strip())
        # unsorted.append(line.strip())

  title_and_auth_aff_list.append(['\n'.join(temp_title).strip(),
      '\n'.join(temp_auth_aff).rstrip('\n')
  ])

In [31]:
title_and_auth_aff_list[:5]

[['Decomposing the Learning Curve in Serial Recall: Theory and Data',
  'Kelly Addis (presenter) Volen Center for Complex Systems, Brandeis University\nMichael Kahana Volen Center for Complex Systems, Brandeis University'],
 ['H265 Functional Measurement of Color Masking',
  'Albert J. Ahumada, Jr. NASA Ames Research Center'],
 ["H206 Crucial Errors in Miller's (1956) Application of Information Metrics in a Comparison\nof the Spans of Absolute Judgment and Immediate Memory: A Proposal",
  'Bruce L. Bachelder Psychological & Educational Services, Morganton, NC'],
 ['H206 Evidence for a Localized Learning Mechanism for Self-Excitatory, Lateral-Inhibitory\nDecision Networks',
  'Scott Brown (presenter) School of Behavioral Science, University of Newcastle, Australia\nAndrew Heathcote School of Behavioral Science, University of Newcastle, Australia'],
 ['H206 A Solution for Choice Probabilities and Decision Times for Diffusion Models with\nThree Alternatives',
  'Jerome R. Busemeyer Psycho

In [40]:
entry_list = []
LOC_MARKERS = ['H2', 'Carmichael']
for e, [title, auth_aff] in enumerate(title_and_auth_aff_list):
  cleaned_title = clean_title(title)
  authors, affiliations = split_author_affiliation(auth_aff)

  entry_list.append({
        "year": 2001,
        "author(s)": ', '.join(authors),
        "affiliation(s)": ', '.join(affiliations),
        "title": cleaned_title,
        "abstract": first_pass_list[e + 1]['abstract_text']
    })

In [41]:
df = pd.DataFrame(entry_list, columns=["year", "author(s)", "affiliation(s)", "title", "abstract"])

In [42]:
df.head(7)

Unnamed: 0,year,author(s),affiliation(s),title,abstract
0,2001,"Kelly Addis, Michael Kahana","Volen Center for Complex Systems, Brandeis Uni...",Decomposing the Learning Curve in Serial Recal...,Our current understanding of serial learning r...
1,2001,"Albert J. Ahumada,",Jr. NASA Ames Research Center,Functional Measurement of Color Masking,Norman Anderson developed functional measureme...
2,2001,Bruce L. Bachelder,"Psychological & Educational Services, Morganto...",Crucial Errors in Miller's (1956) Application ...,"According to Shiffrin & Nosofsky (1994, p. 360..."
3,2001,"Scott Brown, Andrew Heathcote","School of Behavioral Science, University of Ne...",Evidence for a Localized Learning Mechanism fo...,"Recently, several models of response time (RT)..."
4,2001,Jerome R. Busemeyer,"Psychology Department, Indiana University",A Solution for Choice Probabilities and Decisi...,There is a convergence of evidence favoring th...
5,2001,"Jeremy B. Caplan, Michael J. Kahana","Volen Center for Complex Systems, Brandeis Uni...",Probing Serial Order Memory,"Serial recall, a task widely used to study ser..."
6,2001,Richard A. Chechile,"Psychology Department, Tufts University",The Measure and Mismeasure of Storage and Retr...,Chechile & Meyer (1976) developed a class of m...
7,2001,"Russell M. Church, Kimberly Kirkpatrick","Department of Psychology, Brown University, De...",A Description and Evaluation of Packet Theory,Simple timing and conditioning procedures can ...
8,2001,James E. Corter,"Teachers College, Columbia University",Mixed Risky/Risk-Free Strategies are Non-Optimal,"Yuh-Jia Chen Teachers College, Columbia Univer..."
9,2001,Denis Cousineau,"Departement de psychologie, Université de Mont...",Redundancy Conjecture and Super-Capacity Rate ...,"Both Miller inequality (Miller, 1982) and the ..."


## Convert to CSV file

In [157]:
df.to_csv(f"/content/drive/MyDrive/math_psych_work/smp{year}_program.csv", index=False)