<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2020_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
# Colab-only: mounts Google Drive at /content/drive so the files under
# /content/drive/MyDrive used below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Suppresses output from pip installs
%%capture

!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install wordfreq
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import regex
import pandas as pd
import unicodedata
import wordfreq
from rapidfuzz import process, fuzz

In [3]:
# Google Drive folder that holds the conference program PDFs; the file for a
# given year is read from here as Smp<year>.pdf.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2; some are my own code that I turned into functions.

In [4]:
# Place names (countries, states, cities) that may trail an affiliation after
# a comma. Order is preserved from the original list: longer variants such as
# 'United States of America' come before their shorter prefixes.
LOCATIONS = [
    'United States of America',
    'United States',
    'Switzerland',
    'Japan',
    'Bremen',
    'Berlin, Germany',
    'Heidelberg, Germany',
    'Germany',
    'Berlin',
    'Norway',
    'Turkey',
    'Belgium',
    'Italy',
    'Israel',
    'New Brunswick, New Jersey',
    'Australia',
    'The Netherlands',
    'USA',
    'US',
    'Netherlands, The',
    'Netherlands',
    'United Kingdom',
    'Singapore',
    'France',
    'Dayton OH',
    'Dayton',
    'India',
    'Taiwan, Republic of China',
    'Austria',
    'Canada',
    'Denmark',
    'Spain',
    'Edmonton',
    'Bloomington, Indiana',
    'Indiana',
    'Russian Federation',
    'University Park, Pennsylvania',
    'California',
    'San Francisco, California',
    'Taipei, Taiwan',
    'Charlottesville, Virginia',
    'New York, New York',
    'Toronto, Ontario',
    'New Haven, Connecticut',
    'Ann Arbor, Michigan',
    'Ohio',
    'Ottawa, Ontario',
    'Houston, Texas',
    'UK',
    'New Brunswick, Piscataway, NJ',
    'Finland',
    'Iceland',
    'Mexico',
    'South Korea',
]

# Compiled once: a comma, optional whitespace, then one of the escaped
# location names ending on a word boundary; matching ignores case.
LOCATION_RE = re.compile(
    r',\s*(?:%s)\b' % '|'.join(re.escape(place) for place in LOCATIONS),
    flags=re.IGNORECASE,
)


def remove_locations(entry: str) -> str:
    """Remove every ", <location>" occurrence found in LOCATIONS from `entry`.

    Returns the remaining text with leading/trailing whitespace stripped.
    """
    without_places = LOCATION_RE.sub('', entry)
    return without_places.strip()

In [5]:
def parse_authors(line: str) -> list[str]:
    """Split a ';'-separated author line into a list of "First Last" names.

    Entries written as "Last, First Middle" are reordered to
    "First Middle Last"; entries without a comma are kept as they are.
    When the line contains " and " (only ICCM posters do), " and " and every
    comma are first turned into ';' separators.
    """
    cleaned = line.strip()

    # Only ICCM posters use "and" in the list
    if ' and ' in cleaned:
        cleaned = cleaned.replace(' and ', ', ').replace(',', ';')

    def _first_name_first(chunk: str) -> str:
        # "Last, First Middle" -> "First Middle Last"; no comma -> unchanged
        surname, comma, given = chunk.partition(',')
        if not comma:
            return chunk
        return f"{given.strip()} {surname.strip()}"

    pieces = (piece.strip() for piece in cleaned.split(';'))
    return [_first_name_first(piece) for piece in pieces if piece]

In [6]:
def parse_labeled_authors(text: str):
    """Split "Name (1, 2), Name (3)" into a list of (name, [labels]) tuples.

    Authors are separated by commas that are not inside parentheses. For each
    author, every run of digits in the chunk becomes an integer label, and the
    parenthesised label group is removed from the returned name.
    """
    labeled = []

    # A comma followed (before any "(") by ")" is inside a label group, so it
    # does not separate two authors.
    for chunk in re.split(r',\s*(?![^(]*\))', text):
        chunk = chunk.strip()
        if chunk:
            # Every number found in the chunk counts as an affiliation label
            labels = list(map(int, re.findall(r'\d+', chunk)))

            # Drop "(1)", "(1, 2)", ... then any parentheses left empty
            bare_name = re.sub(r'\(\s*\d+(?:\s*,\s*\d+)*\s*\)', '', chunk)
            bare_name = re.sub(r'\(\s*\)', '', bare_name)

            # Collapse whitespace runs to single spaces
            bare_name = re.sub(r'\s+', ' ', bare_name).strip()

            labeled.append((bare_name, labels))

    return labeled

In [7]:
def parse_affiliation_dict(aff_text: str) -> dict[int, str]:
    """Parse numbered affiliations such as "1: Univ A; 2: Univ B" into a dict.

    Args:
        aff_text: affiliation entries separated by ';' plus one whitespace
            character, each written as "<number>: <name>".

    Returns:
        A mapping from the integer label to the text after the first ": ".

    Raises:
        ValueError: if a non-blank entry has no leading number or no ": ".
    """
    out = {}

    for aff in re.split(r';\s', aff_text):
        # Tolerate empty input and a trailing separator
        if not aff.strip():
            continue

        # Read the whole leading number, not just its first digit, so labels
        # of 10 and above do not collapse onto 1
        label = re.match(r'\s*(\d+)', aff)
        _, separator, aff_name = aff.partition(': ')
        if label is None or not separator:
            raise ValueError(f"Cannot parse affiliation entry: {aff!r}")

        out[int(label.group(1))] = aff_name

    return out

In [8]:
def make_aff_list(authors, aff_dict):
    """Resolve each author's affiliation labels into affiliation names.

    Args:
        authors: iterable of (name, [labels]) pairs.
        aff_dict: mapping from label to affiliation name; labels that are not
            in the mapping are ignored.

    Returns:
        (names, affiliations): names joined with ", ", and one affiliation
        entry per author joined with "; ". Several affiliations of the same
        author are joined with " / ".
    """
    resolved = [
        (author, " / ".join(aff_dict[label] for label in labels if label in aff_dict))
        for author, labels in authors
    ]

    new_authors = ", ".join(author for author, _ in resolved)
    new_affiliations = "; ".join(affs for _, affs in resolved)
    return new_authors, new_affiliations

In [9]:
def remove_page_break_text(text: str) -> str:
    """Remove page-number lines of the form "\\n\\n<1-3 digits> \\n\\n".

    What is left in place of the page number depends on what follows it:
    a '##' heading keeps a blank line, bold text ('**') gets a blank line and
    a new '## ' prefix, and anything else is joined directly. The result is
    stripped. Falsy input is returned unchanged.
    """
    if text:
        page_number = r'\n\n\d{1,3} \n\n'
        # Order matters: the two look-ahead cases must run before the catch-all
        substitutions = (
            (page_number + r'(?=##)', '\n\n'),
            (page_number + r'(?=\*\*)', '\n\n## '),
            (page_number, ''),
        )
        for source, replacement in substitutions:
            text = re.sub(source, replacement, text)
        text = text.strip()

    return text

In [10]:
def remove_picture_text(text: str) -> str:
    """Strip the picture placeholder markers found in the extracted Markdown.

    Three substitutions run in order:
      1. "**==> picture [NN x NN] intentionally omitted <==**" surrounded by
         blank lines is replaced by a single blank line.
      2. "**----- Start of picture text -----**<br>" followed by a line of two
         capital letters and "<br>" is removed.
      3. "**----- End of picture text -----**<br>" plus three newlines and up
         to two following '#' characters is replaced by "##".

    Falsy input is returned unchanged.
    """
    if not text:
        return text

    text = re.sub(
        r'''
        \n\n\*\*==>\spicture\s\[\d{2}\sx\s\d{2}\]\sintentionally\somitted\s<==\*\*\n\n
        ''',
        '\n\n',
        text,
        flags=re.VERBOSE
    )

    text = re.sub(
        r'''
        \*\*-----\sStart\sof\spicture\stext\s-----\*\*
        <br>\n[A-Z][A-Z]<br>
        ''',
        '',
        text,
        flags=re.VERBOSE
    )

    # The '#' characters must be escaped: under re.VERBOSE an unescaped '#'
    # starts a comment, so a bare "#?#?" is ignored and an existing "##"
    # heading marker would be left in place, giving "####".
    text = re.sub(
        r'''
        \*\*-----\sEnd\sof\spicture\stext\s-----\*\*<br>\n\n\n\#?\#?
        ''',
        '##',
        text,
        flags=re.VERBOSE
    )

    return text

In [11]:
# Session/topic headings that can appear in bold in front of a talk title
topics = [
    "Group Dynamics",
    "Optimal Experimental Design",
    "Systems and Architectures",
    "Computational Model-Based Cognitive Neuroscience",
    "Cognitive Neuromodeling",
    "Theory Development",
    "Learning",
    "Optimality in Choice",
    "Joint Modeling",
    "Psychometrics",
    "Categorization",
    "Decision Making",
    "Memory",
    "Memory Research Methods",
    "Metascience",
    "Recognition Memory",
    "Judgment",
    "Statistics",
    "Reaction Time Analysis",
    "Reaction Time Models",
    "Applied MathPsych",
    "Axiomatics and Formal Analysis",
    "ICCM Session I",
    "ICCM Session II",
    "ICCM Session III",
    "ICCM Session IV",
    "ICCM Session V",
    "CT",
    "CS",
    "ICCM Virtual poster abstracts",
    "MathPsych Virtual poster abstracts",
]


def get_title(text):
    """Return the bold title of an entry, without its asterisks.

    `text` is split on blank lines. If it begins with one of the `topics` in
    bold, the title is taken from the second piece, otherwise from the first.
    The first "**...**" span of that piece is the title.
    """
    topic_alternation = "|".join(map(re.escape, topics))
    begins_with_topic = re.match(rf"^\*\*(?:{topic_alternation})\*\*", text) is not None

    pieces = text.split('\n\n')
    title_line = pieces[1] if begins_with_topic else pieces[0]

    bold_span = re.search(r'\*\*(.*?)\*\*', title_line).group()

    # str.strip treats its argument as a set of characters, so this removes
    # every leading and trailing asterisk
    return bold_span.strip('**')

In [12]:
def clean_text(text, fix_whitespace=False):
    """Normalise extracted Markdown text.

    Steps, in order: expand ligatures, drop page-break numbers, drop picture
    placeholder markers, turn "<br>" (with any surrounding newlines) into a
    blank line, un-escape apostrophes, optionally collapse runs of three or
    more newlines into one blank line, and strip the result.

    Args:
        text: text to clean; falsy values are returned unchanged.
        fix_whitespace: when True, collapse 3+ consecutive newlines to two.
    """
    if not text:
        return text

    text = fix_ligatures(text)
    text = remove_page_break_text(text)
    text = remove_picture_text(text)

    # Get rid of breaks in between newlines
    text = re.sub(r'\n*<br>\n*', '\n\n', text)

    # Turn a backslash-escaped apostrophe into a plain one. The previous
    # literal "\'" is the same string as "'", so that replace did nothing.
    # NOTE(review): presumably a real backslash was meant; if the \' was only
    # ever seen in a repr() of the text, this line never matches and is harmless.
    text = text.replace("\\'", "'")

    if fix_whitespace:
        text = re.sub(r'\n{3,}', '\n\n', text)

    text = text.strip()

    return text

In [13]:
# Literal replacements applied in insertion order.
# "``" has to come before the "`e" / "`a" accent rules: otherwise a LaTeX-style
# opening quote followed by e or a (e.g. "``emergent") is misread as a grave
# accent and becomes "`èmergent".
LIGATURE_MAP = {
    "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬀ": "ff", "ﬅ": "ft", "ﬆ": "st",
    "``": '"',
    "Æ": "ffi", "¨u": "ü", "¨a": "ä", "´e": "é", "`e": "è", "`a": "à", "¨o": "ö",
    "˚a": "å", "c¸": "ç", '“': '"', '”': '"', "’": "'", '˜n': 'ñ', 'ˇs': 'š',
    "âĂŸ": "'", "↵": "ff", "✏": "ffl"
}

def fix_ligatures(text):
    """Expand ligatures and repair mis-extracted characters in `text`.

    First every key of LIGATURE_MAP is replaced by its value, in the map's
    order. Then any remaining character whose Unicode name is a Latin
    ligature ("LATIN ... LIGATURE XY") is replaced by the lowercased letters
    that follow "LIGATURE" in its name. All other characters are kept.
    """
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Expand any other Latin ligature from its Unicode name
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        # Restricted to Latin names: for other scripts the part of the name
        # after "LIGATURE" is a description, not letters (U+FDF2 would turn
        # into "allahisolatedform"), so those characters are left untouched.
        if name.startswith("LATIN") and " LIGATURE " in name:
            # Try to break it apart: remove spaces and lowercase
            base = name.split("LIGATURE")[-1]
            base = base.replace(" ", "").lower()
            cleaned_chars.append(base)
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

In [14]:
def rehyphenate_words(text, words_to_hyphenate):
    """Put hyphens back into words that were joined at a line break.

    `words_to_hyphenate` is an iterable of (joined, hyphenated) pairs; every
    occurrence of each joined form in `text` is replaced, in the given order.
    """
    restored = text
    for pair in words_to_hyphenate:
        joined, hyphenated = pair
        restored = restored.replace(joined, hyphenated)
    return restored

In [15]:
# Checks if valid word using Zipf frequency
def is_probably_valid(word, threshold=2.5):
    """Return True when the English Zipf frequency of `word` is above `threshold`.

    The comparison is strict (`>`). A higher threshold makes fewer words count
    as valid, so callers that flag words failing this check get more
    candidates; a lower threshold accepts more words and flags fewer.
    """
    return wordfreq.zipf_frequency(word, "en") > threshold

# Program

196 entries total (18 symposium talks, 60 concerted sessions, 60 talks, and 58 posters)

I'm not exactly sure what a concerted session is, but since this was the first virtual conference, I think it may refer to synchronous talks.

The program uses justified text again, so the words hyphenated at line breaks will have to be checked.




In [17]:
# Conference year (kept as a string): used in the PDF filename here, in the
# 'year' field of every parsed entry, and in the output CSV filename.
year = '2020'
file_path = pdfs_path + f'Smp{year}.pdf'

## Grab text from the pdf



In [18]:
# `program` is kept open: its pages are read again when looking for words
# hyphenated at line breaks.
program = pymupdf.open(file_path)   # original PDF
# Whole document converted to Markdown text; all later splitting works on this
program_text = pymupdf4llm.to_markdown(program)

In [30]:
# Preview a 2,000-character window of the Markdown; the displayed output shows
# where the talk abstracts begin.
program_text[61_000:63_000]

'**|Business meeting||\n\n\n\n39 \n\n40 \n\n## **MathPsych Virtual talk abstracts** \n\n## **Group Dynamics** \n\n## **Model-based wisdom of the crowd for sequential decisions** \n\n## **CS** \n\n_**Lee, Michael David; Coon, Jeff; Thomas, Bobby; Westfall, Holly Anne**_ \n\n## University of California, Irvine \n\nWe use cognitive models to apply the wisdom of the crowd to three sequential decision making problems: bandit problems, optimal stopping problems, and the Balloon Analogue Risk Task (BART). In each of these problems, people make a sequence of choices under uncertainty, with individual differences in decision making that depend on different attitudes toward risk. Each of the problems also has a known optimal decision-making strategy. Standard methods for the wisdom of the crowd, based on taking the modal behavior, are generally not applicable to these problems, because of their sequential nature. For example, the state-space of a bandit problem can be so large that, even for a l

## Find words to re-hyphenate

In [31]:
# Matches a word broken by a hyphen at a line break: "<left>-\n<right>"
pattern = re.compile(r'([A-Za-z]+)-\n\s*([A-Za-z]+)')
# Collected [joined, hyphenated] pairs, in the order they are printed
possible_hyphenated_words = []

# Running index printed in front of each candidate; it equals the candidate's
# position in possible_hyphenated_words
counter = 0
for p, page in enumerate(program[42:]):  # these are the pages with abstracts only
  text = fix_ligatures(page.get_text('text'))
  matches = pattern.findall(text)

  for left, right in matches:
    word = f"{left}{right}"
    hyphenated_word = f"{left}-{right}"
    # Only keep candidates whose joined form is NOT a common English word;
    # those may be real hyphenated compounds rather than line-break splits
    if not is_probably_valid(word, threshold=2.8):
      possible_hyphenated_words.append([word, hyphenated_word])

      # p counts from 0 within the slice, so add 42 to get the page index
      print(f"{counter:>3}: Page {p+42:<3} {hyphenated_word:<30} {word}")
      counter += 1

  0: Page 43  statisti-cians                 statisticians
  1: Page 44  utility-maximizing             utilitymaximizing
  2: Page 45  decision-making                decisionmaking
  3: Page 46  and-mortar                     andmortar
  4: Page 46  com-putationally               computationally
  5: Page 48  confus-ability                 confusability
  6: Page 51  likelihood-free                likelihoodfree
  7: Page 51  brain-behavior                 brainbehavior
  8: Page 53  decision-making                decisionmaking
  9: Page 61  Pavlo-vian                     Pavlovian
 10: Page 63  memory-driven                  memorydriven
 11: Page 64  non-independent                nonindependent
 12: Page 71  intercorre-lation              intercorrelation
 13: Page 72  Wisconsin-Madison              WisconsinMadison
 14: Page 72  pre-suppose                    presuppose
 15: Page 75  template-matching              templatematching
 16: Page 75  integration-to                 inte

In [32]:
# Pick indices of words to rehyphenate
# An int selects a single candidate; a (start, stop) tuple is expanded with
# range(), so stop is exclusive: (1, 3) selects candidates 1 and 2.
# The numbers refer to the index printed in front of each candidate above.
indices = [(1,3), (6,7), 10, 13, (15,20), 23, 24, 26, 27, 29, (31,33), 35, 38, (43,46), 50, (56,61), 65, 68]
words_to_hyphenate = [
    possible_hyphenated_words[i]
    for item in indices
    for i in ([item] if isinstance(item, int) else range(*item))
]

In [33]:
# Print the selected (joined, hyphenated) pairs to double-check the picks
for word, hyphenated_word in words_to_hyphenate:
  print(f'{word:<30} {hyphenated_word}')

utilitymaximizing              utility-maximizing
decisionmaking                 decision-making
likelihoodfree                 likelihood-free
memorydriven                   memory-driven
WisconsinMadison               Wisconsin-Madison
templatematching               template-matching
integrationto                  integration-to
EigenNet                       Eigen-Net
bigrams                        bi-grams
offorgetting                   of-forgetting
participantlevel               participant-level
nontarget                      non-target
driftdiffusion                 drift-diffusion
higherweighted                 higher-weighted
highvalue                      high-value
modelstimulus                  model-stimulus
nondecision                    non-decision
multiplechoice                 multiple-choice
wellknown                      well-known
PaulChristian                  Paul-Christian
datagenerating                 data-generating
pairedassociates               paired-asso

## Split up into talk entries

In [34]:
# Keep the text that follows the first "Group Dynamics" heading, i.e. start at
# the first talk title. Index [1] is the piece between the first and (if any)
# second occurrence of the delimiter.
program_abstracts_start = program_text.split('\n\n## **Group Dynamics** \n\n## ')[1]

# Rehyphenate words that need hyphen at line breaks
program_abstracts_start = rehyphenate_words(program_abstracts_start, words_to_hyphenate)

# Adds period to the end of this abstract bc it's missing one
# NOTE(review): this replaces every occurrence of the phrase, not only the one
# at the end of that abstract; confirm the phrase appears only once.
program_abstracts_start = program_abstracts_start.replace('spatio-temporal dynamics',
                                                          'spatio-temporal dynamics.')
# Gets rid of page break and pictured omitted text
program_abstracts_start = clean_text(program_abstracts_start, fix_whitespace=True)

# Splits each abstract entry
# The split consumes the period that ends each abstract (everything up to the
# next "## " heading); it is added back when the entries are parsed.
# [:-3] drops the last three pieces; presumably trailing non-abstract material
# (TODO confirm).
abstract_entries = program_abstracts_start.split('. \n\n## ')[:-3]
# Drops pieces of 300 characters or fewer; presumably fragments that are not
# full entries (TODO confirm).
abstract_entries = [entry for entry in abstract_entries if len(entry) > 300]

In [35]:
# Show the first two raw entries to check the split
abstract_entries[:2]

['**Model-based wisdom of the crowd for sequential decisions** \n\n## **CS** \n\n_**Lee, Michael David; Coon, Jeff; Thomas, Bobby; Westfall, Holly Anne**_ \n\n## University of California, Irvine \n\nWe use cognitive models to apply the wisdom of the crowd to three sequential decision making problems: bandit problems, optimal stopping problems, and the Balloon Analogue Risk Task (BART). In each of these problems, people make a sequence of choices under uncertainty, with individual differences in decision making that depend on different attitudes toward risk. Each of the problems also has a known optimal decision-making strategy. Standard methods for the wisdom of the crowd, based on taking the modal behavior, are generally not applicable to these problems, because of their sequential nature. For example, the state-space of a bandit problem can be so large that, even for a large crowd of people, there will be game states that no individual encountered, and so there is no behavior to aggr

## Sort authors, affiliations, title, and abstract

In [36]:
# One dict per abstract entry, with the keys used as CSV columns
parsed_entries = []

# `e` (the entry index) is not used in the loop body
for e, entry in enumerate(abstract_entries):
  # The easiest place to split is between authors and affiliations
  # (split once, right after the first '**_' marker, swallowing the following
  # whitespace and an optional '##')
  title_auth, affs_abstract = re.split(r'(?<=\*\*_)\s\n?\n?#?#?', entry, maxsplit=1)

  title = get_title(title_auth)

  # Authors surrounded by _**
  authors = re.search(r'_\*\*(.*?)\*\*_', title_auth).group()
  authors = parse_authors(authors.strip('_').strip('**'))
  authors = ', '. join(authors)

  # If another '**_' marker is still present, discard everything up to and
  # including it (plus following whitespace and an optional '##')
  if '**_' in affs_abstract:
    affs_abstract = re.split(r'\*\*_\s\n*#?#?', affs_abstract, maxsplit=1)[1].strip()

  # The first blank line separates the affiliation line from the abstract body
  affiliations, abstract = affs_abstract.split('\n\n', 1)
  affiliations = remove_locations(affiliations.strip())
  abstract = abstract.strip()

  # Fixes authors and affiliations if using numbered labels
  # (detected by the affiliation text starting with "1")
  if re.search(r'^1', affiliations):
    author_label_tuples = parse_labeled_authors(authors)
    aff_dict = parse_affiliation_dict(affiliations)
    authors, affiliations = make_aff_list(author_label_tuples, aff_dict)

  parsed_entries.append({
      'year': year,
      'author(s)': authors,
      'affiliation(s)': affiliations,
      'title': title,
      'type': '',   # left empty here
      # '.' restores the final period removed when the entries were split
      'abstract': clean_text(abstract) + '.'
  })

In [37]:
# Show the first two parsed entries to check the field extraction
parsed_entries[:2]

[{'year': '2020',
  'author(s)': 'Michael David Lee, Jeff Coon, Bobby Thomas, Holly Anne Westfall',
  'affiliation(s)': 'University of California, Irvine',
  'title': 'Model-based wisdom of the crowd for sequential decisions',
  'type': '',
  'abstract': 'We use cognitive models to apply the wisdom of the crowd to three sequential decision making problems: bandit problems, optimal stopping problems, and the Balloon Analogue Risk Task (BART). In each of these problems, people make a sequence of choices under uncertainty, with individual differences in decision making that depend on different attitudes toward risk. Each of the problems also has a known optimal decision-making strategy. Standard methods for the wisdom of the crowd, based on taking the modal behavior, are generally not applicable to these problems, because of their sequential nature. For example, the state-space of a bandit problem can be so large that, even for a large crowd of people, there will be game states that no in

# Create df and convert to csv

In [38]:
# One row per parsed entry; the column order is fixed explicitly
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [39]:
# Preview the first rows of the table
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2020,"Michael David Lee, Jeff Coon, Bobby Thomas, Ho...","University of California, Irvine",Model-based wisdom of the crowd for sequential...,,We use cognitive models to apply the wisdom of...
1,2020,"Alex Davis, Niles Guo, Meagan Mauter",Carnegie Mellon University; Carnegie Mellon Un...,Public policy recommendation by optimizing an ...,,A classical utilitarian perspective on public ...
2,2020,"Murray Bennett, Rachel Mullard, Scott Brown, A...",University of Newcastle,An Iterated Prospect Theory Model for the Dutc...,,Dutch auctions are used in many industries. Go...
3,2020,"Keith Ransom, Amy Perfors",University of Melbourne,The polarising effect of epistemic vigilance i...,,While seeing may be believing for reasoners wi...
4,2020,"Jay I. Myung, Mark A. Pitt",Ohio State University,An Introduction to Optimal Experimental Design,,Progress in science depends on well-designed e...


In [40]:
# Write the table to Google Drive as smp<year>_program.csv, without the row index
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)