<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2021_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
# Mount Google Drive at /content/drive so the program PDFs can be read (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Suppresses output from pip installs
%%capture

!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install wordfreq
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import regex
import pandas as pd
import unicodedata
import wordfreq
from rapidfuzz import process, fuzz

In [3]:
# Drive folder the program PDF is read from; keep the trailing slash because
# the file name is appended to this string directly.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2; some are my own code that I turned into functions.

In [4]:
# Place names that may trail an affiliation. Order matters: the regex tries the
# alternatives left to right, so longer variants ('Berlin, Germany') must come
# before the shorter names they contain ('Germany', 'Berlin').
LOCATIONS = [
    'United States of America',
    'United States',
    'Switzerland',
    'Japan',
    'Bremen',
    'Berlin, Germany',
    'Heidelberg, Germany',
    'Germany',
    'Berlin',
    'Norway',
    'Turkey',
    'Belgium',
    'Italy',
    'Israel',
    'New Brunswick, New Jersey',
    'Australia',
    'The Netherlands',
    'USA',
    'US',
    'Netherlands, The',
    'Netherlands',
    'United Kingdom',
    'Singapore',
    'France',
    'Dayton OH',
    'Dayton',
    'India',
    'Taiwan, Republic of China',
    'Austria',
    'Canada',
    'Denmark',
    'Spain',
    'Edmonton',
    'Bloomington, Indiana',
    'Indiana',
    'Russian Federation',
    'University Park, Pennsylvania',
    'California',
    'San Francisco, California',
    'Taipei, Taiwan',
    'Charlottesville, Virginia',
    'New York, New York',
    'Toronto, Ontario',
    'New Haven, Connecticut',
    'Ann Arbor, Michigan',
    'Ohio',
    'Ottawa, Ontario',
    'Houston, Texas',
    'UK',
    'New Brunswick, Piscataway, NJ',
    'Finland',
    'Iceland',
    'Mexico',
    'South Korea',
]

# Built once at definition time: a comma, optional whitespace, one of the
# escaped location names, then a word boundary (so 'India' does not match
# the start of 'Indianapolis'). Matching is case-insensitive.
LOCATION_RE = re.compile(
    r',\s*(?:{})\b'.format('|'.join(re.escape(place) for place in LOCATIONS)),
    flags=re.IGNORECASE,
)


def remove_locations(entry: str) -> str:
    """Delete every ', <location>' suffix listed in LOCATIONS and trim the result."""
    without_places = LOCATION_RE.sub('', entry)
    return without_places.strip()

In [5]:
def reorder_author(author: str) -> str:
    """
    Convert 'Last, First Middle' → 'First Middle Last'.

    Only the first comma is treated as the separator. Both halves are
    stripped of surrounding whitespace. If the input has no comma, or one
    side of the comma is empty, the non-empty part is returned on its own
    instead of raising or leaving a stray space.
    """
    last, _, given = author.partition(",")
    # Join only the non-empty parts so a missing half cannot add a space.
    parts = [part for part in (given.strip(), last.strip()) if part]
    return " ".join(parts)

In [6]:
def cut_affiliation_at_author(affil: str) -> str:
    """
    Keep affiliation text up to (but not including) the next author.

    The text is truncated at the first AUTHOR_ONLY_RE match, then leading
    and trailing spaces, semicolons, commas and hyphens are removed.

    Returns:
        The trimmed affiliation, or the placeholder string 'none' (never
        Python ``None``) when nothing is left.
    """
    next_author = AUTHOR_ONLY_RE.search(affil)
    if next_author:
        affil = affil[:next_author.start()]

    return affil.strip(" ;,-") or 'none'

In [7]:
# Building blocks for the author regex. All fragments are written for verbose
# mode (whitespace and '#' comments inside them are ignored by the engine) and
# use \p{L}, which the third-party `regex` module supports and stdlib `re`
# does not.

# A last name: either 'van'/'de' followed by one (optionally hyphenated) word
# and at most one further word, or a single (optionally hyphenated) word.
LAST_NAME = r"""
(?:
    (?:van|de)\s+\p{L}+(?:-\p{L}+)*(?:\s+\p{L}+)?  # van/de X or de la X
    |
    \p{L}+(?:-\p{L}+)*                             # single-word last name
)
"""

# One or more given-name tokens; each is a word (optionally hyphenated) or a
# single letter followed by a period. The negative lookahead stops the run
# before a token that is itself a last name followed by a comma, i.e. the next
# author. Braces are doubled because this is an f-string.
GIVEN_NAMES = rf"""
(?:\p{{L}}+(?:-\p{{L}}+)*|\p{{L}}\.)               # first given token
(?:
    \s+
    (?!{LAST_NAME}\s*,)                            # STOP before next author
    (?:\p{{L}}+(?:-\p{{L}}+)*|\p{{L}}\.)
)*
"""

# Optional parenthesised text directly after the given names.
ALT_NAME = r"""
(?:\s*\([^)]*\))?                                  # optional alt name
"""

# Full 'Last, Given ... (Alt)' pattern, captured as the named group 'author'.
AUTHOR_CORE = rf"""
(?P<author>
    {LAST_NAME}
    ,\s*
    {GIVEN_NAMES}
    {ALT_NAME}
)
"""

# Compiled matcher. IGNORECASE means the 'van'/'de' prefixes also match when
# capitalised.
AUTHOR_ONLY_RE = regex.compile(
    rf"""
    {AUTHOR_CORE}
    """,
    regex.VERBOSE | regex.IGNORECASE,
)

def parse_authors_affiliations(text: str):
    """
    Split an 'Author – Affiliation Author – Affiliation ...' byline.

    The text is cut on en dashes. Every 'Last, Given' match found in a piece
    is an author. Only the final author of a piece is paired with an
    affiliation, taken from the start of the following piece up to the next
    author. Every other author receives the placeholder string 'none'.
    An author name that has already been recorded is skipped.

    Returns:
        authors: list[str]       # "First Last"
        affiliations: list[str]  # aligned with authors; 'none' when missing
    """
    authors, affiliations = [], []

    # The en dash is the only separator between a name and its affiliation.
    pieces = [piece.strip() for piece in text.split("–")]
    final_piece = len(pieces) - 1

    for idx, piece in enumerate(pieces):
        raw_names = [hit.group("author") for hit in AUTHOR_ONLY_RE.finditer(piece)]

        for pos, raw_name in enumerate(raw_names):
            name = reorder_author(raw_name)

            closes_piece = pos == len(raw_names) - 1
            if closes_piece and idx < final_piece:
                affil = cut_affiliation_at_author(pieces[idx + 1].strip())
            else:
                affil = 'none'

            if name in authors:
                continue
            authors.append(name)
            affiliations.append(affil)

    return authors, affiliations

In [8]:
def remove_page_break_text(text: str) -> str:
    """
    Drop stand-alone page numbers left behind by page breaks.

    A run of one to three digits followed by a space, sitting between two
    blank-line breaks, is collapsed to a single blank-line break. The result
    is stripped. Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if text:
        without_page_numbers = re.sub(r'\n\n\d{1,3} \n\n', '\n\n', text)
        return without_page_numbers.strip()
    return text

In [9]:
def remove_picture_text(text: str) -> str:
    """
    Remove the picture markers left in the markdown extracted from the PDF.

    Three kinds of marker are handled:
      * the '**==> picture [NN x NN] intentionally omitted <==**' placeholder
        between blank lines, which is collapsed to one blank-line break;
      * the '**----- Start of picture text -----**' header together with the
        two-capital-letter line after it, which is deleted;
      * the '**----- End of picture text -----**' footer, its three newlines
        and up to two '#' characters after them, which are replaced by a
        single '##' heading marker.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if not text:
        return text

    text = re.sub(
        r'''
        \n\n\*\*==>\spicture\s\[\d{2}\sx\s\d{2}\]\sintentionally\somitted\s<==\*\*\n\n
        ''',
        '\n\n',
        text,
        flags=re.VERBOSE
    )

    text = re.sub(
        r'''
        \*\*-----\sStart\sof\spicture\stext\s-----\*\*
        <br>\n[A-Z][A-Z]<br>
        ''',
        '',
        text,
        flags=re.VERBOSE
    )

    # In VERBOSE mode a bare '#' starts a comment, so the optional heading
    # hashes must be escaped. Unescaped, they were ignored and an existing
    # '##' ended up doubled to '####'.
    text = re.sub(
        r'''
        \*\*-----\sEnd\sof\spicture\stext\s-----\*\*<br>\n\n\n\#?\#?
        ''',
        '##',
        text,
        flags=re.VERBOSE
    )

    return text

In [10]:
# Two-letter presentation-type codes that appear in bold in the program
type_labels = ['MT', 'FT', 'PO', 'PA']


def remove_label(text: str) -> str:
    """
    Delete every bolded type code (e.g. ``**PO**``) found in ``text``.

    Only codes listed in ``type_labels`` are removed. Surrounding whitespace
    is left in place. Falsy input is returned unchanged.
    """
    if text:
        alternation = "|".join(type_labels)
        return re.sub(rf"\*\*(?:{alternation})\*\*", '', text)
    return text

In [11]:
def clean_text(text, fix_whitespace=False, delete_whitespace=False):
    """
    Run the full clean-up pipeline on a piece of extracted markdown.

    Steps, in order: ligature repair, page-number removal, picture-marker
    removal, type-label removal, re-joining of titles split across a heading
    break, and replacement of table-pipe artifacts with a space.

    Args:
        text: Markdown text. Falsy values are returned unchanged.
        fix_whitespace: If true, each newline followed by any whitespace run
            becomes exactly one blank-line break.
        delete_whitespace: If true, every run of two or more whitespace
            characters becomes a single space.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    if not text:
        return text

    for step in (fix_ligatures, remove_page_break_text, remove_picture_text, remove_label):
        text = step(text)

    # A title broken across two bold heading lines is stitched back together.
    text = re.sub(r'\*\*\s\n\n##\s\*\*', ' ', text)

    # Table-pipe artifacts; the longer one is replaced first.
    for table_artifact in ('||\n|---|---|\n|', '||\n|'):
        text = text.replace(table_artifact, ' ')

    if fix_whitespace:
        text = re.sub(r'\n\s+', '\n\n', text)

    if delete_whitespace:
        text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [12]:
# Explicit replacements applied in insertion order: typographic ligatures,
# detached accent + letter pairs, curly quotes, and a few mis-decoded glyphs.
LIGATURE_MAP = {
    "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬀ": "ff", "ﬅ": "ft", "ﬆ": "st",
    "Æ": "ffi", "¨u": "ü", "¨a": "ä", "´e": "é", "`e": "è", "`a": "à", "¨o": "ö",
    "˚a": "å", "c¸": "ç", '“': '"', '”': '"', "’": "'", '˜n': 'ñ', 'ˇs': 'š',
    "âĂŸ": "'", "``": '"', "↵": "ff", "✏": "ffl", "‘": "'"
}


def fix_ligatures(text):
    """
    Replace ligatures and similar extraction artifacts with plain letters.

    Every key of LIGATURE_MAP is first replaced by its value. Then any
    remaining character whose Unicode name contains 'LIGATURE' is replaced by
    the part of that name after the word 'LIGATURE', with spaces removed and
    lowercased (for example, 'LATIN SMALL LIGATURE OE' becomes 'oe').
    Characters without a Unicode name are left as they are.
    """
    for ligature, replacement in LIGATURE_MAP.items():
        text = text.replace(ligature, replacement)

    def _expand(ch):
        uni_name = unicodedata.name(ch, "")
        if "LIGATURE" not in uni_name.upper():
            return ch
        return uni_name.split("LIGATURE")[-1].replace(" ", "").lower()

    return "".join(_expand(ch) for ch in text)

In [13]:
def rehyphenate_words(text, words_to_hyphenate):
    """
    Put hyphens back into words that were joined at a line break.

    Args:
        text: Text to fix.
        words_to_hyphenate: Iterable of (joined, hyphenated) pairs. Every
            occurrence of ``joined`` in ``text`` is replaced by
            ``hyphenated``, pair by pair in the order given.

    Returns:
        The text with all replacements applied.
    """
    for pair in words_to_hyphenate:
        joined, hyphenated = pair
        text = text.replace(joined, hyphenated)
    return text

In [14]:
def is_probably_valid(word, threshold=2.5):
    """
    Decide whether ``word`` is common enough to count as a real English word.

    Returns True when the word's English Zipf frequency from ``wordfreq`` is
    strictly greater than ``threshold``. Raising the threshold makes fewer
    words count as valid; lowering it makes more words count as valid.
    """
    zipf_score = wordfreq.zipf_frequency(word, "en")
    return zipf_score > threshold

# Program

204 entries total (4 keynote talks, plus symposium talks, talks, fast talks, and posters; the per-category counts still need to be filled in).

The program uses justified text, so words hyphenated at line breaks need to be checked. Affiliations are listed after each author's name, separated by an en dash. Some authors have no affiliation listed.

This one has a keynote by Prof. Allen :)




In [15]:
# Conference year; used in the PDF file name, the CSV file name and every record
year = '2021'
# pdfs_path already ends with a slash, so the file name is appended directly
file_path = f'{pdfs_path}smp{year}_program.pdf'

## Grab text from the pdf



In [16]:
# Open the PDF once; `program` is reused later for page-level plain-text extraction
program = pymupdf.open(file_path)   # original PDF
# Whole document converted to one markdown string
program_text = pymupdf4llm.to_markdown(program)

In [27]:
# Spot-check the markdown around the start of the "Abstracts" section
program_text[73_600:75_000]

'AM EDT|||\n|**July 9**<br>12:00PM EDT|SE|Friday mixer|\n\n\n\n49 \n\n**Abstracts** \n\n_MT: MathPsych Talk, FT: MathPsych Fast talk, PO: ICCM Poster, PA: ICCM Paper_ Abstracts are printed in alphabetical order by title. \n\n## **A predictive processing implementation of the common model of cognition** \n\n**PO** \n\nKelly, Alex – Carleton University Ororbia, Alex \n\nSession: _ICCM: Poster session_ – live on Thursday, July 8, at 10:00AM EDT \n\nWe present how a cognitive architecture can be built from the neural circuit models proposed under the frameworks of holographic memory and neural generative coding. Specifically, we draw inspiration from well-known cognitive architectures such as ACT-R, Soar, Leabra, and Nengo, as well as the common model of cognition, to propose the kernel that might drive a complex, modular system that would prove useful for developing intelligent agents that tackle statistical learning tasks, as well as for answering questions and testing hypotheses in cogn

## Find words to re-hyphenate

In [28]:
# A word split by a hyphen at the end of a line: letters, '-', newline,
# optional indentation, letters.
pattern = re.compile(r'([A-Za-z]+)-\n\s*([A-Za-z]+)')
possible_hyphenated_words = []

counter = 0
# Pages from index 50 onward contain only abstracts. Page numbers are printed
# 1-based, so the first scanned page is reported as page 51.
for page_number, page in enumerate(program[50:], start=51):
    page_text = fix_ligatures(page.get_text('text'))

    for hit in pattern.finditer(page_text):
        left, right = hit.groups()
        word = left + right
        hyphenated_word = left + '-' + right

        # A joined form that is a common word means the hyphen was only a
        # line-break artifact, so nothing needs reviewing.
        if is_probably_valid(word, threshold=2.8):
            continue

        possible_hyphenated_words.append([word, hyphenated_word])
        print(f"{counter:>3}: Page {page_number:<3} {hyphenated_word:<30} {word}")
        counter += 1

  0: Page 51  physiologi-cally               physiologically
  1: Page 51  Schachter-Singer               SchachterSinger
  2: Page 52  dis-cards                      discards
  3: Page 55  semi-Markov                    semiMarkov
  4: Page 58  laten-cies                     latencies
  5: Page 60  nonnega-tivity                 nonnegativity
  6: Page 69  informationally-equivalent     informationallyequivalent
  7: Page 71  co-occurrence                  cooccurrence
  8: Page 78  non-decision                   nondecision
  9: Page 80  frequen-tist                   frequentist
 10: Page 86  computa-tions                  computations
 11: Page 94  sampling-based                 samplingbased
 12: Page 94  mechanis-tic                   mechanistic
 13: Page 96  mus-cimol                      muscimol
 14: Page 100 intertempo-ral                 intertemporal
 15: Page 105 noncon-textuality              noncontextuality
 16: Page 107 neurocompu-tational            neurocomputationa

In [29]:
# Pick indices of words to rehyphenate
indices = [1, 3, (6,8), 11, 17, 18, (21,23), 28, 30, 34]
words_to_hyphenate = [
    possible_hyphenated_words[i]
    for item in indices
    for i in ([item] if isinstance(item, int) else range(*item))
]

In [30]:
# Review the chosen pairs: joined form on the left, hyphenated form on the right
for joined, hyphenated in words_to_hyphenate:
    print(f'{joined:<30} {hyphenated}')

SchachterSinger                Schachter-Singer
semiMarkov                     semi-Markov
informationallyequivalent      informationally-equivalent
cooccurrence                   co-occurrence
samplingbased                  sampling-based
synchronizationcontinuation    synchronization-continuation
highdimensional                high-dimensional
modelinferred                  model-inferred
withincategory                 within-category
triallevel                     trial-level
nonsymbolic                    non-symbolic
thebest                        the-best


## Split up into talk entries

In [31]:
# The abstracts begin right after the intro sentence that ends in "title."
program_abstracts_start = program_text.split('title. \n\n## ')[1]

# Restore the real hyphens first, then remove page numbers, picture markers
# and type labels. fix_whitespace normalises newline runs to blank-line breaks.
program_abstracts_start = clean_text(
    rehyphenate_words(program_abstracts_start, words_to_hyphenate),
    fix_whitespace=True,
)

# Each entry starts at a "## **Title**" heading. The last four pieces are
# dropped (presumably trailing non-abstract sections; TODO confirm).
abstract_entries = re.split(r'\s\n\n##\s(?=\*\*)', program_abstracts_start)
abstract_entries = abstract_entries[:-4]

In [32]:
# Preview the first two split entries
abstract_entries[:2]

['**A predictive processing implementation of the common model of cognition** \n\nKelly, Alex – Carleton University Ororbia, Alex \n\nSession: _ICCM: Poster session_ – live on Thursday, July 8, at 10:00AM EDT \n\nWe present how a cognitive architecture can be built from the neural circuit models proposed under the frameworks of holographic memory and neural generative coding. Specifically, we draw inspiration from well-known cognitive architectures such as ACT-R, Soar, Leabra, and Nengo, as well as the common model of cognition, to propose the kernel that might drive a complex, modular system that would prove useful for developing intelligent agents that tackle statistical learning tasks, as well as for answering questions and testing hypotheses in cognitive science and computational neuroscience.',
 '**A Bayesian account of two-factor theory of emotion process** \n\nYing, Lance – University of Michigan - Ann Arbor Zhang, Jun – University of Michigan  \n\nSession: _Fast Talk session_ –

## Sort authors, affiliations, title, and abstract

In [33]:
parsed_entries = []

for entry in abstract_entries:
    # The session line ends in "EDT"; everything after it is the abstract body.
    header, body = re.split(r'EDT(?: |\|\|)\n\n', entry, maxsplit=1)

    # The title is the first bolded span in the header.
    title_match = re.search(r'\*\*(.*?)\*\*', header)
    title = title_match.group().strip('**')

    # The byline is the last paragraph before the "Session:" marker.
    before_session = re.split(r'(?:\s|\|\|\n\||\n\n##|\n\n-\s)Session:', header)[0].strip()
    byline = re.split(r'\n\n', before_session)[-1]

    authors, affiliations = parse_authors_affiliations(remove_locations(byline))

    record = {
        'year': year,
        'author(s)': ', '.join(authors),
        'affiliation(s)': '; '.join(affiliations),
        'title': title,
        'type': '',
        'abstract': clean_text(body, delete_whitespace=True),
    }
    parsed_entries.append(record)

In [34]:
# Preview the first two parsed records
parsed_entries[:2]

[{'year': '2021',
  'author(s)': 'Alex Kelly, Alex Ororbia',
  'affiliation(s)': 'Carleton University; none',
  'title': 'A predictive processing implementation of the common model of cognition',
  'type': '',
  'abstract': 'We present how a cognitive architecture can be built from the neural circuit models proposed under the frameworks of holographic memory and neural generative coding. Specifically, we draw inspiration from well-known cognitive architectures such as ACT-R, Soar, Leabra, and Nengo, as well as the common model of cognition, to propose the kernel that might drive a complex, modular system that would prove useful for developing intelligent agents that tackle statistical learning tasks, as well as for answering questions and testing hypotheses in cognitive science and computational neuroscience.'},
 {'year': '2021',
  'author(s)': 'Lance Ying, Jun Zhang',
  'affiliation(s)': 'University of Michigan - Ann Arbor; University of Michigan',
  'title': 'A Bayesian account of tw

# Create df and convert to csv

In [35]:
# One row per parsed entry; `columns` fixes the column order of the table
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [36]:
# Check the first rows for mis-parsed authors or affiliations
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2021,"Alex Kelly, Alex Ororbia",Carleton University; none,A predictive processing implementation of the ...,,We present how a cognitive architecture can be...
1,2021,"Lance Ying, Jun Zhang",University of Michigan - Ann Arbor; University...,A Bayesian account of two-factor theory of emo...,,Bayesian inference has been used in the past t...
2,2021,"Jeff Coon, Irvine California, Michael Lee",University of; none; University of,A Bayesian method for measuring risk propensit...,,The Balloon Analogue Risk Task (BART) is widel...
3,2021,"Sabina J Sloman, Robert Goldstone, Cleotilde (...",Carnegie Mellon University; none; Carnegie Mel...,A cognitive computational model of collective ...,,Many of the decisions we make in day-to-day li...
4,2021,"James Yearsley, University of London City",none; none,A computational model of the IEDS task helps s...,,The Intra-Extra-dimensional set shift task (IE...


In [37]:
# Save the table to Drive; index=False leaves the row index out of the file
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)