<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2022_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
# Mount Google Drive at /content/drive; the PDF folder read below and the CSV
# written at the end of the notebook both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Suppresses output from pip installs
%%capture

!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install wordfreq
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import regex
import pandas as pd
import unicodedata
import wordfreq
from rapidfuzz import process, fuzz

In [3]:
# Drive folder holding the conference program PDFs. The trailing slash matters:
# file names are appended to this string by plain concatenation.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2; some are my own code that I turned into functions.

In [4]:
# Place names (countries, states, cities) that may trail an affiliation after a comma.
LOCATIONS = [
    'United States of America', 'United States', 'Switzerland', 'Japan', 'Bremen',
    'Berlin, Germany', 'Heidelberg, Germany', 'Germany', 'Berlin', 'Norway',
    'Turkey', 'Belgium', 'Italy', 'Israel', 'New Brunswick, New Jersey',
    'Australia', 'The Netherlands', 'USA', 'US', 'Netherlands, The', 'Netherlands',
    'United Kingdom', 'Singapore', 'France', 'Dayton OH', 'Dayton', 'India',
    'Taiwan, Republic of China', 'Austria', 'Canada', 'Denmark', 'Spain',
    'Edmonton', 'Bloomington, Indiana', 'Indiana', 'Russian Federation',
    'University Park, Pennsylvania', 'California', 'San Francisco, California',
    'Taipei, Taiwan', 'Charlottesville, Virginia', 'New York, New York',
    'Toronto, Ontario', 'New Haven, Connecticut', 'Ann Arbor, Michigan', 'Ohio',
    'Ottawa, Ontario', 'Houston, Texas', 'UK', 'New Brunswick, Piscataway, NJ',
    'Finland', 'Iceland', 'Mexico', 'South Korea'
]

# Compile once.
# * Alternatives are sorted longest-first so a short name ('United States') can
#   never shadow a longer one that starts with it ('United States of America'),
#   whatever the order of LOCATIONS.
# * The negative lookahead refuses a match when the place name is followed by
#   whitespace and a letter, i.e. when it is only the first word of a longer
#   phrase such as ", Indiana University" or ", California Institute of ...".
LOCATION_RE = re.compile(
    r',\s*(?:'
    + '|'.join(map(re.escape, sorted(LOCATIONS, key=len, reverse=True)))
    + r')\b(?!\s+[^\W\d_])',
    re.I
)

def remove_locations(entry: str) -> str:
    """
    Strip every ', <location>' suffix listed in LOCATIONS from an affiliation.

    Matching is case-insensitive and requires the leading comma. A location is
    left in place when it is directly followed by another word, because then it
    is part of a name rather than a trailing place.

    Returns the remaining text with surrounding whitespace removed.
    """
    return LOCATION_RE.sub('', entry).strip()

In [5]:
def reorder_author(author: str) -> str:
    """
    Convert 'Last, First Middle' → 'First Middle Last'.

    Only the first comma is treated as the separator. A name that contains no
    comma is returned with surrounding whitespace removed (instead of raising
    ValueError), and an empty family or given part leaves no stray space.
    """
    last, separator, given = author.partition(",")
    if not separator:
        # Nothing to reorder: the name is already in display order (or is a single token)
        return author.strip()
    return f"{given.strip()} {last.strip()}".strip()

In [8]:
def remove_page_break_text(text: str) -> str:
    """
    Drop stand-alone page-number lines left behind by page breaks.

    A run of 1-3 digits followed by a space, sitting between two blank lines,
    is collapsed into a single blank line. The result is stripped of
    surrounding whitespace. Falsy input (None, '') is returned unchanged.
    """
    if text:
        without_page_numbers = re.sub(r'\n\n\d{1,3} \n\n', '\n\n', text)
        return without_page_numbers.strip()

    return text

In [9]:
def remove_picture_text(text: str) -> str:
    """
    Remove the picture placeholders found in the extracted Markdown.

    Three markers are handled:
    * '**==> picture [W x H] intentionally omitted <==**' between blank lines
      is collapsed to a single blank line (W and H may have any number of
      digits);
    * '**----- Start of picture text -----**<br>' is deleted;
    * '**----- End of picture text -----**<br>' is replaced with '##'.

    Falsy input (None, '') is returned unchanged.
    """
    if not text:
        return text

    # \d+ rather than a fixed digit count: picture dimensions are not limited to two digits
    text = re.sub(
        r'\n\n\*\*==>\spicture\s\[\d+\sx\s\d+\]\sintentionally\somitted\s<==\*\*\n\n',
        '\n\n',
        text,
    )

    text = re.sub(r'\*\*-----\sStart\sof\spicture\stext\s-----\*\*<br>', '', text)

    # The end marker becomes '##', the same token that introduces headings in this Markdown
    text = re.sub(r'\*\*-----\sEnd\sof\spicture\stext\s-----\*\*<br>', '##', text)

    return text

In [103]:
def clean_text(text, fix_whitespace=False, delete_whitespace=False):
    """
    Normalise a chunk of extracted program text.

    Steps, in order: expand ligatures (fix_ligatures), drop page-number lines
    (remove_page_break_text), optionally rewrite whitespace, then strip.

    fix_whitespace:    every newline followed by whitespace becomes one blank line.
    delete_whitespace: every run of two or more whitespace characters becomes a
                       single space.

    Falsy input (None, '') is returned unchanged.
    """
    if text:
        cleaned = remove_page_break_text(fix_ligatures(text))

        # remove_picture_text is deliberately not applied: there is only one entry
        # with picture-omitted text, and its format is a little different.
        # cleaned = remove_picture_text(cleaned)

        # Disabled: gets rid of breaks in titles
        # cleaned = re.sub(r'\*\*\s\n\n##\s\*\*', ' ', cleaned)

        if fix_whitespace:
            cleaned = re.sub(r'\n\s+', '\n\n', cleaned)

        if delete_whitespace:
            cleaned = re.sub(r'\s{2,}', ' ', cleaned)

        return cleaned.strip()

    return text

In [104]:
# Literal replacements applied first, in insertion order: typographic ligatures,
# detached accents left over from PDF extraction, curly quotes, and a few
# mis-decoded sequences seen in the source PDFs.
LIGATURE_MAP = {
    "ﬁ": "fi", "ﬂ": "fl", "ﬃ": "ffi", "ﬄ": "ffl", "ﬀ": "ff", "ﬅ": "ft", "ﬆ": "st",
    "Æ": "ffi", "¨u": "ü", "¨a": "ä", "´e": "é", "`e": "è", "`a": "à", "¨o": "ö",
    "˚a": "å", "c¸": "ç", '“': '"', '”': '"', "’": "'", '˜n': 'ñ', 'ˇs': 'š',
    "âĂŸ": "'", "``": '"', "↵": "ff", "✏": "ffl", "‘": "'"
}

def fix_ligatures(text):
    """
    Replace ligatures and extraction artefacts with plain characters.

    1. Every key of LIGATURE_MAP is replaced by its value.
    2. Any remaining character whose Unicode name is 'LATIN ... LIGATURE XY' is
       expanded to the letters named after 'LIGATURE': lowercase for SMALL
       ligatures, uppercase for CAPITAL ones.

    Characters without a Unicode name (e.g. private-use code points) and
    non-Latin or combining characters that merely have 'LIGATURE' in their name
    are kept unchanged, since the tail of their name is not a letter sequence.
    """
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        if name.startswith("LATIN") and " LIGATURE " in name:
            # Unicode names are upper-case, e.g. 'LATIN SMALL LIGATURE OE' -> 'OE'
            letters = name.split(" LIGATURE ", 1)[1].replace(" ", "")
            cleaned_chars.append(letters if " CAPITAL " in name else letters.lower())
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

In [105]:
def rehyphenate_words(text, words_to_hyphenate):
    """
    Put back hyphens that were lost when words were joined across line breaks.

    words_to_hyphenate is an iterable of (joined, hyphenated) pairs. Each
    occurrence of `joined` in `text` (plain substring match) is replaced by
    `hyphenated`, pair by pair in the given order. Returns the updated text.
    """
    result = text
    for joined, hyphenated in words_to_hyphenate:
        result = result.replace(joined, hyphenated)
    return result

In [106]:
def is_probably_valid(word, threshold=2.5):
    """
    Return True when `word` looks like a real English word.

    The check compares the word's English Zipf frequency (from wordfreq) with
    `threshold`, using a strict 'greater than'. Raising the threshold makes the
    check stricter (fewer strings count as valid); lowering it makes it more
    lenient.
    """
    zipf_score = wordfreq.zipf_frequency(word, "en")
    return zipf_score > threshold

# Program

93 entries total (47 talks, 16 fast talks, 9 extended abstracts, and 21 full paper abstracts)

The program uses justified text, so words hyphenated at line breaks need to be checked. Affiliations are listed after the authors' names, following an en dash. Some authors have no affiliation listed.




In [27]:
# Conference year (kept as a string: it is interpolated into file names and
# stored as-is in the 'year' column) and the location of that year's program PDF.
year = '2022'
file_path = f'{pdfs_path}smp{year}_program.pdf'

## Grab text from the pdf



In [16]:
# Open the program PDF. `program` is kept around because the hyphenation check
# further down reads the plain text of individual pages from it.
program = pymupdf.open(file_path)   # original PDF
# Whole document converted to Markdown; this string is what gets split into abstract entries.
program_text = pymupdf4llm.to_markdown(program)

In [91]:
# Spot-check the Markdown around the start of the '‘Talk’ abstracts' section
# (character offsets found by inspection).
program_text[44_200:46_000]

'*‘Talk’ abstracts** \n\n## **Hybrid-similarity exemplar model for predicting individual-item recognition in a high-dimensional category domain** \n\nParticipants learned to classify a set of rock images into geologically-defined science categories. We then investigated the nature of their category-based memory representations by collecting old-new recognition data in a subsequent transfer phase. An exemplar model provided better qualitative accounts of the old-new recognition data than did a prototype or clustering model. However, to account for the variability in recognition probabilities among the old training items themselves, a hybrid-similarity exemplar model was needed that took account of distinctive features present in the items. The study is among the first to use computational models for making detailed quantitative predictions of old-new recognition probabilities for individual items embedded in complex, high-dimensional similarity spaces. \n\n**Nosofsky, Robert M.** _India

## Find words to re-hyphenate

In [92]:
# A word split by a hyphen at the end of a line: letters, '-', newline, optional
# whitespace, letters.
pattern = re.compile(r'([A-Za-z]+)-\n\s*([A-Za-z]+)')
possible_hyphenated_words = []

# program[28:] are the pages with abstracts only; numbering from 29 turns the
# 0-based offset into the page number shown in the listing.
for page_number, page in enumerate(program[28:], start=29):
    page_text = fix_ligatures(page.get_text('text'))

    for match in pattern.finditer(page_text):
        left, right = match.groups()
        word = left + right
        if is_probably_valid(word, threshold=2.8):
            # Joining the halves gives a common word, so the hyphen was only a line break
            continue

        hyphenated_word = f"{left}-{right}"
        # The running index is the position this pair is about to take in the list
        print(f"{len(possible_hyphenated_words):>3}: Page {page_number:<3} {hyphenated_word:<30} {word}")
        possible_hyphenated_words.append([word, hyphenated_word])

  0: Page 29  exem-plar                      exemplar
  1: Page 29  base-rate                      baserate
  2: Page 29  multidimen-sional              multidimensional
  3: Page 32  di-mensionality                dimensionality
  4: Page 32  jumping-to                     jumpingto
  5: Page 33  au-tomates                     automates
  6: Page 33  conceptual-izations            conceptualizations
  7: Page 33  topic-neutral                  topicneutral
  8: Page 34  sub-tractive                   subtractive
  9: Page 34  an-thropometrics               anthropometrics
 10: Page 35  aggre-gating                   aggregating
 11: Page 36  Electroencephalog-raphy        Electroencephalography
 12: Page 37  distri-butional                distributional
 13: Page 37  elec-troencephalography        electroencephalography
 14: Page 37  gen-erative                    generative
 15: Page 37  elec-tromyographical           electromyographical
 16: Page 37  between-condition              b

In [93]:
# Hand-picked positions in possible_hyphenated_words whose hyphen is genuine.
# An int selects one entry; a (start, stop) tuple selects range(start, stop).
indices = [1, 4, 7, 16, 17, (25, 28), 32, 34, 35, 42, 47]

words_to_hyphenate = []
for item in indices:
    selected = [item] if isinstance(item, int) else range(*item)
    words_to_hyphenate.extend(possible_hyphenated_words[i] for i in selected)

In [94]:
# List the chosen pairs (joined form, then hyphenated form) to verify the selection
for joined, hyphenated in words_to_hyphenate:
    print(f'{joined:<30} {hyphenated}')

baserate                       base-rate
jumpingto                      jumping-to
topicneutral                   topic-neutral
betweencondition               between-condition
valuebased                     value-based
multiattribute                 multi-attribute
ValenceWeighted                Valence-Weighted
largerlater                    larger-later
bestperforming                 best-performing
mcstan                         mc-stan
finegrained                    fine-grained
decisionmaking                 decision-making
jumpingto                      jumping-to


## Split up into talk entries

In [95]:
# Keep the chunk that follows the first occurrence of the '‘Talk’ abstracts' heading marker
program_abstracts_start = program_text.split('‘Talk’ abstracts** \n\n## ')[1]

# Rehyphenate words that need hyphen at line breaks
program_abstracts_start = rehyphenate_words(program_abstracts_start, words_to_hyphenate)

# Fixes ligatures, removes page-number lines, and turns every newline followed by
# whitespace into a single blank line (picture-text removal is disabled in clean_text)
program_abstracts_start = clean_text(program_abstracts_start, fix_whitespace=True)

# Splits each abstract entry: the boundary is 'EDT' followed by a '## ' heading that
# starts with bold text. [:-1] drops whatever follows the last boundary.
abstract_entries = re.split(r'EDT\s*##\s(?=\*\*)', program_abstracts_start)[:-1]

In [96]:
# Some chunks still hold several entries. A new entry starts at a '## ' heading
# whose bold text contains no comma (bold text with a comma is an author name).
# Splitting a chunk without such a heading yields the chunk itself, so every
# chunk can go through the same split.
multi_entry_boundary = re.compile(r"\s\n\s*##\s+(?=\*\*[^,\n]+\*\*)")

# Pieces of 100 characters or fewer are dropped: this filters out a heading for
# extended abstracts.
final_abstract_entries = [
    piece
    for entry in abstract_entries
    for piece in multi_entry_boundary.split(entry)
    if len(piece) > 100
]

In [97]:
# Preview the first two entries to check the split
final_abstract_entries[:2]

['**Hybrid-similarity exemplar model for predicting individual-item recognition in a high-dimensional category domain** \n\nParticipants learned to classify a set of rock images into geologically-defined science categories. We then investigated the nature of their category-based memory representations by collecting old-new recognition data in a subsequent transfer phase. An exemplar model provided better qualitative accounts of the old-new recognition data than did a prototype or clustering model. However, to account for the variability in recognition probabilities among the old training items themselves, a hybrid-similarity exemplar model was needed that took account of distinctive features present in the items. The study is among the first to use computational models for making detailed quantitative predictions of old-new recognition probabilities for individual items embedded in complex, high-dimensional similarity spaces. \n\n**Nosofsky, Robert M.** _Indiana University_ \n\n**Meagh

## Sort authors, affiliations, title, and abstract

In [98]:
# Patterns shared by all entries, compiled once
title_pattern = re.compile(r'\*\*(.*?)\*\*')                               # bold text
session_pattern = re.compile(r'\n\nSession:\s*(.*?)\s*\d{2}:\d{2}\s(EDT)?\s?')
author_pattern = re.compile(r'\*\*([^,\n*]+,[^,\n*]+)\*\*')                # bold 'Last, First'
affiliation_pattern = re.compile(r'_(.*?)_')                               # italic text

parsed_entries = []

for entry in final_abstract_entries:
    # The first paragraph holds the title; everything after the first blank line is the rest
    title_block, rest_of_entry = entry.split('\n\n', 1)

    # Title is the only bolded text in that paragraph; drop the surrounding asterisks
    title = title_pattern.search(title_block).group().strip('*')

    # Remove the trailing 'Session: ... HH:MM EDT' line
    no_session = session_pattern.sub('', rest_of_entry)

    authors = [reorder_author(name) for name in author_pattern.findall(no_session)]

    affiliations = [
        remove_locations(affiliation)
        for affiliation in affiliation_pattern.findall(no_session)
    ]
    # When every author shares one affiliation, list it once; otherwise keep them all in order
    final_affiliations = (
        affiliations[0] if len(set(affiliations)) == 1 else '; '.join(affiliations)
    )

    # What remains once author names and italic spans are removed is the abstract
    abstract = affiliation_pattern.sub('', author_pattern.sub('', no_session))

    parsed_entries.append({
        'year': year,
        'author(s)': ', '.join(authors),
        'affiliation(s)': final_affiliations,
        'title': title,
        'type': '',
        'abstract': clean_text(abstract, delete_whitespace=True)
    })

In [99]:
# Preview the first two parsed records
parsed_entries[:2]

[{'year': '2022',
  'author(s)': 'Robert M. Nosofsky, Brian Meagher',
  'affiliation(s)': 'Indiana University; Indiana University Bloomington',
  'title': 'Hybrid-similarity exemplar model for predicting individual-item recognition in a high-dimensional category domain',
  'type': '',
  'abstract': 'Participants learned to classify a set of rock images into geologically-defined science categories. We then investigated the nature of their category-based memory representations by collecting old-new recognition data in a subsequent transfer phase. An exemplar model provided better qualitative accounts of the old-new recognition data than did a prototype or clustering model. However, to account for the variability in recognition probabilities among the old training items themselves, a hybrid-similarity exemplar model was needed that took account of distinctive features present in the items. The study is among the first to use computational models for making detailed quantitative prediction

# Create df and convert to csv

In [100]:
# Column order of the exported table (matches the keys of each parsed record)
columns = ["year", "author(s)", "affiliation(s)", "title", "type", "abstract"]
df = pd.DataFrame(parsed_entries, columns=columns)

In [101]:
# Preview the first rows of the table
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2022,"Robert M. Nosofsky, Brian Meagher",Indiana University; Indiana University Bloomin...,Hybrid-similarity exemplar model for predictin...,,Participants learned to classify a set of rock...
1,2022,"Anne Voormann, Mikhail Spektor, Christoph Klauer",University of Warwick; University of Freiburg,A validation study of paired-word recognition ...,,How do people recognize objects they have enco...
2,2022,"Madison Paron, James Paron, Mike Kahana",University of Pennsylvania,A context-based model of recall and decisions,,Existing models of memory posit separate proce...
3,2022,"Yiyang Chen, Mario Peruggia, Trisha Van Zandt",University of Kansas; The Ohio State Universit...,Mutual interference in working memory updating...,,We built a hierarchical Bayesian model for the...
4,2022,Rahul Bhui,Massachusetts Institute of Technology; Harvard...,Ambiguity and confirmation bias in reward lear...,,We tend to interpret feedback in ways that con...


In [102]:
# Write the table to Drive without the index column; the file name mirrors the
# PDF's name (smp{year}_program).
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)