<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2007_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
!pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downlo

In [3]:
# Folder on the mounted Google Drive that contains the conference-program PDFs.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
These helper functions were written with help from GPT 5.2; some of them are my own code that I refactored into functions.

In [4]:
def clean_text(text):
    """Flatten text extracted from a PDF onto a single line.

    Words that were hyphenated across a line break are re-joined
    ("time-\\nvarying" -> "time-varying"), every remaining line break
    (with its surrounding whitespace) becomes one space, and leading/trailing
    whitespace is removed.

    Hyphens and dashes that are NOT at a line break are left untouched, so
    "pre- and post-test" and "a - b" keep their spacing.

    Args:
        text: The string to clean. Falsy values (None, "") are returned as is.

    Returns:
        The cleaned string, or the original falsy value.
    """
    if not text:
        return text

    # Re-join words broken by a hyphen at the end of a line. The match is
    # anchored on the newline so in-line hyphens keep their following space;
    # the lookbehind skips free-standing dashes (" -\n").
    text = re.sub(r'(?<=\w)-\s*\n\s*', '-', text)

    # Replace the remaining newlines (and adjacent whitespace) with one space.
    text = re.sub(r'\s*\n\s*', ' ', text)

    return text.strip()

In [5]:
# Explicit replacements applied before the generic, name-based fallback.
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    # NOTE(review): presumably the source PDF's font encoding yields "Æ" where
    # an "ffi" ligature was typeset; a genuine "Æ" would be rewritten too — confirm.
    "Æ": 'ffi'
}

# Unicode-name prefixes of Latin ligature characters, paired with whether the
# expansion should be upper case.
_LATIN_LIGATURE_PREFIXES = (
    ("LATIN SMALL LIGATURE ", False),
    ("LATIN CAPITAL LIGATURE ", True),
)


def fix_ligatures(text):
    """Expand typographic ligatures into their component letters.

    Characters listed in LIGATURE_MAP are replaced first. Any remaining
    character whose Unicode name marks it as a *Latin* ligature (for example
    U+0153 "LATIN SMALL LIGATURE OE") is expanded from the letters in its
    name, keeping the case indicated by the name. Ligatures from other
    scripts and all other characters pass through unchanged.

    Args:
        text: The string to process. Falsy values (None, "") are returned as is.

    Returns:
        The string with ligatures expanded, or the original falsy value.
    """
    if not text:
        return text

    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Fallback: derive the expansion of other Latin ligatures from their
    # Unicode names. Unnamed characters yield "" and are kept as they are.
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        expansion = ch
        for prefix, is_capital in _LATIN_LIGATURE_PREFIXES:
            if name.startswith(prefix):
                letters = name[len(prefix):].replace(" ", "")
                expansion = letters if is_capital else letters.lower()
                break
        cleaned_chars.append(expansion)

    return "".join(cleaned_chars)

# Program

114 total entries. The abstracts PDF is organized very consistently: each page holds one presentation's abstract. A page starts with the presenter's name (ignored here, because the presenter is only one of the authors), the presentation type (talk, symposium, etc.), and the presentation day/time. Next come the title, then the authors (one per line, each followed by a comma and their affiliation), and finally the abstract.

## Grab text from the pdf

In [6]:
# Conference year; used to build the input and output file names.
year = '2007'
# os.path.join works whether or not pdfs_path ends with a slash.
abstracts = pymupdf.open(os.path.join(pdfs_path, f'smp{year}_abstracts.pdf'))

In [7]:
# Plain text of every page in the abstracts PDF, stripped of leading and
# trailing whitespace; one list item per page.
abstract_pages_text = [
    pdf_page.get_text('text').strip()
    for pdf_page in abstracts
]

In [8]:
# Show the raw text of the first two pages to inspect the extracted layout.
abstract_pages_text[:2]

['Presenter:  Nando De Freitas \nPresentation type:  Symposium \nPresentation date/time:  7/26  9:00-9:50 \n  \nModern Monte Carlo Methods \n  \nNando De Freitas, UBC\n \n  \nIn this talk I will introduce modern Monte Carlo methods, including state-of-the-art \nsequential Monte Carlo (SMC) and trans-dimensional Markov chain Monte Carlo \n(MCMC). After laying out the foundation, I will show how these flexible techniques\nare ideally suited for carrying out computation in sophisticated probabilistic models of\ncognition. In particular, I will show how they can be used to learn models with time-\nvarying properties, unknown number of variables, and (possibly unknown) complex\nrelational and hierarchical structures. I will also show how these methods can be used\nto attack problems in stochastic decision making, such as active learning,\nexperimental design, optimal control and sequential Markov decision processes.',
 "Presenter:  Lawrence DeCarlo \nPresentation type:  Talk \nPresentation 

## Clean up entries

In [9]:
# One dict per non-empty page, holding the fields that go into the output table.
parsed_entries = []

for page_number, page_text in enumerate(abstract_pages_text, start=1):
  page_text = fix_ligatures(page_text.strip())
  if not page_text:
    continue  # nothing to parse on an empty page

  # The first gap of two (or more) blank lines separates the header
  # information from the abstract body.
  sections = re.split(r"\n\s*\n\s*\n", page_text, maxsplit=1)
  if len(sections) != 2:
    raise ValueError(
        f"Page {page_number}: could not separate the header from the abstract"
    )
  info, abstract = sections

  # Splits into 3 sections:
  # presentation details (presenter, type, day/time), title,
  # then listed authors and affiliations
  header_parts = re.split(r"\n\s*\n", info)
  if len(header_parts) != 3:
    raise ValueError(
        f"Page {page_number}: expected 3 header sections "
        f"(details, title, authors), found {len(header_parts)}"
    )
  listed_info, title, auth_and_aff = header_parts

  # The presentation type is the value after ": " on the second detail line.
  pres_type = listed_info.split("\n")[1].split(": ")[1].strip()

  # Each author line reads "<author>, <affiliation>"; split on the first
  # comma only, because affiliations may contain commas themselves.
  authors = []
  affs = []
  for author_line in auth_and_aff.split('\n'):
    author, comma, affiliation = author_line.partition(',')
    if not comma:
      raise ValueError(
          f"Page {page_number}: author line has no affiliation: {author_line!r}"
      )
    authors.append(author.strip())
    affs.append(affiliation.strip())

  parsed_entries.append({
      "year": year,
      "author(s)": ", ".join(authors),
      "affiliation(s)": "; ".join(affs),
      "title": clean_text(title),
      "type": pres_type.lower(),
      "abstract": clean_text(abstract)
  })

In [11]:
# Show the first two parsed entries to check the extracted fields.
parsed_entries[:2]

[{'year': '2007',
  'author(s)': 'Nando De Freitas',
  'affiliation(s)': 'UBC',
  'title': 'Modern Monte Carlo Methods',
  'type': 'symposium',
  'abstract': 'In this talk I will introduce modern Monte Carlo methods, including state-of-the-art sequential Monte Carlo (SMC) and trans-dimensional Markov chain Monte Carlo (MCMC). After laying out the foundation, I will show how these flexible techniques are ideally suited for carrying out computation in sophisticated probabilistic models of cognition. In particular, I will show how they can be used to learn models with time-varying properties, unknown number of variables, and (possibly unknown) complex relational and hierarchical structures. I will also show how these methods can be used to attack problems in stochastic decision making, such as active learning, experimental design, optimal control and sequential Markov decision processes.'},
 {'year': '2007',
  'author(s)': 'Lawrence DeCarlo',
  'affiliation(s)': 'Teachers College, Columbi

# Create df and convert to csv

In [12]:
# Build the DataFrame from every parsed entry (none are skipped), with an explicit column order.
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [13]:
# Display the first five rows of the DataFrame.
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2007,Nando De Freitas,UBC,Modern Monte Carlo Methods,symposium,In this talk I will introduce modern Monte Car...
1,2007,Lawrence DeCarlo,"Teachers College, Columbia University",On Some Mixture SDT Models for Associative-Rec...,talk,Participants in associative-recognition tasks ...
2,2007,"Zygmunt Pizlo, Edward Carpenter, David Foldes,...",Purdue University; Purdue University; Purdue U...,Traveling Salesman Problem in real and VR space,talk,TSP on a Euclidean plane is solved quite well ...
3,2007,"Yoonhee Jang, John Wixted, David Huber","University of California, San Diego; Universit...","Testing the unequal-variance, dual-process, an...",talk,Three models have been advanced to explain the...
4,2007,"Oh-Sang Kwon, Zygmunt Pizlo, Howard Zelaznik, ...",Purdue University; Purdue University; Purdue U...,Pyramid model of the transfer of skilled movement,talk,Generalized Motor Program theory (Schmidt 1975...


In [14]:
# Write the table to Google Drive as a CSV file, without the index column.
csv_path = f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv"
df.to_csv(csv_path, index=False)