<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp2012_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Top of Script

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pymupdf
!pip install pymupdf-layout
!pip install pymupdf4llm
# !pip install rapidfuzz
import glob
import os
import pathlib
import pymupdf
import pymupdf.layout
import pymupdf4llm
import re
import pandas as pd
import unicodedata
# from rapidfuzz import process, fuzz

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting pymupdf-layout
  Downloading pymupdf_layout-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (2.8 kB)
Collecting PyMuPDF==1.26.6 (from pymupdf-layout)
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting onnxruntime (from pymupdf-layout)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->pymupdf-layout)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->pymupdf-layout)
  Downlo

In [3]:
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# Functions
Created with help from GPT 5.2, but some are my own code just turned into a function.

In [4]:
AFFILIATION_KEYWORDS = [
    "University", "College", "Department", "Center", "Institute",
    "Laboratory", "School", "Hospital", "UC", "Centre", "Research",
    "Corporation", "Defence", "Université", "Universite", "Universiy"
]
AFFILIATION_KEYWORDS = re.compile(r'\b(' + '|'.join(AFFILIATION_KEYWORDS) + r')\b',
                                  re.I)

def looks_like_affiliation(chunk):
    return bool(AFFILIATION_KEYWORDS.search(chunk))

In [5]:
def normalize_whitespace(s: str) -> str:
    return " ".join(s.replace("\n", "").split())

In [6]:
def normalize_affiliations(entry: str) -> str:
    return re.sub(
        r'(University of California)\s*,?\s*'
        r'(Irvine|Davis|Berkeley|Los Angeles|San Diego|Santa Barbara|Santa Cruz|Riverside|Merced)',
        r'\1, \2',
        entry
    )

In [74]:
def split_authors_affiliations(entry: str) -> tuple[str, str]:
    entry = normalize_whitespace(entry)
    entry = normalize_affiliations(entry)

    # split only on AND / & / commas WITH SPACES
    tokens = re.split(r'\s+(?:and|&)\s+|,\s+(?=[A-Z])', entry)

    authors = []
    affiliations = []

    for token in tokens:
        token = token.strip()

        # if token is one word and we already have an affiliation then attach
        if len(token.split()) == 1 and affiliations and token[0].isupper():
            affiliations[-1] = affiliations[-1] + ", " + token
            continue

        if looks_like_affiliation(token):
            affiliations.append(token)
        else:
            authors.append(token)

    if len(set(affiliations)) == 1:
        affiliations = affiliations[0]
    else:
      affiliations = "; ".join(affiliations)

    return (
        ", ".join(authors),
        affiliations
    )

In [81]:
def remove_trailing_text(text):
  no_trailing_junk = entry.split(".")[:-1]

  return ".".join(no_trailing_junk)

In [8]:
def clean_text(text):
    if not text:
        return text

    text = re.sub(r' \n\n\d{1,3} \n\n', ' ', text)  # Remove page breaks with page number
    text = re.sub(r'\s*\n\s*', ' ', text)    # Replace newlines with spaces

    text = re.sub(r'-\s+', '', text)         # Get rid of "- "; will fix actual
                                             # hyphenated words manually

    text = re.sub(r'\s{2}', ' ', text)       # Collapse two adjacent spaces into one

    text = re.sub(r'\.\s*##.*$', '.', text,\
                  flags=re.DOTALL)           # Gets rid of extraneous text after
                                             # last sentence
    text = text.strip()
    text = fix_ligatures(text)

    return text

In [9]:
LIGATURE_MAP = {
    "ﬁ": "fi",
    "ﬂ": "fl",
    "ﬃ": "ffi",
    "ﬄ": "ffl",
    "ﬀ": "ff",
    "ﬅ": "ft",
    "ﬆ": "st",
    "Æ": 'ffi'
}

def fix_ligatures(text):
    # Replace known ligatures
    for bad, good in LIGATURE_MAP.items():
        text = text.replace(bad, good)

    # Replace any private-use ligature (common in PDFs)
    cleaned_chars = []
    for ch in text:
        name = unicodedata.name(ch, "")
        if "LIGATURE" in name.upper():
            # Try to break it apart: remove spaces and lowercase
            base = name.split("LIGATURE")[-1]
            base = base.replace(" ", "").lower()
            cleaned_chars.append(base)
        else:
            cleaned_chars.append(ch)

    return "".join(cleaned_chars)

# Program

122 total entries (96 talks and 26 posters)

This one is almost identical format to 2011 program, so basically used same code with some modifications. Markdown still doesn't show bold or italic text.

## Grab text from the pdf

In [10]:
year = '2012'
program = pymupdf.open(pdfs_path + f'smp{year}_program.pdf')

In [13]:
program_text = pymupdf4llm.to_text(program)

In [67]:
program_text[:1000]

'Abstracts For Talks \n\n(91) \n\nDAY, TIME PLACE \n\nDimensional information-theoretic measures of affective expressivity. Jihun Hamm, Ohio State Univeristy, Christian Kohler, University of Pennsylvania, Ruben Gur, University of Pennsylvania, Ragini Verma, University of Pennsylvania . Objective: Abilities in recognition and expression of emotions have crucial impact on social lives of individuals. While several objective measures of affect recognition are available, measures of deficits in expressivity are mostly observerbased. We present two dimensional measures of facial expressivity ambiguity (vs consistency) and distinctiveness (vs inseparability) computed objectively from information-theory. We use these measures to study expressivity deficits in schizophrenia compared to a healthy comparison group. Method: The sample included 28 schizophrenia patients without tardive dyskinesia or acute extrapyramidal symptoms and 26 healthy controls matched by gender, race, age, and parental ed

## Split text into presentation entries

In [41]:
entries = re.split(r'\s*DAY, TIME PLACE\s*', program_text)[1:]

split_entries = [part
                 for entry in entries
                 for part in re.split(r'\s*PLACE\s*', entry)
                 ]

In [42]:
fixed_entries = []

for entry in split_entries:
  if '<==\n\n' in entry:
    fixed_entries.extend(entry.split('<==\n\n'))
  else:
    fixed_entries.append(entry)

In [50]:
filtered_entries = [entry for entry in fixed_entries if len(entry.strip()) > 100]

In [51]:
filtered_entries[:2]

['Dimensional information-theoretic measures of affective expressivity. Jihun Hamm, Ohio State Univeristy, Christian Kohler, University of Pennsylvania, Ruben Gur, University of Pennsylvania, Ragini Verma, University of Pennsylvania . Objective: Abilities in recognition and expression of emotions have crucial impact on social lives of individuals. While several objective measures of affect recognition are available, measures of deficits in expressivity are mostly observerbased. We present two dimensional measures of facial expressivity ambiguity (vs consistency) and distinctiveness (vs inseparability) computed objectively from information-theory. We use these measures to study expressivity deficits in schizophrenia compared to a healthy comparison group. Method: The sample included 28 schizophrenia patients without tardive dyskinesia or acute extrapyramidal symptoms and 26 healthy controls matched by gender, race, age, and parental education. Subjects were administered facial emotion r

## Sort authors, affiliations, title, and abstract

In [82]:
parsed_entries = []

for e, entry in enumerate(filtered_entries):
  cleaned_entry = remove_trailing_text(entry)

  title, rest_of_entry = cleaned_entry.split('.', 1)
  # Gets rid of topic in between square brackets before the title
  if ']' in title:
    cleaned_title = re.split(r']\s*', title)[1]
  else:
    cleaned_title = title

  # Splits by period between the authors and affiliations and the abstract
  auth_and_aff, abstract = re.split(r'(?<!\b[A-Z])\.\s+(?=[A-Z])', rest_of_entry, 1)

  authors, affiliations = split_authors_affiliations(auth_and_aff)
  cleaned_abstract = clean_text(abstract)

  parsed_entries.append({
    'year': year,
    'author(s)': authors,
    'affiliation(s)': affiliations,
    'title': cleaned_title,
    'type': '',
    'abstract': cleaned_abstract + '.'  # add back last period
  })

In [83]:
parsed_entries[:2]

[{'year': '2012',
  'author(s)': 'Jihun Hamm, Ohio State Univeristy, Christian Kohler, Ruben Gur, Ragini Verma',
  'affiliation(s)': 'University of Pennsylvania',
  'title': 'Dimensional information-theoretic measures of affective expressivity',
  'type': '',
  'abstract': 'Objective: Abilities in recognition and expression of emotions have crucial impact on social lives of individuals. While several objective measures of affect recognition are available, measures of deficits in expressivity are mostly observerbased. We present two dimensional measures of facial expressivity ambiguity (vs consistency) and distinctiveness (vs inseparability) computed objectively from information-theory. We use these measures to study expressivity deficits in schizophrenia compared to a healthy comparison group. Method: The sample included 28 schizophrenia patients without tardive dyskinesia or acute extrapyramidal symptoms and 26 healthy controls matched by gender, race, age, and parental education. Sub

# Create df and convert to csv

In [84]:
df = pd.DataFrame(parsed_entries, columns=["year", "author(s)", "affiliation(s)", "title", "type", "abstract"])

In [85]:
df.head()

Unnamed: 0,year,author(s),affiliation(s),title,type,abstract
0,2012,"Jihun Hamm, Ohio State Univeristy, Christian K...",University of Pennsylvania,Dimensional information-theoretic measures of ...,,Objective: Abilities in recognition and expres...
1,2012,"Jihun Hamm, Ohio State Univeristy, Benjamin St...",Ohio State University,Automatic Annotation of Daily Activity from Sm...,,We present automatic annotation methods of dai...
2,2012,"Brent Kievit-Kylar, Michael Jones",Indiana University,A Continuous Holographic Vector Model of Seman...,,We explore an extension of the BEAGLE model (J...
3,2012,Fabio Leite,The Ohio State University at Lima,Accounts of two diffusion-process models to st...,,I analyzed response time and accuracy data fro...
4,2012,"Guy Hawkins, Scott Brown, Tony Marley, Andrew ...",University of Newcastle; University of Newcast...,Accumulator models for consumer preference and...,,Consumer preferences for goods or services are...


In [86]:
df.to_csv(f"/content/drive/MyDrive/math_psych_work/csv/smp{year}_program.csv", index=False)