<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp1999_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install pymupdf
import glob
import os
import pathlib
import pymupdf
import re
import pandas as pd

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.6


In [5]:
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# 1999 program

## Grab text from smp1999_program.pdf

In [10]:
smp1999_program = pymupdf.open(pdfs_path + 'smp1999_program.pdf')

# For these, indices are text[page_num][piece_of_text]
smp1999_program_text = []

# Grabs text from each page in the doc
for page in smp1999_program:
  smp1999_program_text.append(page.get_text('blocks'))  # adds text from each page
smp1999_program.close()

In [12]:
smp1999_program_text[0][13:18]

[(94.05929565429688,
  440.87060546875,
  249.5148468017578,
  520.4420166015625,
  'A. A. J. MARLEY, McGill\nUniversity, and R. DUNCAN\nLUCE, University of California\nIrvine: A simple axiomatization\nof binary rank-dependent\nexpected utility of gains (losses)\n',
  13,
  0),
 (266.7142639160156,
  454.3827209472656,
  498.1812438964844,
  506.9299011230469,
  'HANS COLONIUS, Oldenburg University:\nVisual-auditory interaction in space and in time:\nAn integrated modeling approach for divided\nand focused attention tasks\n',
  14,
  0),
 (514.4366455078125,
  474.65093994140625,
  517.4393310546875,
  486.6617126464844,
  '.\n',
  15,
  0),
 (40.761451721191406,
  559.4771118164062,
  76.7938003540039,
  571.4878540039062,
  '8:25am\n',
  16,
  0),
 (94.05929565429688,
  532.4528198242188,
  244.5069122314453,
  598.5120849609375,
  "R. DUNCAN LUCE and\nROBERT SNEDDON,\nUniversity of California Irvine:\nReduction invariance and\nPrelec's weighting function\n",
  17,
  0)]

## Clean up entries

In [14]:
smp1999_program_strings = []
for block in smp1999_program_text:
  for i, entry_tuple in enumerate(block):
    # Extract the text string from the tuple entry
    entry = entry_tuple[4]
    # Turns string into list of words
    words = entry.split()

    # Skip the empty table entries
    if (words == ['.']) or (not words):
      continue

    smp1999_program_strings.append(' '.join(words))  # turns the word list back into one string

In [15]:
smp1999_program_strings[12:20]

['A. A. J. MARLEY, McGill University, and R. DUNCAN LUCE, University of California Irvine: A simple axiomatization of binary rank-dependent expected utility of gains (losses)',
 'HANS COLONIUS, Oldenburg University: Visual-auditory interaction in space and in time: An integrated modeling approach for divided and focused attention tasks',
 '8:25am',
 "R. DUNCAN LUCE and ROBERT SNEDDON, University of California Irvine: Reduction invariance and Prelec's weighting function",
 'BRUCE BRIDGEMAN, University of California, Santa Cruz: Sensory memory arises from lateral inhibition',
 '8:50am',
 "WILLIAM H. BATCHELDER and BETHANY KNAPP, University of California Irvine: Predicting response time phenomena in binary choice from Luce's choice theory",
 'JENNIFER MCLEAN, University of Washington: Processing capacity of visual perception and memory encoding']

## Collect authors, affiliations, and title
First as a dictionary -> data frame -> .csv file

In [17]:
smp1999_program_list = []
to_fix_list = []

# Filters out a good number of entries that aren't about the talks
smp1999_filtered_program_strings = [entry for entry in smp1999_program_strings\
                                    if len(entry.split()) > 7]  # 7 is a good length that gets rid
                                                                # of most entries that aren't talks

sample_strings = smp1999_filtered_program_strings

for string in sample_strings:
  authors_and_title = string.split(':', 1)  # splits authors and affiliations from title

  # Saves titles that were broken up by page break or don't have colon for fixing later manually
  if len(authors_and_title) < 2:
    to_fix_list.append(string)
    continue


  talk_title = authors_and_title[1].strip()   # adds title w/o the extra space
                                              # at the beginning from splitting

  # Checks for authors in title
  title_list = talk_title.split()
  for word in title_list:
    if word.isupper() and (len(word) > 4):
      to_fix_list.append([word, string])
      break

  # Separates authors and affiliations
  and_author_split = authors_and_title[0].split(' and ')
  # Saves the comma in some schools (UCs, CSUs, etc.)
  saving_aff_comma = [
                      re.sub(r'(University[^,]+),\s+', r'\1<<COMMA>>', a)\
                      for a in and_author_split
                      ]
  saving_aff_comma = [a.replace('California State University, ',\
                                'California State University<<COMMA>>')\
                      for a in saving_aff_comma]
  auth_aff_list = [item for a in saving_aff_comma for item in a.split(', ')]
  auth_aff_list = [a.replace("<<COMMA>>", ", ") for a in auth_aff_list] # puts comma back

  initial_authors = []
  affiliations = []
  for a in auth_aff_list:
    if a.isupper():
      fixed_author = re.sub(r'^[^a-zA-Z]', '', a).strip() # gets rid of any extra non-letter
                                                          # characters at the beginning
      initial_authors.append(fixed_author)
    else:
      affiliations.append(a.strip(','))

  # Checks for authors in any affiliations
  for a in affiliations:
    word_list = a.split()
    for word in word_list:
      if word.isupper():
        to_fix_list.append([a, string])
        break

  # Fixes splitting author name and suffix (e.g., JR.) separated by comma
  fixed_authors = []
  for a in initial_authors:
    if (fixed_authors) and ('JR.' in a):
      fixed_authors[-1] += ", " + a
    else:
      fixed_authors.append(a.strip(','))

  talk_dict = {
      'author(s)': ', '.join(fixed_authors),  # joining so entries don't look like python lists
      'affiliation(s)': ', '.join(affiliations),
      'title': talk_title
  }

  smp1999_program_list.append(talk_dict)

In [18]:
smp1999_program_list[:5]

[{'author(s)': '',
  'affiliation(s)': '8',
  'title': '00am Choice, Measurement, and Statistics Information Processing I .'},
 {'author(s)': 'A. A. J. MARLEY, R. DUNCAN LUCE',
  'affiliation(s)': 'McGill University, University of California Irvine',
  'title': 'A simple axiomatization of binary rank-dependent expected utility of gains (losses)'},
 {'author(s)': 'HANS COLONIUS',
  'affiliation(s)': 'Oldenburg University',
  'title': 'Visual-auditory interaction in space and in time: An integrated modeling approach for divided and focused attention tasks'},
 {'author(s)': 'R. DUNCAN LUCE, ROBERT SNEDDON',
  'affiliation(s)': 'University of California Irvine',
  'title': "Reduction invariance and Prelec's weighting function"},
 {'author(s)': 'BRUCE BRIDGEMAN',
  'affiliation(s)': 'University of California, Santa Cruz',
  'title': 'Sensory memory arises from lateral inhibition'}]

In [23]:
# Convert to data frame
smp1999_df = pd.DataFrame(smp1999_program_list)
smp1999_df[:12]

Unnamed: 0,author(s),affiliation(s),title
0,,8,"00am Choice, Measurement, and Statistics Infor..."
1,"A. A. J. MARLEY, R. DUNCAN LUCE","McGill University, University of California Ir...",A simple axiomatization of binary rank-depende...
2,HANS COLONIUS,Oldenburg University,Visual-auditory interaction in space and in ti...
3,"R. DUNCAN LUCE, ROBERT SNEDDON",University of California Irvine,Reduction invariance and Prelec's weighting fu...
4,BRUCE BRIDGEMAN,"University of California, Santa Cruz",Sensory memory arises from lateral inhibition
5,"WILLIAM H. BATCHELDER, BETHANY KNAPP",University of California Irvine,Predicting response time phenomena in binary c...
6,JENNIFER MCLEAN,University of Washington,Processing capacity of visual perception and m...
7,,9,"15am MATTHEW JONES and JUN ZHANG, University o..."
8,"DAVID HUBER, KEITH LYLE, RICHARD SHIFFRIN",Indiana University,Short-term priming: Data and a model for bias ...
9,GEORGE KARABATSOS,Louisiana State University Medical Center,Representational measurement theory and item- ...


In [24]:
to_fix_list

['32nd Annual Meeting of the Society for Mathematical',
 ['MATTHEW',
  '9:15am MATTHEW JONES and JUN ZHANG, University of Michigan: Learning to cooperate'],
 "in a prisoner's dilemma game under Markov framework",
 ['NASA Ames Research Center',
  'ALBERT J. AHUMADA, JR., NASA Ames Research Center: A ﬂexible, nonparametric deﬁnition of threshold'],
 ['MARTIN S. BANKS University of California at Berkeley',
  'BENJAMIN T. BACKUS, Stanford University, and MARTIN S. BANKS University of California at Berkeley: Estimator reliability theory predicts the perceived slant of cue-conﬂict stereo surfaces'],
 ['J. W. Goethe University Frankfurt',
  'ANDREAS KLEIN, J. W. Goethe University Frankfurt:The LMS method for SEM with latent interaction effects'],
 ['LOTFI',
  'Plenary Talk: LOTFI ZADEH, University of California, Berkeley, Toward a computational theory of perceptions'],
 ['HANS COLONIUS University of Oldenburg',
  'EHTIBAR N. DZHAFAROV, Purdue University, and HANS COLONIUS University of Oldenb

## Convert to CSV file

In [25]:
smp1999_df.to_csv("/content/drive/MyDrive/math_psych_work/smp1999_program.csv", index=False)