<a href="https://colab.research.google.com/github/lizaoh/smp_program_data/blob/main/smp1999_extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the PDF folder used by
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pymupdf
import glob
import os
import pathlib
import pymupdf
import re
import pandas as pd



In [None]:
# Drive folder that holds the conference program PDFs. Keep the trailing slash:
# file names are appended to this value with plain string concatenation.
pdfs_path = '/content/drive/MyDrive/math_psych_work/Conference Programs/'

# 1999

## Program

It looks like column names / bolded text are all considered one line/entry, and are separated by new lines.

In [4]:
# Extract the text blocks of every page of the 1999 program.
# Indices are smp1999_program_text[page_num][block_num]; each block is a tuple
# whose item at index 4 is the block's text.
# The context manager closes the PDF even if extraction raises part-way through.
with pymupdf.open(pdfs_path + 'smp1999_program.pdf') as smp1999_program:
  smp1999_program_pages = smp1999_program[:]
  smp1999_program_text = [page.get_text('blocks') for page in smp1999_program_pages]

In [189]:
# Peek at a few raw blocks from the first page to see how table cells come out
smp1999_program_text[0][13:20]

[(94.05929565429688,
  440.87060546875,
  249.5148468017578,
  520.4420166015625,
  'A. A. J. MARLEY, McGill\nUniversity, and R. DUNCAN\nLUCE, University of California\nIrvine: A simple axiomatization\nof binary rank-dependent\nexpected utility of gains (losses)\n',
  13,
  0),
 (266.7142639160156,
  454.3827209472656,
  498.1812438964844,
  506.9299011230469,
  'HANS COLONIUS, Oldenburg University:\nVisual-auditory interaction in space and in time:\nAn integrated modeling approach for divided\nand focused attention tasks\n',
  14,
  0),
 (514.4366455078125,
  474.65093994140625,
  517.4393310546875,
  486.6617126464844,
  '.\n',
  15,
  0),
 (40.761451721191406,
  559.4771118164062,
  76.7938003540039,
  571.4878540039062,
  '8:25am\n',
  16,
  0),
 (94.05929565429688,
  532.4528198242188,
  244.5069122314453,
  598.5120849609375,
  "R. DUNCAN LUCE and\nROBERT SNEDDON,\nUniversity of California Irvine:\nReduction invariance and\nPrelec's weighting function\n",
  17,
  0),
 (266.714263

### Gathering authors, affiliations, and titles on all entries

In [129]:
# Cleans up the entries: every block becomes a single-line string with the
# time stamps removed; blocks that end up empty are dropped.
smp1999_program_strings = []
for page_blocks in smp1999_program_text:
  for entry_tuple in page_blocks:
    # Extract the text string from the tuple entry
    entry = entry_tuple[4]
    # Removes times (i.e., 9:15am; one entry starts with a time and then author name)
    # by dropping every token that starts with a digit. A new list is built
    # instead of calling words.remove() while looping over words: removing
    # during iteration skips the token right after each removed one, so two
    # consecutive time tokens were not both removed.
    # NOTE: this also drops any other digit-leading token (e.g. a numeral in a title).
    words = [word for word in entry.split() if not word[0].isdigit()]

    # Skip the empty table entries
    if not words or words == ['.']:
      continue

    smp1999_program_strings.append(' '.join(words))  # turns the word list back into one string


In [180]:
# Spot-check a slice of the cleaned, single-line entries
smp1999_program_strings[12:25]

['A. A. J. MARLEY, McGill University, and R. DUNCAN LUCE, University of California Irvine: A simple axiomatization of binary rank-dependent expected utility of gains (losses)',
 'HANS COLONIUS, Oldenburg University: Visual-auditory interaction in space and in time: An integrated modeling approach for divided and focused attention tasks',
 "R. DUNCAN LUCE and ROBERT SNEDDON, University of California Irvine: Reduction invariance and Prelec's weighting function",
 'BRUCE BRIDGEMAN, University of California, Santa Cruz: Sensory memory arises from lateral inhibition',
 "WILLIAM H. BATCHELDER and BETHANY KNAPP, University of California Irvine: Predicting response time phenomena in binary choice from Luce's choice theory",
 'JENNIFER MCLEAN, University of Washington: Processing capacity of visual perception and memory encoding',
 'MATTHEW JONES and JUN ZHANG, University of Michigan: Learning to cooperate',
 'DAVID HUBER, KEITH LYLE, and RICHARD SHIFFRIN, Indiana University: Short-term priming

In [196]:
# Parses every cleaned entry of the form "AUTHORS, Affiliation(s): Title" into
# a dict with 'author(s)', 'affiliation(s)' and 'title'. Anything that looks
# wrong is recorded in to_fix_list for manual fixing later.
smp1999_program_list = []   # one dict per parsed talk
to_fix_list = []            # plain strings (no colon) or [suspect_text, entry] pairs

# Filters out a good number of entries that aren't about the talks:
# entries with this many words or fewer are mostly headers, not talks
MIN_TALK_WORDS = 7
smp1999_filtered_program_strings = [entry for entry in smp1999_program_strings
                                    if len(entry.split()) > MIN_TALK_WORDS]

sample_strings = smp1999_filtered_program_strings

# The comma inside some school names (UCs, CSUs, etc.) must survive the comma
# split below, so it is swapped for this placeholder and restored afterwards
COMMA_MARK = '<<COMMA>>'
university_comma = re.compile(r'(University[^,]+),\s+')
# Everything in front of the first letter of an author name. The pattern is
# Unicode-aware, so an accented initial counts as a letter and is kept, and it
# removes all of the leading junk rather than only a single character.
leading_non_letters = re.compile(r'^[\W\d_]+')

for string in sample_strings:
  authors_and_title = string.split(':', 1)  # splits authors and affiliations from title

  # Saves titles that were broken up by page break or don't have colon for fixing later manually
  if len(authors_and_title) < 2:
    to_fix_list.append(string)
    continue

  talk_title = authors_and_title[1].strip()   # title w/o the extra space left by the split

  # Checks for authors in title: a long all-caps word there suggests the entry
  # was split at the wrong colon. The entry is flagged but still parsed below.
  suspect_word = next((word for word in talk_title.split()
                       if word.isupper() and len(word) > 4), None)
  if suspect_word is not None:
    to_fix_list.append([suspect_word, string])

  # Separates authors and affiliations
  auth_aff_list = []
  for chunk in authors_and_title[0].split(' and '):
    chunk = university_comma.sub(r'\1' + COMMA_MARK, chunk)
    chunk = chunk.replace('California State University, ',
                          'California State University' + COMMA_MARK)
    # Split on commas, then put the protected commas back
    auth_aff_list.extend(piece.replace(COMMA_MARK, ', ') for piece in chunk.split(', '))

  # All-caps pieces are author names; everything else is an affiliation
  initial_authors = []
  affiliations = []
  for piece in auth_aff_list:
    if piece.isupper():
      initial_authors.append(leading_non_letters.sub('', piece).strip())
    else:
      affiliations.append(piece.strip(','))

  # Checks for authors in any affiliations (an all-caps word usually means a
  # missing comma between a name and a school)
  for affiliation in affiliations:
    if any(word.isupper() for word in affiliation.split()):
      to_fix_list.append([affiliation, string])

  # Fixes splitting author name and suffix (JR.) separated by comma
  fixed_authors = []
  for author in initial_authors:
    if fixed_authors and 'JR.' in author:
      # strip(',') keeps a trailing comma (from "..., JR., and ...") out of the name
      fixed_authors[-1] += ', ' + author.strip(',')
    else:
      fixed_authors.append(author.strip(','))

  talk_dict = {
      'author(s)': ', '.join(fixed_authors),
      'affiliation(s)': ', '.join(affiliations),
      'title': talk_title
  }

  smp1999_program_list.append(talk_dict)


In [200]:
# Spot-check the first few parsed talks
smp1999_program_list[:5]

[{'author(s)': 'A. A. J. MARLEY, R. DUNCAN LUCE',
  'affiliation(s)': 'McGill University, University of California Irvine',
  'title': 'A simple axiomatization of binary rank-dependent expected utility of gains (losses)'},
 {'author(s)': 'HANS COLONIUS',
  'affiliation(s)': 'Oldenburg University',
  'title': 'Visual-auditory interaction in space and in time: An integrated modeling approach for divided and focused attention tasks'},
 {'author(s)': 'R. DUNCAN LUCE, ROBERT SNEDDON',
  'affiliation(s)': 'University of California Irvine',
  'title': "Reduction invariance and Prelec's weighting function"},
 {'author(s)': 'BRUCE BRIDGEMAN',
  'affiliation(s)': 'University of California, Santa Cruz',
  'title': 'Sensory memory arises from lateral inhibition'},
 {'author(s)': 'WILLIAM H. BATCHELDER, BETHANY KNAPP',
  'affiliation(s)': 'University of California Irvine',
  'title': "Predicting response time phenomena in binary choice from Luce's choice theory"}]

In [201]:
# One row per parsed talk; preview the first ten rows
smp1999_df = pd.DataFrame(data=smp1999_program_list)
smp1999_df.head(10)

Unnamed: 0,author(s),affiliation(s),title
0,"A. A. J. MARLEY, R. DUNCAN LUCE","McGill University, University of California Ir...",A simple axiomatization of binary rank-depende...
1,HANS COLONIUS,Oldenburg University,Visual-auditory interaction in space and in ti...
2,"R. DUNCAN LUCE, ROBERT SNEDDON",University of California Irvine,Reduction invariance and Prelec's weighting fu...
3,BRUCE BRIDGEMAN,"University of California, Santa Cruz",Sensory memory arises from lateral inhibition
4,"WILLIAM H. BATCHELDER, BETHANY KNAPP",University of California Irvine,Predicting response time phenomena in binary c...
5,JENNIFER MCLEAN,University of Washington,Processing capacity of visual perception and m...
6,"MATTHEW JONES, JUN ZHANG",University of Michigan,Learning to cooperate
7,"DAVID HUBER, KEITH LYLE, RICHARD SHIFFRIN",Indiana University,Short-term priming: Data and a model for bias ...
8,GEORGE KARABATSOS,Louisiana State University Medical Center,Representational measurement theory and item- ...
9,"JAVIER R. MOVELLAN, JAMES L. MCCLELLAND","University of California, San Diego, Carnegie ...",Factorability of information sources: Analysis...


In [202]:
# Entries flagged during parsing: plain strings had no colon to split on,
# [suspect_text, entry] pairs had all-caps words in the title or an affiliation
to_fix_list

['Choice, Measurement, and Statistics Information Processing I .',
 "in a prisoner's dilemma game under Markov framework",
 ['NASA Ames Research Center',
  'ALBERT J. AHUMADA, JR., NASA Ames Research Center: A ﬂexible, nonparametric deﬁnition of threshold'],
 ['MARTIN S. BANKS University of California at Berkeley',
  'BENJAMIN T. BACKUS, Stanford University, and MARTIN S. BANKS University of California at Berkeley: Estimator reliability theory predicts the perceived slant of cue-conﬂict stereo surfaces'],
 ['J. W. Goethe University Frankfurt',
  'ANDREAS KLEIN, J. W. Goethe University Frankfurt:The LMS method for SEM with latent interaction effects'],
 ['LOTFI',
  'Plenary Talk: LOTFI ZADEH, University of California, Berkeley, Toward a computational theory of perceptions'],
 'Special Symposium on Learning Models Information Processing II .',
 ['HANS COLONIUS University of Oldenburg',
  'EHTIBAR N. DZHAFAROV, Purdue University, and HANS COLONIUS University of Oldenburg: Fechnerian metri

In [203]:
# Write the parsed program to Drive, leaving out the row index
smp1999_program_csv = "/content/drive/MyDrive/math_psych_work/smp1999_program.csv"
smp1999_df.to_csv(smp1999_program_csv, index=False)

## Abstracts

In [236]:
# Only the first num_pages pages of the abstracts PDF are read
num_pages = 1

# get_text('text') returns the plain text of a page as one string (no block
# tuples or coordinates), so the indices here are abstract_block_text[page_num].
# The context manager closes the PDF even if extraction raises part-way through.
with pymupdf.open(pdfs_path + 'smp1999_abstracts.pdf') as smp1999_abstracts:
  smp1999_abstract_pages = smp1999_abstracts[:num_pages]
  abstract_block_text = [page.get_text('text') for page in smp1999_abstract_pages]

In [237]:
# Show the raw extracted text; words come out split into fragments, one per line
abstract_block_text

['SMP\n\'99\n1\n32\nnd\nAnn\nual\nMeeting\nof\nthe\nSo\nciet\ny\nfor\nMathematical\nPsyc\nhology:\nAbstracts\n29\nJuly\nto\n1\nAugust,\n1999\nUniversity\nof\nCalifornia,\nSanta\nCruz\n\x0f\nF\nrida\ny\n,\n30\nJuly\n1\n.\n8:00am:\nChoice,\nMeasurement,\nand\nSt\na\ntistics\n\x0f\nA.\nA.\nJ.\nMarley\n,\nMcGil\nl\nUniversity,\nand\nR.\nDuncan\nLuce,\nUniversity\nof\nCalifornia,\nIrvine,\nA\nsimple\naxiomatization\nof\nbinary\nrank-dep\nenden\nt\nexp\nected\nutilit\ny\nof\ngains\n(losses).\nF\nor\nbinary\ngam\nbles\ncomp\nosed\nonly\nof\ngains\n(losses)\nrelativ\ne\nto\na\nstatus\nquo,\nthe\nrank-\ndep\nenden\nt\nexp\nected-utilit\ny\nmo\ndel\nwith\na\nrep-\nresen\ntation\nthat\nis\ndense\nin\nin\nterv\nals\nis\nsho\nwn\nto\nb\ne\nequiv\nalen\nt\nto\nten\nelemen\ntary\nprop\nerties\nplus\nev\nen\nt\ncomm\nutativit\ny\nand\na\ngam\nble\nparti-\ntion\nassumption.\nThe\npro\nof\nreduces\nto\na\n(diÆ-\ncult)\nfunctional\nequation\nthat\nhas\nb\neen\nsolv\ned\nb\ny\nAczel,\nMaksa,\nand\nP\nales

In [211]:
# Try turning the line breaks into spaces on a small sample: words that were
# split into fragments ("tiv" / "e") stay split
abs_test_string = 'tiv\ne\nto\nunilaterally\ndeviate.\nThat\nthe\nD/D\npair\nis\n'
' '.join(abs_test_string.split('\n'))

'tiv e to unilaterally deviate. That the D/D pair is '

In [209]:
# Collects the abstract text entries with their line breaks removed, printing
# each raw entry for inspection.
abstract_strings = []
for page_content in abstract_block_text:
  # get_text('text') gives one plain string per page, while get_text('blocks')
  # gives a list of tuples whose item at index 4 is the text. Accept both, so
  # this cell does not fail with an IndexError (indexing a single character)
  # when the page content is a plain string.
  if isinstance(page_content, str):
    entries = [page_content]
  else:
    entries = [entry_tuple[4] for entry_tuple in page_content]

  for entry in entries:
    # '.\n' becomes a space; every other line break is removed outright
    cleaned_entry = entry.replace('.\n', ' ').replace('\n', '')
    abstract_strings.append(cleaned_entry)
    print(entry)


SMP
'99
1

32

nd

Ann
ual
Meeting
of
the
So
ciet
y
for
Mathematical

Psyc
hology:
Abstracts

29
July
to
1
August,
1999

University
of
California,
Santa
Cruz


F
rida
y
,
30
July

1

.

8:00am:
Choice,
Measurement,
and

St
a
tistics


A.
A.
J.
Marley
,
McGil
l
University,
and

R.
Duncan
Luce,
University
of
California,

Irvine,
A
simple
axiomatization
of
binary

rank-dep
enden
t
exp
ected
utilit
y
of
gains

(losses).

F
or
binary
gam
bles
comp
osed
only
of
gains

(losses)
relativ
e
to
a
status
quo,
the
rank-

dep
enden
t
exp
ected-utilit
y
mo
del
with
a
rep-

resen
tation
that
is
dense
in
in
terv
als
is
sho
wn

to
b
e
equiv
alen
t
to
ten
elemen
tary
prop
erties

plus
ev
en
t
comm
utativit
y
and
a
gam
ble
parti-

tion
assumption.
The
pro
of
reduces
to
a
(diÆ-

cult)
functional
equation
that
has
b
een
solv
ed
b
y

Aczel,
Maksa,
and
P
ales
(submitted).


R.
Duncan
Luce
and
Rob
ert
Sneddon,
Univer-

sity
of
California,
Irvine,
Reduction
in
v
ari-

ance
and
Prelec's
w
eigh
ting
function.


# Scratch

### Testing on one table entry

In [20]:
# Parses one table entry into its title, authors and affiliations.
entry_dict = {}

# Two authors. Written as adjacent literals rather than a backslash
# continuation inside the string, which would pull the next line's indentation
# into the text.
test_string = ('A. A. J. MARLEY, McGill\nUniversity, and R. DUNCAN\nLUCE, '
               'University of California\nIrvine: A simple axiomatization\n'
               'of binary rank-dependent\nexpected utility of gains (losses)\n')
no_newlines = test_string.rstrip().replace('\n', ' ')
authors_and_title = no_newlines.split(':', 1)   # splits string by first instance of ':' into one part with
                                                # authors and affiliations, and another part with title
entry_dict['title'] = authors_and_title[1].lstrip()   # adds title w/o the extra space at the beginning

authors = []
affiliations = []
# Split on the standalone word " and ": a bare 'and' would also cut inside
# words that merely contain it (e.g. "Maryland"), and a single split would
# stop after the second author
split_authors = authors_and_title[0].split(' and ')

# Adds author and affiliation
for author in split_authors:
  # Comma-separated pieces, trimmed; empty pieces (left by a trailing comma) are dropped
  author_affiliation = [part.strip() for part in author.split(',') if part.strip()]

  for a in author_affiliation:
    # Author names are printed in all caps, affiliations are not
    if a.isupper():
      authors.append(a)
    else:
      affiliations.append(a)

entry_dict['author(s)'] = authors
entry_dict['affiliation(s)'] = affiliations


In [21]:
# Show the parsed result for the single test entry
entry_dict

{'title': 'A simple axiomatization of binary rank-dependent expected utility of gains (losses)',
 'author(s)': ['A. A. J. MARLEY', 'R. DUNCAN LUCE'],
 'affiliation(s)': ['McGill University', 'University of California Irvine']}

In [None]:
      # all_caps_word_count = 0
      # for word in words:
      #   if word.isupper() and word.isalpha():
      #     all_caps_word_count += 1
      # if all_caps_word_count > 1:
      #   print(entry.rstrip().replace('\n', ' '))

In [None]:
# Prints every italic text span in the program PDF, page by page.
# The PDF is opened again here because the handle used for text extraction is
# closed once extraction finishes, and a closed document cannot be read.
with pymupdf.open(pdfs_path + 'smp1999_program.pdf') as program_doc:
    for page_num in range(len(program_doc)):
        page = program_doc.load_page(page_num)
        text_dict = page.get_text("dict") # Get text as a dictionary

        for block in text_dict["blocks"]:
            # Only blocks that have a "lines" entry are scanned for spans
            for line in block.get("lines", []):
                for span in line["spans"]:
                    # Check if the italic flag (2) is set
                    if span["flags"] & pymupdf.TEXT_FONT_ITALIC:
                        print(f"Page {page_num + 1}, Text: '{span['text']}' is italic.")