In [1]:
import requests
import json
import pickle

In [2]:
base_url = "https://archives.jacobspillow.org/admin/"

with open('login', 'r') as f:
    lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

Logging in with credentials in order to retrieve information

In [5]:
# Login query
query = """
query {
  login(username: "%s", password: "%s") {
    jwt
    refresh
    user {
      id
      fname
      lname
      email
    }
  }
}
""" % (username, password)

# Send login request

headers = {"Content-Type": "application/json", "Accept": "application/json"}
data = json.dumps({"query": query})
response = requests.post(f"{base_url}/service/Auth", headers=headers, data=data)

# Check for successful response
if response.status_code == 200:
   data = json.loads(response.text)
   jwt = data["data"]["login"]["jwt"]
else:
  print(f"Error logging in: {response.text}")

In [6]:
def find_all_entities(entities_types):
  """
  Searches for entities of specified types and returns their preferred labels.

  Args:
      entities_types (str): Comma-separated string of entity types to search for.

  Returns:
      list: List of entity preferred labels (display names).
  """

  query_browse_entity = f"""
  query {{
    search(
      table: "ca_entities",
      search: "a",
      restrictToTypes: ["{entities_types}"],
      bundles: ["ca_entities.preferred_labels.displayname"],
      start: 0
    ) {{
      result {{
        id,
        table,
        idno,
        bundles {{
          code,
          name,
          dataType,
          values {{
            value,
            locale
          }}
        }}
      }}
    }}
  }}
  """

  headers = {
      "Content-Type": "application/json",
      "Accept": "application/json",
      "Authorization": f"Bearer {jwt}"
  }

  data = json.dumps({"query": query_browse_entity})
  response = requests.post(f"{base_url}/service/Search", headers=headers, data=data)

  if response.status_code == 200:
    try:
      parsed_data = json.loads(response.content)
      tables = parsed_data.get("data", {}).get("search", {}).get("result", [])
      print(tables)
      return [table["bundles"][0]["values"][0]["value"] for table in tables]
    except (KeyError, json.JSONDecodeError) as e:
      print(f"Error parsing response: {e}")
      return []  # Handle parsing errors gracefully

  else:
    print(f"Error logging in: {response.text}")

  return []  # Explicitly return empty list on errors for clarity

In [7]:
find_all_entities("individual")

[{'id': 27024, 'table': 'ca_entities', 'idno': 'A. Epstein', 'bundles': [{'code': 'ca_entities.preferred_labels.displayname', 'name': 'Display name', 'dataType': 'Text', 'values': [{'value': 'A. Epstein', 'locale': 'en_US'}]}]}, {'id': 27103, 'table': 'ca_entities', 'idno': 'A. John Geraci', 'bundles': [{'code': 'ca_entities.preferred_labels.displayname', 'name': 'Display name', 'dataType': 'Text', 'values': [{'value': 'A. John Geraci', 'locale': 'en_US'}]}]}, {'id': 51, 'table': 'ca_entities', 'idno': 'Reeves, John A.', 'bundles': [{'code': 'ca_entities.preferred_labels.displayname', 'name': 'Display name', 'dataType': 'Text', 'values': [{'value': 'Reeves, John A.', 'locale': 'en_US'}]}]}, {'id': 60, 'table': 'ca_entities', 'idno': 'Gates, Alice A.', 'bundles': [{'code': 'ca_entities.preferred_labels.displayname', 'name': 'Display name', 'dataType': 'Text', 'values': [{'value': 'Gates, Alice A.', 'locale': 'en_US'}]}]}, {'id': 75, 'table': 'ca_entities', 'idno': 'Nagler, A. M.', 'bund

['A. Epstein',
 'A. John Geraci',
 'Reeves, John A.',
 'Gates, Alice A.',
 'Nagler, A. M.',
 'Twysden, A. E.',
 'Hall, A. George',
 'Franks, A. H.',
 'Palme, A.',
 'Richard A. Long',
 'Flint, Janet A.',
 'Phillips, Patricia A.',
 'Smith, Ralph A.',
 'Harris, Jane A.',
 'Ewing, William A',
 'Joost A. M. Meerloo',
 'Rowe, Patricia A.',
 'Mansfield, Evelyn A.',
 'Papageorgiou, C. A.',
 'A. J. Pischl',
 'Zorn, Friedrich A.',
 'Clark, VeVe A.',
 'Logan, Gene A.',
 'Coton, A. V.',
 'Westrup, J. A.',
 'Bernard A. Drew',
 'Thurston, H. A.',
 'Loeffler, Pauline A.',
 'Wilson, A. E.',
 'Steiner, George A.',
 'Pokrovsky, Boris A.',
 'David, Martin A.',
 'A. E. Johnson',
 'W. A. Propert',
 'Miller, Kamae A., ed.',
 'Eyes Of A Blue Dog Dance Theater',
 'Camille A. Brown',
 'Samuel A. Miller',
 'William A. Ewing',
 'Susan A. Manning',
 'Timothy A. Fischer',
 'George A. Lawrence',
 'Charles A Doran',
 'Jennifer A. Cooper',
 'A. K. Salim',
 'A. Pla Alvarez',
 'A. Roger Smith',
 'A. Beriot',
 'A. Leroy

In [33]:
# grabbing all companies entities as well as individuals entities

companies_names = find_all_entities("organization")
entities_names = find_all_entities("individual")

In [32]:
print(companies_names)

['American Dance Therapy Association', 'Dance Heritage Coalition', 'American Dance Festival', 'Sir Isaac Pitman &amp; Sons, Ltd.', 'Ballet National De Espana', 'Eine Ausstellung des U. S. Informationsdienstes', 'American Square Dance Society', 'Chicago Dance Coalition', 'Sasaki Associates', 'Acadmie Des Sciences De Bulgarie', 'Dance/USA', "Association Francaise d'Action Artistique", 'Denig Design Associates', "Jacob's Pillow Dance Festival", 'Blue Mountain Arts Collection', 'Dance Films Association, Inc.', 'Victor Talking Machine Company', 'National Endowment for the Arts', 'Kaplan, Barbara Paige', 'American Council for the Arts', 'Academy of Dance on Film', 'Melvin Ballou Gilbert', 'American Association of Museums', 'American Theatre Planning Board, Inc.', "Theatre National de l'Opera de Paris", 'New York City Ballet', 'Le Concours Choregraphique International de Bagnolet', 'Language of Dance Center', 'Language of Dance Association', 'Dana Press', 'Harwood Academic Publishers', 'Alexa

In [34]:
print(entities_names)

['Canner, Norma', 'Silbermann, Alphons', 'Lucy Venable', 'Fred Berk', 'Kostrovitskaya, Vera S.', 'Walter Terry', 'Rennert, Jack', 'Baer, Nancy Van Norman', 'George Balanchine', 'Mason, Francis', 'Dominic, Zoe', 'Winkler-Betzendahl, Madeline', 'Duke University', 'Levine, Mindy', 'Whang, Vanessa', 'Yaniv Cohen', 'Doris Hering', 'Chailley, Jacques', 'Clement Crisp', 'Sainsbury, Anya', 'Williams, Peter', 'Merica Briffa', 'Jane Sherman', 'Archer, Cathaline Alford', 'Mitchell J. Mulholland', 'Brahms, Caryl', 'Simon, S.J.', 'The Museum of Broadcasting', 'Wiley, Roland John', 'Ken Mandelbaum', 'Hartnoll, Phyllis', 'Margot Fonteyn', 'Ortegel, Sister Adelaide', 'Holly Brubach', 'Annie Leibovitz', 'G.B.L. Wilson', 'Gay Morris', 'Menuhin, Diana Gould', 'Hartley, Lee Ann', 'Hayes, Elizabeth R.', 'Scoville, Jon', 'White, Kenneth', 'Preston, Valerie F.L.G', 'Hagood, Thomas K.', 'Macfall, Haldane', 'Ann Hutchinson Guest', 'Freedley, George', 'Reeves, John A.', 'Mary Grace Swift', 'Fait, Hollis F.', 'S

Checking out to see whether there's an intersection between entities names and companies names

It is much common for companies entities to be submitted as individual entities by mistakes

In [35]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

print(intersection(entities_names, companies_names))

['Ballet Theatre', "Lotte Goslar's Pantomime Circus", 'Bob Berky', 'Donald Byrd', 'Beppie Blankert', 'Ryoko Kudo', 'Billy Tipton Memorial Saxophone Quartet', 'Earnest T. Morgan', 'Rosy Co', 'Mia Michaels', 'Airjazz', 'Kathy Rose', 'Pink Inc.', 'Indo-American Dance Group', 'Manhattan Festival Ballet', 'The National Ballet', 'Batoto Yetu', 'CoisCeim Dance Theatre', 'Harkness Ballet', 'Toronto Dance Theatre', 'Bludance Theatre', 'Merry-Go-Rounders', 'Pittsburgh Ballet Theatre', 'Les Ballet Jazz De Montreal', 'Complexions Contemporary Ballet', 'Tania Perez-Salas Compania De Danza', 'Groupe Emile Dubois', 'Cantarella School of Dance']


In [41]:
# saving all entities name
pickle.dump(entities_names, open("all_indivs.pickle", 'wb')) 
pickle.dump(companies_names, open("all_comps.pickle", 'wb')) 