In [2]:
# Imports: standard library (json, os) and third-party (pandas, tqdm)
import json
import os
import pandas as pd
from tqdm import tqdm

### Accessing the Art Institute of Chicago's data dump
The Art Institute of Chicago provides a downloadable data dump that you can search through to find what you want to access
before making any API calls. I looked through it to find artworks that I could use for classification.

In [3]:
# Path to the JSONL file (one JSON object per line)
file_path = 'allArtworks.jsonl'

# Read the JSONL file.
# - The encoding is explicit so the result does not depend on the platform
#   default (for example cp1252 on Windows); it matches the utf-8 used when
#   reading the per-artwork JSON files later in this notebook.
# - Blank lines (such as a trailing empty line) are skipped, because
#   json.loads('') raises json.JSONDecodeError.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Create a pandas DataFrame with one row per parsed JSON line
datadump_df = pd.DataFrame(data)

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would also print a stray "None".
datadump_df.info()

datadump_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132769 entries, 0 to 132768
Data columns (total 5 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   id                     132769 non-null  int64 
 1   title                  132768 non-null  object
 2   main_reference_number  132769 non-null  object
 3   department_title       126212 non-null  object
 4   artist_title           114074 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.1+ MB
None


Unnamed: 0,id,title,main_reference_number,department_title,artist_title
0,4,Priest and Boy,1880.1,Prints and Drawings,Lawrence Carmichael Earle
1,9,"Interior of St. Mark's, Venice",1887.232,Arts of the Americas,David Dalhoff Neal
2,11,Self-Portrait,1887.234,Arts of the Americas,Walter Shirlaw
3,16,The Fall of the Giants,1887.249,Prints and Drawings,Salvator Rosa
4,19,"View of Ponte Lugano on the Anio, from Views o...",1887.252,Prints and Drawings,Giovanni Battista Piranesi


In [4]:
# Tally artworks per department; value_counts() omits missing values by default
datadump_df["department_title"].value_counts()

department_title
Prints and Drawings                                  51217
Photography and Media                                23783
Arts of Asia                                         16375
Textiles                                             11547
Architecture and Design                               5913
Applied Arts of Europe                                5507
Arts of the Americas                                  4246
Arts of the Ancient Mediterranean and Byzantium       2195
Contemporary Art                                      1721
Arts of Africa                                        1493
Painting and Sculpture of Europe                      1382
Modern Art                                             463
AIC Archives                                           242
Ryerson and Burnham Libraries Special Collections      124
Modern and Contemporary Art                              4
Name: count, dtype: int64

Unfortunately, the main dataframe does not contain styles. However, the data dump also comes with
a JSON file for every piece of artwork. I load all of them into a dataframe and filter the artworks that way. I can use 'artwork_type_title'
to keep only paintings or prints, and then 'style_titles' and 'style_title' for the style of each artwork.

In [5]:
# Directory containing the per-artwork JSON files. Set the ARTIC_JSON_DIR
# environment variable to point at your own copy of the data dump; the
# original local path is kept as the fallback so existing setups still work.
json_dir = os.environ.get(
    "ARTIC_JSON_DIR",
    r"C:\Users\16148\Downloads\artic-api-data\json\artworks",
)

# Select only the .json entries up front so the progress bar total counts the
# files that are actually loaded, and sort them because os.listdir() returns
# entries in arbitrary order (sorting makes the row order reproducible).
json_filenames = sorted(
    filename for filename in os.listdir(json_dir) if filename.endswith(".json")
)

# Parsed contents of each JSON file, in the order the files are read
data_list = []

for filename in tqdm(json_filenames, desc="Loading files"):
    file_path = os.path.join(json_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            # Load the JSON data and append it to our list
            data_list.append(json.load(file))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Best effort: report the unreadable file and keep going, so a
            # single bad file does not abort the whole (long-running) load.
            print(f"Error decoding JSON in file: {filename}")

# Create a DataFrame from the list of JSON data
all_df = pd.DataFrame(data_list)

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would also print a stray "None".
all_df.info()
all_df.head()

Loading files: 100%|██████████| 132769/132769 [39:05<00:00, 56.61it/s]   


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132769 entries, 0 to 132768
Data columns (total 99 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   id                             132769 non-null  int64  
 1   api_model                      132769 non-null  object 
 2   api_link                       132769 non-null  object 
 3   is_boosted                     132769 non-null  bool   
 4   title                          132768 non-null  object 
 5   alt_titles                     1800 non-null    object 
 6   thumbnail                      118916 non-null  object 
 7   main_reference_number          132769 non-null  object 
 8   has_not_been_viewed_much       132769 non-null  bool   
 9   boost_rank                     20 non-null      float64
 10  date_start                     127671 non-null  float64
 11  date_end                       127671 non-null  float64
 12  date_display                  

Unnamed: 0,id,api_model,api_link,is_boosted,title,alt_titles,thumbnail,main_reference_number,has_not_been_viewed_much,boost_rank,...,video_ids,text_ids,section_ids,section_titles,site_ids,suggest_autocomplete_all,source_updated_at,updated_at,timestamp,suggest_autocomplete_boosted
0,100,artworks,https://api.artic.edu/api/v1/artworks/100,False,Simon Vouet,,"{'lqip': 'data:image/gif;base64,R0lGODlhBAAFAP...",1887.716,False,,...,[],[],[],[],[],"[{'input': ['1887.716'], 'contexts': {'groupin...",2022-09-21T19:18:44-05:00,2023-12-21T00:56:04-06:00,2024-10-20T00:29:18-05:00,
1,10000,artworks,https://api.artic.edu/api/v1/artworks/10000,False,Metamorphosis,,"{'lqip': 'data:image/gif;base64,R0lGODlhBAAFAP...",1959.167,False,,...,[],[],[],[],[],"[{'input': ['1959.167'], 'contexts': {'groupin...",2024-02-01T17:36:16-06:00,2024-10-19T23:23:12-05:00,2024-10-20T00:34:09-05:00,
2,100000,artworks,https://api.artic.edu/api/v1/artworks/100000,False,Commodus,,,1982.2082,True,,...,[],[],[],[],[],"[{'input': ['1982.2082'], 'contexts': {'groupi...",2023-05-08T18:45:08-05:00,2024-10-19T23:25:06-05:00,2024-10-20T01:21:29-05:00,
3,100001,artworks,https://api.artic.edu/api/v1/artworks/100001,False,Commodus,,,1982.2083,True,,...,[],[],[],[],[],"[{'input': ['1982.2083'], 'contexts': {'groupi...",2023-05-08T18:45:08-05:00,2024-10-19T23:25:06-05:00,2024-10-20T01:21:29-05:00,
4,100002,artworks,https://api.artic.edu/api/v1/artworks/100002,False,Commodus,,,1982.2084,True,,...,[],[],[],[],[],"[{'input': ['1982.2084'], 'contexts': {'groupi...",2023-05-08T18:45:08-05:00,2024-10-19T23:25:06-05:00,2024-10-20T01:21:29-05:00,


In [7]:
# Write the combined artworks DataFrame to a CSV file, leaving out the index column
all_df.to_csv(path_or_buf="all_artworks_aic.csv", index=False)