In [None]:
#start with importing the required libraries 
import json
import math
import os
import re
import time
from pathlib import Path

import chardet
import chromadb
import openai as oi
import pandas as pd
import requests
from bs4 import BeautifulSoup
from chromadb.utils import embedding_functions
from IPython.display import Image

#creating global variables for further use 
#the OpenAI key is read from the OPENAI_API_KEY environment variable so that it does not have to be
#stored in the notebook; the placeholder is kept as the fallback when the variable is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_KEY")
#sentinel passed to update_file_entry to reset every file status to 0
RESET_FILE_STATUS = -1
#MET department that is fetched and the name used for its files and its collection
deptID = 10
deptName = "Egyptian_Art"
#search term sent to the MET search endpoint
query = "egypt"
#folder that holds the partition files, the images and the reference file of this department
root_path = "LOCAL_STORAGE_PATH"+str(deptID)
#number of objects stored in each partition file
stride = 1000 

In [None]:
#turn the string form of a value into text closer to parseable json
def parseClause(value):
    """Return str(value) with every square bracket removed and every single quote turned into a double quote."""
    #one translation pass: brackets are dropped, single quotes become double quotes
    substitutions = str.maketrans({"[": None, "]": None, "'": "\""})
    return str(value).translate(substitutions)

In [None]:
#call OpenAI text embedding 
def text_embedding(text):
    """Embed `text` with the text-embedding-ada-002 model.

    text: a single string, or a list of strings.
    Returns the embedding vector when `text` is a single string, otherwise a list with
    one embedding vector per input, in the order of the response data.
    """
    openai_client = oi.OpenAI(api_key = OPENAI_API_KEY)
    response = openai_client.embeddings.create(model="text-embedding-ada-002", input=text)
    #the response was previously dropped, so callers always received None
    embeddings = [item.embedding for item in response.data]
    if isinstance(text, str):
        return embeddings[0]
    return embeddings

In [None]:
#parse the file statuses as a data structure (dictionary) from the reference 
def parse_file_status(): 
    """Read the reference file of the department into a dictionary.

    Each non-blank line is expected to look like "<fileName>, <status>".
    Returns a dictionary that maps the counter parsed from the file name (a string, see
    getCounter) to the integer status. Raises ValueError for a line that has no comma or
    whose status is not an integer.
    """
    status_dictionary = {}
    #the with block closes the file even when a line cannot be parsed
    with open(root_path+"/"+deptName+"_Ref.txt","r") as file_status_file:
        for line in file_status_file:
            #blank lines carry no entry and would break the unpacking below
            if not line.strip():
                continue
            #split at the last comma so the status is always the final field
            fileName, fileStatus = line.rsplit(",", 1)
            status_dictionary[getCounter(fileName)] = int(fileStatus)
    return status_dictionary 

In [None]:
#parse the counter (used as a partitioning index) from the fileName
def getCounter(fileName):
    """Return the text before the first "." of fileName with every occurrence of deptName removed."""
    stem, _, _ = fileName.partition(".")
    return stem.replace(deptName, "")

In [None]:
#get the file status of the file with the specified counter/partition
def get_file_entry(counter): 
    """Tell whether the reference file marks the partition `counter` as loaded.

    counter: the counter as returned by getCounter (a string).
    Returns True when the first entry for `counter` has status 1, and False when it has
    any other status or when the reference file has no entry for `counter`.
    """
    #the with block closes the file on every path, including when no entry matches
    with open(root_path+"/"+deptName+"_Ref.txt","r") as file_status_file:
        for file_status in file_status_file:
            #blank lines carry no entry
            if not file_status.strip():
                continue
            fields = file_status.split(",")
            if(getCounter(fields[0]) == counter):
                return int(fields[1]) == 1
    #no entry for this counter: the file has not been loaded
    return False

In [None]:
#mark one partition as loaded in the reference file, or reset every partition
def update_file_entry(counter) : 
    """Rewrite the reference file with an updated status.

    counter: the counter of the partition to mark as loaded (status 1), or
             RESET_FILE_STATUS to write status 0 for every partition.
    Every entry is written back as "<deptName><counter>.0.csv, <status>".
    """
    refFileName = root_path+"/"+deptName+"_Ref.txt"
    #the statuses are read before the file is reopened for writing, which truncates it
    parsed_status = parse_file_status()
    resetting = (counter == RESET_FILE_STATUS)
    if not resetting:
        parsed_status[counter] = 1                 
    prefix = deptName
    suffix = ".0.csv"
    with open(refFileName,"w") as status_file:
        for key, status in parsed_status.items():
            written_status = 0 if resetting else status
            #"\n" and not os.linesep: text mode already translates "\n" to the platform line
            #ending, so os.linesep would be written as "\r\r\n" on Windows
            status_file.write(prefix+str(key)+suffix+", "+str(written_status)+"\n")

In [None]:
#creates a reference file that stores the load status of each partition file
def createRefFile():
    """Write the reference file with one "<fileName>, 0" line per partition file.

    The .csv files found under <root_path>/Files/ are listed in sorted order. An existing
    reference file is overwritten, so every status goes back to 0.
    """
    #only the .csv partition files get an entry, other directory content is ignored
    list_of_files = sorted(name for name in os.listdir(root_path+"/Files/") if name.endswith(".csv"))
    with open(root_path+"/"+deptName+"_Ref.txt", "w") as f:
        for file in list_of_files: 
            #"\n" and not os.linesep: text mode already translates "\n" to the platform line
            #ending, so os.linesep would be written as "\r\r\n" on Windows
            f.write(str(file)+", 0\n")

In [None]:
#open the on-disk chromadb store and call its heartbeat
client = chromadb.PersistentClient(path="<PATH_TO_CHROMADB>")
client.heartbeat()

#embedding function handed to the collections, it embeds with the OpenAI ada-002 model
from chromadb.utils import embedding_functions
openai_ef = embedding_functions.OpenAIEmbeddingFunction(api_key=OPENAI_API_KEY, model_name="text-embedding-ada-002")

In [None]:
#fetch the objects of the configured department that match the query from the MET API and
#store the raw JSON of each object in partition files of `stride` objects each
url = "https://collectionapi.metmuseum.org/public/collection/v1/search?departmentId="+str(deptID)+"&q="+query
payload = {}
headers = {'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate'}

#the fetch stops at the first objectID at or above this value (the IDs are sorted ascending)
MAX_OBJECT_ID = 544725

#start the GET request to fetch the list of objectIDs 
response1 = requests.request("GET", url, data=payload, headers = headers)
#fail here with the HTTP status instead of later with a json parsing error
response1.raise_for_status()

#objectIDs can be null when the search has no match, it is treated as an empty list
objectIDs = json.loads(response1.text)['objectIDs'] or []
objectIDs.sort()

print("Total number of Objects is "+str(len(objectIDs)))

Path(root_path).mkdir(parents=True, exist_ok=True)
Path(root_path+"/Images/").mkdir(parents=True, exist_ok=True)
Path(root_path+"/Files/").mkdir(parents=True, exist_ok=True)

#iterate over all objects fetched 
root_file_path = root_path+"/Files/"+deptName
file = open(root_file_path+"0.0.csv","w")
x=0
print("Starting the for loop")
try:
    for objectID in objectIDs: 
        if(objectID>=MAX_OBJECT_ID): 
            break
        #every `stride` objects the current partition file is closed and the next one is started
        if(x%stride==0 and x!=0): 
            print(x)
            file.close()
            #x/stride is a float, which gives the names <deptName>1.0.csv, <deptName>2.0.csv, ...
            #the reference file helpers rely on this ".0.csv" suffix
            file_path = root_file_path+str(x/stride)+".csv"
            #mode "w" truncates a partition file left by an earlier run
            file = open(file_path,"w")
        #sleep for 1 second so as not to overwhelm the server with requests 
        #time.sleep(1)
        x= x+1
        #start the get request to fetch this object details 
        url = 'https://collectionapi.metmuseum.org/public/collection/v1/objects/'+str(objectID)
        payload = {}
        response = requests.request("GET", url, headers=headers, data=payload)
        #write the response to the file followed by the literal delimiter ",/n"
        #the loading cell splits the file content on exactly this text, so it must not change
        file.write(response.text+",/n")
finally:
    #the partition file is closed even when a request fails in the middle of the loop
    file.close()


In [None]:
#create the reference file: one line per file found under Files/, each with status 0
#the file is rewritten from scratch, so running this cell again resets every status
createRefFile()

In [None]:
#optionally delete the collection and reset the file statuses in the reference file in case of a pause 
#(uncomment both lines: the first deletes the collection, the second writes status 0 for every file)
#client.delete_collection(name = deptName)
#update_file_entry(RESET_FILE_STATUS)

#create the collection, or get the existing one when resuming after a pause 
collection = client.get_or_create_collection(name=deptName, embedding_function=openai_ef)

In [None]:
#create the headers for a browser get request call 
headers = {
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate'
}

#set to True to also describe the tags and the constituents of each artwork (not used for this dept.)
INCLUDE_TAGS_AND_CONSTITUENTS = False
#number of times a throttled add to the collection is retried before the file is given up for this run
MAX_ADD_RETRIES = 3

#optional fields of an artwork and the sentence opening used for each of them in the document
OPTIONAL_CLAUSES = (
    ('artistDisplayName', "It is the work of "),
    ('accessionYear', "It resides at the MET in NYC, it was included to the collection on "),
    ('department', "It is part of the department of "),
    ('period', "It is part of the period of "),
    ('dynasty', "It is part of the dynasty of "),
    ('reign', "It is part of the reign of "),
    ('objectDate', "It was created on "),
)


def _field(record, key):
    """Return record[key] as a stripped string, or "" when the key is missing or its value is None."""
    value = record.get(key)
    return "" if value is None else str(value).strip()


def _describe_artwork(record):
    """Build the document text that is embedded for one artwork record (a parsed json object).

    The title sentence is always present; one sentence is added for every optional field
    that is not empty. Tags and constituents are added only when
    INCLUDE_TAGS_AND_CONSTITUENTS is True.
    """
    sentences = ["The title of this artwork is "+_field(record, 'title')+"."]
    for key, opening in OPTIONAL_CLAUSES:
        value = _field(record, key)
        if value:
            sentences.append(opening+value+".")
    if INCLUDE_TAGS_AND_CONSTITUENTS:
        tags = [tag for tag in (record.get('tags') or []) if tag]
        if tags:
            sentences.append("It is described by the tags: "+", ".join(
                str(tag.get('term'))+" which has this reference "+str(tag.get('AAT_URL'))
                for tag in tags)+".")
        constituents = [constituent for constituent in (record.get('constituents') or []) if constituent]
        if constituents:
            sentences.append("It is associated with these constituents: "+", ".join(
                str(constituent.get('name'))+" with the role of "+str(constituent.get('role'))
                +" which has this reference "+str(constituent.get('constituentULAN_URL'))
                for constituent in constituents)+".")
    return " ".join(sentences)


def _retry_delay_seconds(message):
    """Return the wait in seconds asked for by a "Please try again in 7m12s" style message.

    Returns None when the message holds no such hint.
    """
    hint = re.search(r"Please try again in ([0-9.hms]+)", message)
    if hint is None:
        return None
    unit_seconds = {"h": 3600.0, "m": 60.0, "s": 1.0, "ms": 0.001}
    #"ms" is listed before "m" so that milliseconds are not read as minutes
    parts = re.findall(r"(\d+(?:\.\d+)?)(ms|h|m|s)", hint.group(1))
    if not parts:
        return None
    return sum(float(amount)*unit_seconds[unit] for amount, unit in parts)


def _add_with_retry(target_collection, documents, metadatas, ids):
    """Add the documents to the collection, waiting and retrying when the embedding call is throttled.

    Returns True when the add succeeded, False when it failed with an error that is not a
    throttle or when it was still throttled after MAX_ADD_RETRIES retries.
    """
    for attempt in range(MAX_ADD_RETRIES+1):
        try:
            target_collection.add(documents = documents, metadatas = metadatas, ids = ids)
            return True
        except Exception as e:
            wait_seconds = _retry_delay_seconds(str(e))
            if wait_seconds is None or attempt == MAX_ADD_RETRIES:
                print(e)
                return False
            print("Waiting "+str(wait_seconds)+"s to start")
            time.sleep(wait_seconds)


def _download_image(image_url, file_path):
    """Stream the image at image_url into file_path.

    Nothing is stored when the server does not answer OK. The data goes to a temporary
    ".part" file first so that an interrupted download does not leave a file that would
    be taken for an already stored image on the next run.
    """
    image_response = requests.get(image_url, stream=True, headers = headers)
    try:
        if not image_response.ok:
            print(str(image_response))
            return
        partial_path = file_path+".part"
        with open(partial_path, 'wb') as handle:
            for block in image_response.iter_content(1024):
                if not block:
                    break
                handle.write(block)
        os.replace(partial_path, file_path)
    finally:
        image_response.close()


collection = client.get_collection(name=deptName, embedding_function=openai_ef)

#only the .csv partition files are loaded, in sorted order so that every run goes through them the same way
fileList = sorted(name for name in os.listdir(root_path+"/Files/") if name.endswith(".csv"))

for fileName in fileList:
    try:
        #the counter is the partitioning index, it is also used for the image folders
        counter = getCounter(fileName)
        print("Counter: "+str(counter))
        if(get_file_entry(counter)==True):
            print("file has been loaded "+str(counter))
            continue

        #read the file and split it at the ",/n" delimiter written by the fetch cell
        with open(root_path+"/Files/"+fileName,"r") as current_file:
            artworks = current_file.read().split(",/n")

        #parsed artworks, the metadatas for the collection and the rows of the documents dataframe
        records = []
        list_of_dictionaries = []
        document_rows = []

        for artwork in artworks: 
            artwork = artwork.strip()

            #pass the invalid inputs 
            if (artwork == "{\"message\":\"Not a valid object\"}" or len(artwork)==0): 
                continue

            #a record that cannot be parsed is skipped; before, the previous artwork was silently used again
            try: 
                json_ = json.loads(artwork)
            except ValueError as e: 
                print("Skipping a record of "+fileName+" that is not valid json: "+str(e))
                continue
            if not isinstance(json_, dict) or 'objectID' not in json_:
                print("Skipping a record of "+fileName+" that has no objectID")
                continue

            currentObjectID = json_['objectID']
            records.append(json_)

            #include the new metadata record
            list_of_dictionaries.append({'part_index': counter, 'id': currentObjectID})

            #formulate the document to be used by the vector db 
            document_rows.append({'part_index': counter,
                                  'objectID': str(currentObjectID),
                                  'objectDesc': _describe_artwork(json_),
                                  'image': json_.get('objectURL')})

            #download the image when applicable in a folder that has the name of the part_index
            #a failed download is reported and does not stop the load of the file
            try: 
                currentURL = json_.get('primaryImage')
                if currentURL:
                    pathName = root_path+"/Images/P"+str(counter)
                    Path(pathName).mkdir(parents=True, exist_ok=True)
                    filePath = pathName+"/"+str(currentObjectID)+'.jpg'
                    #an image that has been stored already is not fetched again
                    if not Path(filePath).exists(): 
                        _download_image(currentURL, filePath)
            except Exception as e: 
                print(e)

        #nothing to add for this file: mark it as loaded so that it is not read again
        if not records:
            print("No valid artwork in "+fileName)
            update_file_entry(counter)
            continue

        #dataframes of the current file, built in one go instead of one concat per artwork
        d_artwork = pd.DataFrame(records)
        d_artwork.insert(0, 'part_index', counter)
        d_artwork_documents = pd.DataFrame(document_rows, columns = ['part_index','objectID','objectDesc', 'image'])

        #add the current stride of artworks to the vector db
        loaded = _add_with_retry(collection,
                                 d_artwork_documents['objectDesc'].tolist(),
                                 list_of_dictionaries,
                                 d_artwork_documents['objectID'].tolist())

        #the file is marked as loaded only when the add succeeded, otherwise the next run picks it up again
        if loaded:
            update_file_entry(counter)
        else:
            print("file has not been loaded "+str(counter))

    except Exception as e:
        print("Error while loading "+fileName)
        print(e)
