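This cell loads the required libraries, sets the directory and endpoint configuration, parses the dive log, and defines the taxon lookup helpers.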

In [None]:
from exif import Image
from iptcinfo3 import IPTCInfo
import os
import xml.etree.ElementTree as ET
from datetime import datetime
from datetime import timedelta
import requests
import json
import shutil
import time
import zipfile

# Directory names used by the cells below, relative to the working directory.
thumbsDir = 'thumbs'
fullDir = 'full'
logDir = 'logs'
observationDir = 'observations'
speciesDir = 'species'
outputDir = "output"
# Parse the UDDF dive log once; later cells query it through `root`.
diveLog = ET.parse(logDir + '/' + 'log.uddf')
# Written as `recordedByID` into every observation file.
ORCID = "https://orcid.org/0000-0002-3639-2080"
root = diveLog.getroot()
# NOTE(review): the lookup helpers define their own local `worms_endpoint`
# (with a trailing slash); this module-level value does not appear to be read
# anywhere in the visible code.
worms_endpoint = "https://www.marinespecies.org/rest"
# Ranks ordered from broadest to narrowest. The index of a taxon's rank in this
# list decides whether family (>= 4), genus (>= 5) and species (>= 6) fields
# are filled in.
taxonHierarchy = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]

def getWoRMSAphiaID(taxon):
    """Look up the WoRMS AphiaID for a taxon name.

    ``AphiaIDByName`` is queried first. When it answers HTTP 206, the name is
    looked up again through ``AphiaRecordsByName`` and the AphiaID of a record
    whose status is "accepted" is used (the last such record wins).

    Args:
        taxon: Scientific name to search for.

    Returns:
        The AphiaID: the response text on a direct hit, or the ``AphiaID``
        value of the accepted record from the fallback. An empty string is
        returned when no ID could be determined.
    """
    import requests
    from urllib.parse import quote

    worms_endpoint = "https://www.marinespecies.org/rest/"
    # Percent-encode the name: it is placed in the URL path, so spaces and
    # reserved characters such as '/', '?' or '#' must not be sent verbatim.
    encodedTaxon = quote(taxon, safe="")
    aphiaID = ""

    url = worms_endpoint + "AphiaIDByName/" + encodedTaxon + "?marine_only=true"
    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.request("GET", url, timeout=30)
    if response.status_code == 200:
        aphiaID = response.text
    elif response.status_code == 206:
        url = worms_endpoint + "AphiaRecordsByName/" + encodedTaxon + "?marine_only=true&like=false"
        response = requests.request("GET", url, timeout=30)
        if response.status_code == 200:
            for record in response.json():
                if record["status"] == "accepted":
                    aphiaID = record["AphiaID"]
    return aphiaID

def getWoRMSTaxonData(taxon):
    """Return the raw WoRMS ``AphiaRecordByAphiaID`` response body for a taxon.

    The taxon name is first resolved to an AphiaID with ``getWoRMSAphiaID``;
    the body of the record request is returned as text, whatever the HTTP
    status was.
    """
    import requests

    record_url = "{}{}{}?marine_only=true".format(
        "https://www.marinespecies.org/rest/",
        "AphiaRecordByAphiaID/",
        getWoRMSAphiaID(taxon),
    )
    return requests.request("GET", record_url).text

def getEOLID(taxon):
    """Look up the Encyclopedia of Life page ID for a taxon name.

    Args:
        taxon: Name to search for (exact match is requested from the API).

    Returns:
        The ``id`` of the first search result, or 0 when the request did not
        return HTTP 200 or produced no results.
    """
    import requests
    from urllib.parse import quote

    eol_endpoint = "https://eol.org/api/"
    operation = "search/1.0.json?page=1&key=&exact=true&q="
    # Encode the name so characters such as '&' or '#' cannot alter the query.
    url = eol_endpoint + operation + quote(taxon, safe="")

    # A timeout keeps the notebook from hanging forever on a stalled request.
    response = requests.request("GET", url, timeout=30)
    if response.status_code != 200:
        return 0

    data = response.json()
    results = data.get("results") or []
    # Check the list itself too, so an inconsistent response cannot raise.
    if data.get("totalResults", 0) > 0 and results:
        return results[0]["id"]
    return 0

def getAWikipediaURL( hierarchy ):
    """Return the URL of the first existing English Wikipedia page for a taxon.

    Args:
        hierarchy: Page titles to try, in order of preference. Blank entries
            are skipped, since callers pass "" for ranks that do not apply.

    Returns:
        ``fullurl`` of the first page that exists, or "" when none does.
    """
    import time
    import wikipediaapi
    wiki = wikipediaapi.Wikipedia('en')
    for taxon in hierarchy:
        if not taxon:
            # No title to look up; avoid a pointless request and sleep.
            continue
        page = wiki.page(taxon)
        if page.exists():
            return page.fullurl
        # Brief pause between lookups before trying the next candidate.
        time.sleep(0.1)
    return ""

Spit out photo metadata

In [None]:
# Print the IPTC object name, EXIF description, capture time and artist of
# every non-hidden file in fullDir.
with os.scandir(fullDir) as it:
    for entry in it:
        if entry.name.startswith('.') or not entry.is_file():
            continue
        with open(fullDir + '/' + entry.name, 'rb') as photo:
            thisPhotoEXIF = Image(photo)
            thisPhotoIPTC = IPTCInfo(photo)
            print(thisPhotoIPTC["object name"])
            occurrenceRemark = ""
            try:
                occurrenceRemark = thisPhotoEXIF.get("image_description")
            except Exception:
                # Best effort: report and carry on, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would.
                print("Couldn't extract image description")
            print(occurrenceRemark)
            # Use get() so a photo without these tags is reported instead of
            # aborting the whole listing.
            takenAt = thisPhotoEXIF.get("datetime_original")
            if takenAt is None:
                print(entry.name + " has no capture time")
                continue
            artist = thisPhotoEXIF.get("artist")
            # EXIF dates use ':' in the date part; swap the first two for '-'
            # so the value parses as ISO format.
            photoDateTime = datetime.fromisoformat(takenAt.replace(":", "-", 2))
            print(entry.name + ' was taken at ' + photoDateTime.isoformat() + " by " + (artist if artist is not None else "unknown"))

Grab latitude and longitude for a given siteID

In [None]:
# Look up the name and coordinates of one dive site by its id.
siteID = "b88e3083"
siteName = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}name").text
latitude = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}geography/{http://www.streit.cc/uddf/3.2/}latitude").text
# The longitude element must be stored in `longitude`, not in `latitude` again.
longitude = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}geography/{http://www.streit.cc/uddf/3.2/}longitude").text
print(siteName + " is at " + latitude + ", " + longitude)

Get dive site name for all dives

In [None]:
# Print every dive with the name of the site it links to.
for dive in root.iter('{http://www.streit.cc/uddf/3.2/}dive'):
    diveID = dive.get('id')
    link = dive.find('{http://www.streit.cc/uddf/3.2/}informationbeforedive/{http://www.streit.cc/uddf/3.2/}link')
    # A dive may have no link element or no ref attribute at all.
    siteID = link.get('ref') if link is not None else None
    siteNameElement = None
    # Compare by value/truthiness: `is not ""` tests object identity.
    if siteID:
        siteNameElement = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}name")
    if siteNameElement is not None:
        siteName = siteNameElement.text
    else:
        siteName = "no logged location"
    print('Dive ' + diveID + ' (' + siteName + ")")
    

Output details of all dive sites

In [None]:
# For each site: print its id, each of its names, then "latitude, longitude"
# for each geography element, followed by an empty line.
for site in root.iter('{http://www.streit.cc/uddf/3.2/}site'):
    print(site.get('id'))
    for name in site.iterfind('{http://www.streit.cc/uddf/3.2/}name'):
        print(name.text)
    for geography in site.iterfind('{http://www.streit.cc/uddf/3.2/}geography'):
        latitude = geography.find('{http://www.streit.cc/uddf/3.2/}latitude').text
        longitude = geography.find('{http://www.streit.cc/uddf/3.2/}longitude').text
        coordinates = latitude + ', ' + longitude
        print(coordinates)
    print()

Match time stamp to a particular dive and dive site

In [None]:
# Find the dive (and its site) whose time span contains a given timestamp.
target = datetime.fromisoformat("2022-02-05 10:24:04")

print("Target is " + target.isoformat())
print()
for dive in root.iter('{http://www.streit.cc/uddf/3.2/}dive'):
    diveID = dive.get('id')
    diveStart = datetime.fromisoformat(dive.find('{http://www.streit.cc/uddf/3.2/}informationbeforedive/{http://www.streit.cc/uddf/3.2/}datetime').text)
    diveDuration = dive.find('{http://www.streit.cc/uddf/3.2/}informationafterdive/{http://www.streit.cc/uddf/3.2/}diveduration').text
    # float() also accepts a decimal duration such as "1234.0", which int()
    # would reject.
    diveTimeDeltaDuration = timedelta(seconds=float(diveDuration))
    diveEnd = diveStart + diveTimeDeltaDuration
    print('target: ' + target.isoformat() + ' start: ' + diveStart.isoformat() + ' duration: ' + str(float(diveDuration)/60) + ' end: ' + diveEnd.isoformat())
    # Inclusive bounds, matching how photos are matched to dives when the
    # observation files are written.
    if diveStart <= target <= diveEnd:
        print("Matching dive ID: " + diveID)
        siteID = dive.find('{http://www.streit.cc/uddf/3.2/}informationbeforedive/{http://www.streit.cc/uddf/3.2/}link').get('ref')
        print("Site ID is: " + siteID)
        siteName = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}name").text
        print("Site name is: " + siteName)


In [None]:
# Report the depth interval, position and temperature of one dive at the
# moment a photo was taken.
photoDateTime = datetime.fromisoformat("2022-02-05 10:00:19")
diveID = "idm564921344"
previousDepth = 0.0
# Defaults so the summary at the end never reads an undefined or stale name
# when the log has no site, no samples or no temperature readings.
depth = previousDepth
temperature = None
latitude = None
longitude = None

# Resolve the dive for diveID FIRST: the site lookup below must use this dive,
# not whatever `dive` was left over from a previously executed cell.
dive = root.find("./{http://www.streit.cc/uddf/3.2/}profiledata/*[@id='" + diveID+ "']/{http://www.streit.cc/uddf/3.2/}dive")
siteID = dive.find('{http://www.streit.cc/uddf/3.2/}informationbeforedive/{http://www.streit.cc/uddf/3.2/}link').get('ref')
if siteID:
    siteName = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}name").text
    latitude = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}geography/{http://www.streit.cc/uddf/3.2/}latitude").text
    longitude = root.find("./{http://www.streit.cc/uddf/3.2/}divesite/*[@id='" + siteID+ "']/{http://www.streit.cc/uddf/3.2/}geography/{http://www.streit.cc/uddf/3.2/}longitude").text
diveStart = datetime.fromisoformat(dive.find("{http://www.streit.cc/uddf/3.2/}informationbeforedive/{http://www.streit.cc/uddf/3.2/}datetime").text)
samples = dive.find("{http://www.streit.cc/uddf/3.2/}samples")
previousTime = diveStart
# Walk the samples in order until the photo time falls between two of them.
for waypoint in (samples if samples is not None else []):
    interval = diveStart + timedelta(seconds=float(waypoint.find("{http://www.streit.cc/uddf/3.2/}divetime").text))
    depth = float(waypoint.find("{http://www.streit.cc/uddf/3.2/}depth").text)
    temperatureElement = waypoint.find("{http://www.streit.cc/uddf/3.2/}temperature")
    if temperatureElement is not None:
        # Keep the most recent reading; not every sample carries one.
        temperature = round(float(temperatureElement.text) - 273)
    if previousTime <= photoDateTime <= interval:
        print(interval.isoformat() + ": from " + str(previousDepth) + " to " + str(depth))
        print("Now!")
        break
    previousDepth = depth
    previousTime = interval
print("Photo taken at depth of " + str(previousDepth) + "m and " + str(depth) + "m at (" + str(latitude) + ", " + str(longitude) + ")")
if temperature is not None:
    print("The temperature was " + str(temperature) + "C")

In [None]:
# Look up one species in WoRMS, EOL and Wikipedia and print its metadata.
# When WoRMS returns a different name, copy the photo files to the new name.
species = "Hypselodoris saintvincentia"

filename = species
originalName = species
if '.' in species:
    # e.g. "Genus sp." -> name "Genus", qualifier "sp."
    identificationQualifier = species.split(' ', 1)[1]
    scientificName = species.split(' ', 1)[0]
else:
    scientificName = species
    identificationQualifier = ""

data = json.loads(getWoRMSTaxonData(scientificName))

taxonRank = data["rank"].lower()
rankNumber = taxonHierarchy.index(taxonRank)

aphiaID = data["AphiaID"]
family = ""
genus = ""
specificEpithet = ""
scientificName = data["scientificname"]
scientificNameAuthorship = data["authority"]
taxonRemarks = data["citation"]

# Only fill in the ranks at or above the rank of the returned taxon.
if rankNumber >= 4:
    family = data["family"]
if rankNumber >= 5:
    genus = data["genus"]
if rankNumber >= 6:
    specificEpithet = scientificName.split(" ")[1]

checkName = (scientificName + " " + identificationQualifier).strip()

if originalName != checkName:
    print("It looks like " + originalName + " has been reclassified to " + checkName + "! I will copy the photo files, but you will need to update your original metadata yourself.")
    filename = checkName
    directories = [ thumbsDir, fullDir ]
    for directory in directories:
        with os.scandir(directory) as it:
            for entry in it:
                if entry.name.startswith(originalName) and entry.is_file():
                    sourcePath = directory + "/" + entry.name
                    targetPath = directory + "/" + entry.name.replace(originalName, checkName)
                    print(sourcePath + " will be copied to " + targetPath)
                    shutil.copy(sourcePath, targetPath)
                    # Rewrite the IPTC object name of the copy to the new name.
                    with open(targetPath, 'rb') as photo:
                        thisPhotoIPTC = IPTCInfo(photo)
                        thisPhotoObjectName = thisPhotoIPTC["object name"].decode("UTF-8")
                        print(thisPhotoObjectName + " will be changed to " + thisPhotoObjectName.replace(originalName, checkName))
                        thisPhotoIPTC["object name"] = bytes(thisPhotoObjectName.replace(originalName, checkName), "UTF-8")
                        thisPhotoIPTC.save()

# Search EOL with the resolved name; `taxon` is not defined at this level.
eolID = getEOLID(scientificName)
hierarchy = [ scientificName, genus, family ]

wikipediaURL = getAWikipediaURL(hierarchy)

print("Filename: " + filename)
print("---")
print("scientificName: " + scientificName)
print("family: " + family)
print("genus: " + genus)
print("specificEpithet: " + specificEpithet)
print("taxonRank: " + taxonRank)
print("scientificNameAuthorship: " + scientificNameAuthorship)
print("identificationQualifier: " + identificationQualifier)
print("colours:")
print("aphiaID: " + str(aphiaID))
print("eolID: " + str(eolID))
print("wikipediaURL: " + wikipediaURL)
print("taxonRemarks: " + taxonRemarks)
print("---")

Go through all the photos and create a metadata file for each one.

In [None]:
# Write one observation (.md) file per thumbnail: the photo's EXIF capture time
# is matched to a logged dive, and the depth samples around that moment give
# the depth range and temperature.
uddfNS = "{http://www.streit.cc/uddf/3.2/}"
with os.scandir(thumbsDir) as it:
    for entry in it:
        if entry.name.startswith('.') or not entry.is_file():
            continue
        fileWritten = False
        filename = entry.name.rsplit('.', 1)[0]
        # The part of the file name before the first '-' is the taxon name.
        scientificName = filename.split('-')[0].capitalize()
        if '.' in scientificName:
            # e.g. "Genus sp." -> name "Genus", qualifier "sp."
            scientificName, _, identificationQualifier = scientificName.partition(' ')
        else:
            identificationQualifier = ""

        # Reset per photo so a failed extraction never reuses the values of
        # the previously processed photo.
        occurrenceRemark = " "
        photographer = None
        photoDateTime = None
        with open(thumbsDir + '/' + entry.name, 'rb') as photo:
            thisPhoto = Image(photo)
            try:
                occurrenceRemark = thisPhoto.get("image_description")
                if occurrenceRemark is not None:
                    occurrenceRemark = occurrenceRemark.replace("\n", " ")
            except Exception:
                # Best effort: the remark is optional.
                print("Couldn't extract image description")
            try:
                photographer = thisPhoto.get("artist")
            except Exception:
                print("Couldn't extract photographer")
            try:
                # EXIF dates use ':' in the date part; swap the first two for
                # '-' so the value parses as ISO format.
                photoDateTime = datetime.fromisoformat(thisPhoto.get("datetime_original").replace(":", "-", 2))
            except Exception:
                print("Couldn't extract capture time")

        if photoDateTime is not None:
            for dive in root.iter(uddfNS + 'dive'):
                diveID = dive.get('id')
                diveStart = datetime.fromisoformat(dive.find(uddfNS + 'informationbeforedive/' + uddfNS + 'datetime').text)
                diveDuration = dive.find(uddfNS + 'informationafterdive/' + uddfNS + 'diveduration').text
                # float() also accepts a decimal duration such as "1234.0".
                diveEnd = diveStart + timedelta(seconds=float(diveDuration))
                if not (diveStart <= photoDateTime <= diveEnd):
                    continue

                link = dive.find(uddfNS + 'informationbeforedive/' + uddfNS + 'link')
                siteID = link.get('ref') if link is not None else ""
                if not siteID:
                    print("Matching dive " + diveID + " has no logged location! Cannot write data file.")
                    continue

                sitePath = "./" + uddfNS + "divesite/*[@id='" + siteID + "']/"
                siteName = root.find(sitePath + uddfNS + "name").text
                latitude = root.find(sitePath + uddfNS + "geography/" + uddfNS + "latitude").text
                longitude = root.find(sitePath + uddfNS + "geography/" + uddfNS + "longitude").text

                # Walk the samples in order until the photo time falls between
                # two of them; previousDepth/depth then bracket the photo.
                previousDepth = 0.0
                depth = previousDepth
                temperature = None
                previousTime = diveStart
                samples = root.find("./" + uddfNS + "profiledata/*[@id='" + diveID + "']/" + uddfNS + "dive/" + uddfNS + "samples")
                for waypoint in (samples if samples is not None else []):
                    interval = diveStart + timedelta(seconds=float(waypoint.find(uddfNS + "divetime").text))
                    depth = float(waypoint.find(uddfNS + "depth").text)
                    temperatureElement = waypoint.find(uddfNS + "temperature")
                    if temperatureElement is not None:
                        # Keep the most recent reading; not every sample has one.
                        temperature = round(float(temperatureElement.text) - 273)
                    if previousTime <= photoDateTime <= interval:
                        break
                    previousDepth = depth
                    previousTime = interval

                lines = [
                    "---\n",
                    "# Record-level terms\n",
                    "type: StillImage\n",
                    "basisOfRecord: HumanObservation\n",
                    "# Occurrence terms\n",
                    "recordedBy: " + (photographer if photographer is not None else "") + "\n",
                    "recordedByID: " + ORCID + "\n",
                    'occurrenceRemarks: "' + (occurrenceRemark if occurrenceRemark is not None else "") + '"\n',
                    "# Event terms\n",
                    "eventDateTime: " + photoDateTime.isoformat() + "\n",
                    "year: " + str(photoDateTime.year) + "\n",
                    "month: " + str(photoDateTime.month) + "\n",
                    "day: " + str(photoDateTime.day) + "\n",
                    "# Location terms\n",
                    "locationRemarks: " + str(siteName) + "\n",
                    "minimumDepthInMeters: " + str(previousDepth) + "\n",
                    "maximumDepthInMeters: " + str(depth) + "\n",
                    "decimalLatitude: " + str(latitude) + "\n",
                    # Longitude needs its own key; repeating decimalLatitude
                    # produced a duplicate key and lost the longitude.
                    "decimalLongitude: " + str(longitude) + "\n",
                    "temperature: " + (str(temperature) if temperature is not None else "") + "\n",
                    "# Identification terms\n",
                    "identifiedBy: \n",
                    "identifiedByID: \n",
                    "# Taxon terms\n",
                    "scientificName: " + scientificName + "\n",
                    "identificationQualifier: " + identificationQualifier + "\n",
                    # The qualifier is "" (never None) when absent.
                    "taxonRank: " + ("species" if identificationQualifier == "" else "genus") + "\n",
                    "---\n",
                ]
                with open(observationDir + "/" + filename + ".md", "w") as f:
                    f.writelines(lines)
                print(observationDir + "/" + filename + ".md" + " written")
                fileWritten = True

        if not fileWritten:
            print("No file was written for " + filename + "! Perhaps the photo didn't match a logged dive?")
                        

Create species entries for all the observations

In [None]:
# Write one species (.md) file for every distinct taxon name found in the
# observation file names, using WoRMS, EOL and Wikipedia lookups.
speciesNames = set()
with os.scandir(observationDir) as it:
    for entry in it:
        if not entry.name.startswith('.') and entry.is_file():
            # Only the file name is needed, so the file itself is not opened.
            filename = entry.name.rsplit('.', 1)[0]
            speciesNames.add(filename.split('-')[0].capitalize())
speciesList = sorted(speciesNames)

for species in speciesList:
    filename = species
    originalName = species
    if '.' in species:
        # e.g. "Genus sp." -> name "Genus", qualifier "sp."
        identificationQualifier = species.split(' ', 1)[1]
        scientificName = species.split(' ', 1)[0]
    else:
        scientificName = species
        identificationQualifier = ""

    try:
        data = json.loads(getWoRMSTaxonData(scientificName))
    except ValueError:
        # The response was not JSON (e.g. the name is unknown to WoRMS).
        # Skip this taxon instead of aborting the remaining species.
        print("No WoRMS record could be read for " + scientificName + "; skipping.")
        continue

    taxonRank = data["rank"].lower()
    rankNumber = taxonHierarchy.index(taxonRank)

    aphiaID = data["AphiaID"]
    family = ""
    genus = ""
    specificEpithet = ""
    scientificName = data["scientificname"]
    scientificNameAuthorship = data["authority"]
    taxonRemarks = data["citation"]

    # Only fill in the ranks at or above the rank of the returned taxon.
    if rankNumber >= 4:
        family = data["family"]
    if rankNumber >= 5:
        genus = data["genus"]
    if rankNumber >= 6:
        specificEpithet = scientificName.split(" ")[1]

    checkName = (scientificName + " " + identificationQualifier).strip()

    if originalName != checkName:
        print("It looks like " + originalName + " has been reclassified to " + checkName + "!")

    eolID = getEOLID(scientificName)
    hierarchy = [ scientificName, genus, family ]

    wikipediaURL = getAWikipediaURL(hierarchy)

    with open(speciesDir + "/" + filename + ".md", "w") as f:
        f.write("---\n")
        f.write("scientificName: " + scientificName + "\n")
        f.write("family: " + family + "\n")
        f.write("genus: " + genus + "\n")
        f.write("specificEpithet: " + specificEpithet + "\n")
        f.write("taxonRank: " + taxonRank + "\n")
        f.write("scientificNameAuthorship: " + scientificNameAuthorship + "\n")
        f.write("identificationQualifier: " + identificationQualifier + "\n")
        f.write("colours:\n")
        f.write("aphiaID: " + str(aphiaID) + "\n")
        f.write("eolID: " + str(eolID) + "\n")
        f.write('wikipediaURL: "' + wikipediaURL + '"\n')
        f.write('taxonRemarks: "' + taxonRemarks + '"\n')
        f.write("---\n")
    print(speciesDir + "/" + filename + ".md" + " written")

    # Brief pause between taxa before the next round of lookups.
    time.sleep(0.1)

print ( str(len(speciesList)) + " total species" )



Sanitise file names and zip outputs for easy downloading

In [None]:
# Copy all photos and metadata files into outputDir under sanitised
# (lower-case, no extra dots) names and bundle them into output/output.zip.
directories = [ fullDir, thumbsDir, observationDir, speciesDir ]

# Empty target directories first

for directory in directories:
    targetDir = outputDir + "/" + directory
    # Create the target on a first run; scandir would fail on a missing path.
    os.makedirs(targetDir, exist_ok=True)
    with os.scandir(targetDir) as it:
        for entry in it:
            if not entry.name.startswith('.') and entry.is_file():
                os.remove(entry)

# Remove files left directly in outputDir, such as a previous zip archive.
with os.scandir(outputDir) as it:
    for entry in it:
        if not entry.name.startswith('.') and entry.is_file():
            os.remove(entry)

for directory in directories:
    with os.scandir(directory) as it:
        for entry in it:
            if not entry.name.startswith('.') and entry.is_file():
                originalName = entry.name
                # splitext keeps the final ".ext" (or "" when there is none),
                # so files without an extension no longer raise IndexError.
                fileName, extension = os.path.splitext(originalName)
                newName = fileName.lower().replace(".", "") + extension
                print(directory + "/" + originalName + " -> " + outputDir + "/" + directory + "/" + newName)
                shutil.copy(directory + "/" + originalName, outputDir + "/" + directory + "/" + newName)

with zipfile.ZipFile(outputDir + '/output.zip', 'w') as outputzip:
    for directory in directories:
        with os.scandir(outputDir + "/" + directory) as it:
            for entry in it:
                if not entry.name.startswith('.') and entry.is_file():
                    outputzip.write(outputDir + "/" + directory + "/" + entry.name)
    