# A notebook to generate a CSV template with info from samples given a filter, and moving upward and downward


Give the output file name (with path)

In [1]:
filename="/Users/pierrespc/Desktop/2021-12-03_Sites"
f = open(filename, 'w')

## Preparing the note book

Please enter the one-line file where your token is saved in the following cell

In [2]:
tokenFile="/Users/pierrespc/Documents/PostDocPasteur/aDNA/Import_eLAB/API_FUNCTIONALITIES/credentials/tokenELAB"

Now preparing all required python libs

In [3]:
import os
import json
import requests
import csv
import pandas
import numpy
from apiclient import discovery, errors
from httplib2 import Http
from oauth2client import client, file, tools
import os.path

token = format(open(tokenFile,"r").readline().strip())
url = "https://elab-dev.pasteur.fr/api/v1/"
headers1 = {'Authorization': token, 'Accept': 'application/json','Content-Type':'application/json'}
headers2 = {'Authorization': token, 'Accept': 'application/json'}




Prepare all the eLab-API keys necessary to down and upload data. Get list of sample types user is interested in.

In [4]:
def BadRequest(myReq,code=200):
    return(myReq.status_code !=code)


r = requests.get(url + "sampleTypes", headers = headers2)
if BadRequest(r,200):
    r.raise_for_status()
dictType = r.json().get("data")


Now we get all registered ID for all types.
(I am lazy now to try to find a clever way to process only the types we need downstream)

In [5]:
registered = {}
for it in dictType:
    name = it.get("name")
    ID = it.get("sampleTypeID")
    print(name + " --> " + format(ID))
    r = requests.get(url + "samples" , headers = headers2, params = {'sampleTypeID': ID})
    if BadRequest:
        r.raise_for_status()
    data = r.json()
    myList = {}
    for sam in data.get("data"):
        if format(sam.get("name")) in myList.keys():
            print(name + ": " + sam.get("name") + " duplicated")
            break
        myList[format(sam.get("name"))]=format(sam.get("sampleID"))
    registered[name] = myList
print("finished")


Individual --> 39466
Site --> 39468
Skeleton Element --> 39469
Extract --> 39470
Indexed Library --> 39494
Library pool --> 39495
Non Indexed Library --> 39556
Bone pellet --> 39599
finished


## Getting which info will be saved in the output table

### Now we get the sample types for which we will output the info and the features we want to retrieve for each sample Type

In [7]:
types = {}
for typ in dictType:
    prompt="?"
    while prompt not in ["y","n"]:
        prompt=input("interested in getting info from "+typ.get("name")+"? y/n")
    if prompt == "y":
        typName=format(typ.get("name"))
        types[typName] = {"key":format(typ.get("sampleTypeID")),
                                          "meta":{},
                                          "data":{}}
        r = requests.get(url + "sampleTypes/" + types[typName]["key"] + "/meta", headers = headers2)
        if BadRequest(r,200):
            r.raise_for_status()
        data = r.json()
        for feat in data.get("data"):
            if feat.get("sampleDataType") == "SAMPLELINK":
                continue
            prompt="?"
            while prompt not in ["y","n"]:
                prompt=input("interested in outputing META feature "+feat.get("key")+"? y/n")
            if prompt == "y":
                types[typName]["meta"][feat.get("key")]=feat.get("sampleTypeMetaID")
        for feat in ["description","Quantity","note"]:
            prompt="?"
            while prompt not in ["y","n"]:
                prompt=input("interested in outputing notMETA feature "+feat+"? y/n")
            if prompt == "y":
                types[typName]["data"][feat]=""
print(types)




interested in getting info from Individual? y/nn
interested in getting info from Site? y/ny
interested in outputing META feature Pictures? y/ny
interested in outputing META feature Main geographic region? y/ny
interested in outputing META feature Country? y/ny
interested in outputing META feature Province / Region? y/ny
interested in outputing META feature Locality? y/ny
interested in outputing META feature Latitude? y/ny
interested in outputing META feature Longitude? y/ny
interested in outputing META feature Site type? y/ny
interested in outputing notMETA feature description? y/ny
interested in outputing notMETA feature Quantity? y/ny
interested in outputing notMETA feature note? y/ny
interested in getting info from Skeleton Element? y/nn
interested in getting info from Extract? y/nn
interested in getting info from Indexed Library? y/nn
interested in getting info from Library pool? y/nn
interested in getting info from Non Indexed Library? y/nn
interested in getting info from Bone pel

## Defining the filters

Some function defintion (just run the following cells without wondering)

In [8]:
from datetime import datetime

def CheckDate(Date):
    if Date =="?":
        return(False)
    else:
        tmp=Date.split("-")
        tmp=[int(i) for i in tmp]
        return(tmp[0] > 2020 and tmp[0] < 2030 and tmp[1] > 0 and tmp[1] < 13 and tmp[2] >0 and tmp[2]<32)

def getDateFilter():
    wrongEntry=True
    while wrongEntry:
        MostRecent="?"
        while MostRecent != "9999-12-31" and not CheckDate(MostRecent):
            MostRecent=input("Enter the most recent date, i.e. we will filter IN samples before that date (type Any if no filter )")
            if MostRecent == "Any":
                MostRecent="9999-12-31"
        MostRecent=datetime.strptime(MostRecent,'%Y-%m-%d')
        Eldest="?"
        while Eldest != "0001-01-01" and not CheckDate(Eldest):
            Eldest=input("Enter the eldest date, i.e. i.e. we will filter IN samples after that date (type Any if no filter )")
            if Eldest == "Any":
                Eldest="0001-01-01"
        Eldest=datetime.strptime(Eldest,'%Y-%m-%d')
        if Eldest<MostRecent:
            wrongEntry=False
        else:
            print("you entered a mostRecent date more ancient and EldestDate")
            
    return({"MostRecent":MostRecent,"Eldest":Eldest})

In [9]:
def getOptionFilter(possibleChoices):
    print(len(possibleChoices))
    wrongEntry=True
    while wrongEntry:
        print("possible choices")
        index=0
        for value in possibleChoices:
            index=index+1
            print(format(index)+":"+value)
        listEntered=input("enter your choice(s) (the number(s) separated by space)").split()
        listEntered=[int(i)-1 for i in listEntered ]
        if min(listEntered) <0 or max(listEntered)>=len(possibleChoices):
            print("you entered choices out of range")
        else:
            wrongEntry=False
    return([possibleChoices[i] for i in listEntered])


###for now we cover just the case where a given string is in the feature (no filter for NOT, OR, AND, NOT ANY, etc...)
def getTextFilter():
    return(input("enter a string to find in the field"))


In [10]:
import re

def getLinkFilter(sampleType,allIDs,link):
    parentPattern={"Site":{"pattern":"Any","typeParent":"None"},
                   "Individual":{"pattern":'[A][R][0-9][0-9][0-9][0-9]',"typeParent":"Site"},
                   "Skeleton Element":{"pattern":'[A][R][0-9][0-9][0-9][0-9][.][0-9]',"typeParent":"Individual"},
                   "Extract":{"pattern":'[A][R][0-9][0-9][0-9][0-9][.][0-9][.][0-9]',"typeParent":"Skeleton Element"}}

    if sampleType not in parentPattern.keys():
        raise(sampleType+" not covered to retrieve its parent sample")
    if link:
        typeToCheck=parentPattern[sampleType][typeParent]
    else:
        typeToCheck=sampleType
                   
    listType="?"
    while not listType in ["prompt","file"]:        
        listType=input("will you enter IDs one by one or a file (prompt/file)?")
    wrongEntry=True
    while wrongEntry:
        if listType=="file":
            listIDfile=open(input("file with parent file"),"r").readlines()
            listID=[]
            for i in listIDfile:
                listID.append(i.strip())
        else:
            listID=input("enter the parent sample IDs separated by <space>/<space>, must match pattern "+parentPattern[typeToCheck]["pattern"])
            listID=listID.split(" / ")
        wrongEntry=False
        for id in listID:
            ###check all id match pattern
            if not (re.match(parentPattern[typeToCheck]["pattern"],id) or parentPattern[typeToCheck]["pattern"] == "Any"):
                print("wrong pattern for "+id+" expected: "+parentPattern[typeToCheck]["pattern"])
                wrongEntry=True
                ###check all id already registered
            if not id in allIDs.keys():
                print(id+" not registered in eLab")
                wrongEntry=True
        if wrongEntry:
            print("change those ids either in the file or in the prompted list")
     
    bound="?"
    while bound not in ["notin","in"]:
        bound=input("keep or remove those IDS (in/notin)?")
    return({"rule":bound,"list":listID})

In [11]:
def getQuantityFilter():
    wrongEntry=True
    while wrongEntry:
        quanti=float(input("enter a quantity"))
        bound=input("enter a bound (less, more, exact)")
        if bound in ["less","more","exact"]:
            wrongEntry=False
    return({"rule":bound,"quantity":quanti})



### On which field and Sample type you want to filter?

In [12]:
listFilter={}
levelSeq=['Library pool', 'Indexed Library', 'Non Indexed Library', 'Extract','Skeleton Element', 'Individual', 'Site']
for typ in dictType:
    typName=typ.get("name")
    if typName == "Bone pellet":
        continue
    typID=typ.get("sampleTypeID")
    level=levelSeq.index(typName)
    print(typName+" " +format(level))
    typeFilter="?"
    while typeFilter not in ["y","n"]:
        typeFilter=input("Do you want to apply a filter for "+typName+"?")
    if typeFilter == "y":
        listFilter[typName]={}
        r = requests.get(url + "sampleTypes/" + format(typID) + "/meta", headers = headers2)
        if BadRequest(r,200):
            r.raise_for_status()
        data = r.json()
        for meta in data.get("data"):
            typeFilter="?"
            while typeFilter not in ["y","n"]:
                typeFilter=input("for "+ typName+", is there a filter for "+meta.get("key")+"?")
                if typeFilter == "y":
                    listFilter[typName][meta.get("key")]={}
                    r = requests.get(url + "sampleTypes/" + format(typID) + "/meta/"+format(meta.get("sampleTypeMetaID")), headers = headers2)
                    if BadRequest(r,200):
                        r.raise_for_status()
                        
                    listFilter[typName][meta.get("key")]["type"]=r.json().get("sampleDataType")

                    if r.json().get("sampleDataType") == "DATE":
                        listFilter[typName][meta.get("key")]["filter"]=getDateFilter()                           
                    elif r.json().get("sampleDataType") == "CHECKBOX":
                        listFilter[typName][meta.get("key")]["filter"]=getOptionFilter(r.json().get("optionValues"))
                    elif r.json().get("sampleDataType") == "COMBO":
                        listFilter[typName][meta.get("key")]["filter"]=getOptionFilter(r.json().get("optionValues"))
                    elif r.json().get("sampleDataType") == "TEXT":
                        listFilter[typName][meta.get("key")]["filter"]=getTextFilter()
                    elif r.json().get("sampleDataType") == "SAMPLELINK":
                        parentType=levelSeq[level+1]
                        listFilter[typName][meta.get("key")]["filter"]=getLinkFilter(typName,registered[parentType],True)
                    else:
                        print(r.json().get("sampleDataType")+" not covered")
                        break
        
        for feat in ["description","Quantity","note","name"]:
            typeFilter="?"
            while typeFilter not in ["y","n"]:
                typeFilter=input("for "+ typName+", is there a filter for "+feat+"?")
            if typeFilter == "y":
                listFilter[typName][feat]={}
                if feat in ["Observation","Note"]:
                    listFilter[typName][feat]["type"]="TEXT"
                    listFilter[typName][feat]["filter"]=getTextFilter()
                elif feat == "Quantity":
                    listFilter[typName][feat]["type"]="QUANTITY"
                    listFilter[typName][feat]["filter"]=getQuantityFilter()
                else:
                    listFilter[typName][feat]["type"]="NAME"
                    listFilter[typName][feat]["filter"]=getLinkFilter(typName,registered[typName],False)
        if len(listFilter[typName])==0:
            print("you finally decided not to filter for anything for "+typName)
            del(listFilter[typName])


Individual 5
Do you want to apply a filter for Individual?n
Site 6
Do you want to apply a filter for Site?y
for Site, is there a filter for Pictures?n
for Site, is there a filter for Main geographic region?n
for Site, is there a filter for Country?y
2
possible choices
1:Chile
2:Argentina
enter your choice(s) (the number(s) separated by space)1 2
for Site, is there a filter for Province / Region?n
for Site, is there a filter for Locality?n
for Site, is there a filter for Latitude?n
for Site, is there a filter for Longitude?n
for Site, is there a filter for Site type?n
for Site, is there a filter for description?n
for Site, is there a filter for Quantity?n
for Site, is there a filter for note?n
for Site, is there a filter for name?n
Skeleton Element 4
Do you want to apply a filter for Skeleton Element?n
Extract 3
Do you want to apply a filter for Extract?n
Indexed Library 1
Do you want to apply a filter for Indexed Library?n
Library pool 0
Do you want to apply a filter for Library pool?n

## Let's parse the database, filter the entry and output 

Following cell is just some function definitions

In [13]:
def filterText(value,filter):
    return(filter in value)

def filterQuantity(value,thres,ruler):
    if ruler == "exact":
        return(value==thres)
    elif ruler == "less":
        return(value<=thres)
    elif ruler == "more":
        return(value>=thres)
    else:
        raise(ruler+ " not recognized")

def filterDate(value,filter):
    value=datetime.strptime(value,'%Y-%m-%d')
    return(value<=filter["MostRecent"] and value>=filter["Eldest"])

def filterLink(value,listNAM,ruler):
    value=value.split("|")[0]
    if ruler=="in":
        return(value in listNAM)
    elif ruler=="notin":
        return(value not in listNAM)
    else:
        raise()

        
def filterName(value,listNAM,ruler):
    if ruler=="in":
        return(value in listNAM)
    elif ruler=="notin":
        return(value not in listNAM)
    else:
        raise()
    

def filterCombo(value,filter):
    return(value in filter)

def filterCheckbox(value,filter):
    AllFound=True
    for i in value:
        if i not in filter:
            AllFound=False
    return(AllFound)


In [14]:
import pandas as pd

startRecord=False
filteredEntries={}

levelNum=0
listNextStepKept="FIRSTlevelParsed"
## first we get all entries that match filter for each type
for level in levelSeq:
    levelNum=levelNum+1
    ###check if needed to record entries for that level
    if level not in listFilter.keys() and level not in types.keys() and not startRecord:
        print(level+" skipped")
        continue
    else:
        startRecord=True
        filteredEntries[level]={level:[]}
        if level!=levelSeq[len(levelSeq)-1]:          
            filteredEntries[level][levelSeq[levelNum]]=[]
            #filteredEntries[level]["parent"]=[]
        if level in types.keys():
            for entry in types[level]["meta"]:
                filteredEntries[level][level+"_"+entry]=[]
            for entry in types[level]["data"]:
                filteredEntries[level][level+"_"+entry]=[]
        print("parsing "+ level)
        #for sample,idSam in prout.items():
        for sample,idSam in registered[level].items():
            if listNextStepKept=="FIRSTlevelParsed":
                filterIN=True
            else:
                filterIN=filterName(sample,
                                    listNextStepKept,
                                    "in")
            if not filterIN:
                continue
            ##if no filter for that we keep the entry by default
            if level in listFilter.keys():
                r=requests.get(url+"/samples/get?sampleID="+idSam,headers=headers2)
                if BadRequest(r,200):
                    r.raise_for_status()
                ###filtering for observation and note (not meta data)
                if "name" in listFilter[level].keys():
                    new=filterName(sample,
                                   listFilter[level]["name"]["filter"]["list"],
                                   listFilter[level]["name"]["filter"]["rule"])
                    filterIN=filterIN and new
                if "description" in listFilter[level].keys() or "note" in listFilter[level].keys():
                    for filterTy in ["description","note"]:
                        if filterTy in listFilter[level].keys():
                            print(filterTy+" "+format(new))
                            new=filterText(r.json.get(filterTy),listFilter[level][filterTy]["filter"])
                            filterIN=filterIN and new
                if not filterIN:
                    continue


                ###filtering for quantity (not meta data)                        
                if "Quantity" in listFilter[level].keys():
                    r=requests.get(url + "samples/" + idSam + "/quantity", headers = headers2)
                    if BadRequest(r,200):
                        r.raise_for_status()
                    new=filterQuantity(r.json().get("amount"),
                                   listFilter[level]["Quantity"]["filter"]["quantity"],
                                   listFilter[level]["Quantity"]["filter"]["rule"])
                    #print("Quantity "+format(new))
                    filterIN=filterIN and new
                if not filterIN:
                    continue

                ###filtering for meta data fields
                r=requests.get(url+"/samples/"+idSam+"/meta",headers=headers2)
                if r.status_code != 200:
                    r.raise_for_status()
                for meta in r.json().get("data"):
                    if meta.get("key") in listFilter[level].keys():
                        if listFilter[level][meta.get("key")]["type"] == "DATE":
                            new=filterDate(meta.get("value"),listFilter[level][meta.get("key")]["filter"])
                        elif listFilter[level][meta.get("key")]["type"] == "TEXT":
                            new=filterText(meta.get("value"),listFilter[level][meta.get("key")]["filter"])
                        elif listFilter[level][meta.get("key")]["type"] == "SAMPLELINK":
                            new=filterLink(meta.get("value"),
                                           listFilter[level][meta.get("key")]["filter"]["list"],
                                           listFilter[level][meta.get("key")]["filter"]["rule"])
                        elif listFilter[level][meta.get("key")]["type"] == "COMBO":
                            new=filterCombo(meta.get("value"),listFilter[level][meta.get("key")]["filter"])
                        elif listFilter[level][meta.get("key")]["type"] == "CHECKBOX":
                            new=filterCheckbox(meta.get("value"),listFilter[level][meta.get("key")]["filter"])
                        else:
                            raise(listFilter[level][meta.get("key")]["type"]+" not covered")
                        #print(meta.get("key")+" "+format(new)+" "+format(meta.get("value")))                        
                        filterIN=filterIN and new
                if not filterIN:
                    continue

            ###if that entry passed the filter we record the required fields (and the parent sample)

            #print(sample+"-->IN")
            ##adding the name by default
            filteredEntries[level][level].append(sample)
            
            r=requests.get(url+"/samples/"+idSam+"/meta",headers=headers2)
            if r.status_code != 200:
                r.raise_for_status()
                
            ###now adding metadata and data requested by user
            for meta in r.json().get("data"):
                ##adding the the parent by default
                if meta.get("sampleDataType")=="SAMPLELINK":
                    filteredEntries[level][levelSeq[levelNum]].append(meta.get("value").split("|")[0])
                    #filteredEntries[level]["parent"].append(meta.get("value").split("|")[0])
            if level in types.keys():
                for meta in r.json().get("data"):
                    ##adding the meta field that the user specified
                    if meta.get("key") in types[level]["meta"]:
                        filteredEntries[level][level+"_"+meta.get("key")].append(meta.get("value"))
                ##adding the data field that the user specified
                if "description" in types[level]["data"] or "note" in types[level]["data"]:
                    r=requests.get(url+"/samples/"+idSam,headers=headers2)
                    if r.status_code != 200:
                        r.raise_for_status()
                    for dataTy in ["description","note"]:
                        if dataTy in types[level]["data"]:
                            filteredEntries[level][level+"_"+dataTy].append(r.json().get(dataTy))
                if "Quantity" in types[level]["data"]:
                    r=requests.get(url+"/samples/"+idSam+"/quantity",headers=headers2)
                    if r.status_code != 200:
                        r.raise_for_status()
                    filteredEntries[level][level+"_Quantity"].append(format(r.json().get("amount"))+r.json().get("unit"))
        print("we have "+format(len(filteredEntries[level][level]))+" remaining")
        # we register the parent samples from that list
        if level != "Site":
            listNextStepKept=filteredEntries[level][levelSeq[levelNum]]
        filteredEntries[level]["df"]=pd.DataFrame(filteredEntries[level])
        

Library pool skipped
Indexed Library skipped
Non Indexed Library skipped
Extract skipped
Skeleton Element skipped
Individual skipped
parsing Site
we have 164 remaining


ValueError: arrays must all be same length

In [15]:
        
###Now we merge the different data frames obtained for each level into an unique table!
Starting=True
for level in levelSeq:
    if level not in types.keys() and Starting:
        print(level+" skipped")
        continue
    if Starting:
        out=filteredEntries[level]["df"]
        Starting=False
    else:
        out=filteredEntries[level]["df"].merge(out,how='inner',on=level)
        
out.drop_duplicates()        
## And we can write!
##first some comments to register the filters:
jiter=0
for level in listFilter.keys():
    jiter=jiter+1
    f.writelines("#"+format(jiter)+". filters at: "+level+"\n")
    iter=0
    for fifi in listFilter[level].keys():
        iter=iter+1
        f.writelines("#    -"+format(jiter)+"."+format(iter)+". "+fifi+":"+format(listFilter[level][fifi]["filter"])+"\n")
f.close()
out.to_csv(filename, sep='\t', na_rep='NA',mode='a')
        
out


Library pool skipped
Indexed Library skipped
Non Indexed Library skipped
Extract skipped
Skeleton Element skipped
Individual skipped


KeyError: 'df'

In [21]:
for key in filteredEntries["Site"].keys():
    print(key+" "+format(len(filteredEntries["Site"][key])))
    

Site 164
Site_Pictures 67
Site_Main geographic region 164
Site_Country 164
Site_Province / Region 164
Site_Locality 164
Site_Latitude 164
Site_Longitude 164
Site_Site type 164
Site_description 164
Site_Quantity 164
Site_note 164


In [24]:
filteredEntries["Site"]["Site_Pictures"]

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']