# Mediaflux training

Manipulating assets and metadata in Mediaflux using the `mfclient` library.

In [40]:
import logging
import os
import random

import pandas as pd
from Crypto.Cipher import AES

import mfclient

In [2]:
# Route DEBUG-and-above records to a dated log file, opened in append mode.
log_settings = {
    "filename": "mf_2018-01-08.log",
    "level": logging.DEBUG,
    "filemode": "a",
    "format": '%(asctime)s %(levelname)s - %(message)s',
    "datefmt": '%m-%d-%Y %H:%M:%S',
}
logging.basicConfig(**log_settings)

-----

## Connect to MF server

In [171]:
def _read_stripped(file_path):
    """Return the text of ``file_path`` with leading/trailing whitespace removed."""
    with open(file_path) as handle:
        return handle.read().strip()


# Load the AES key and initialisation vector from the local `keys` directory
# and build the CFB-mode cipher object used to decrypt the stored password.
key = _read_stripped("keys/key")
iv = _read_stripped("keys/iv")
obj = AES.new(key, AES.MODE_CFB, iv)

In [172]:
# Read the encrypted Mediaflux password from the current user's ~/.ssh
# directory. The home directory is resolved with expanduser so the notebook
# is not tied to one machine's absolute path.
# NOTE(review): .strip() would also remove ciphertext bytes that happen to be
# whitespace at either end of the file - confirm how the file was written.
with open(os.path.expanduser("~/.ssh/encrypted_pw.txt")) as f:
    pw = f.read().strip()

In [173]:
# Connection settings for the Mediaflux server; passed to
# mfclient.MFConnection(...) when the connection object is built.
MF_HOST = "mediaflux.vicnode.org.au"
MF_PORT = 443
MF_TRANSPORT = "https"
MF_DOMAIN = "aaf"
MF_USER = "unimelb:jessicac"
# The password is recovered by decrypting the file contents held in `pw`
# with the AES cipher object `obj`.
MF_PASSWORD = obj.decrypt(pw)

In [174]:
# Gather the connection settings into one mapping and build the Mediaflux
# connection object from it.
connection_settings = dict(
    host=MF_HOST,
    port=MF_PORT,
    transport=MF_TRANSPORT,
    domain=MF_DOMAIN,
    user=MF_USER,
    password=MF_PASSWORD,
)
con = mfclient.MFConnection(**connection_settings)

In [175]:
# Open the connection, then call the "server.version" service and keep its
# response in `result`.
con.open()
result = con.execute("server.version")

In [176]:
# Display the server.version response as an XML string
result.tostring()

'<result><ant-version>Apache Ant 1.9.4</ant-version><binary>aserver</binary><build-time>02-Nov-2017 16:24:41 AEDT</build-time><built-by>Arcitecta. Pty. Ltd.</built-by><created-by>1.8.0_111-b14 (Oracle Corporation)</created-by><manifest-version>1.0</manifest-version><target-jvm>1.7</target-jvm><vendor>Arcitecta Pty. Ltd.</vendor><version>4.6.021</version></result>'

-----

## Create file with `asset.create`

In [None]:
# logging.info("Try creating a file with asset.create")
# args = mfclient.XmlStringWriter("args")
# args.add("namespace", "/projects/proj-demonstration-1128.4.15")
# args.add("name", "jc_test_2")
# args.push("meta")
# args.push("mf-note")
# args.add("note", "this is a test file created from the python api")
# args.pop()
# args.pop()
# result2 = con.execute("asset.create", args.doc_text())
# logging.info(result2)

-----

## Retrieve metadata with `asset.get`

In [11]:
# Fetch one asset by id with the asset.get service; the response is kept in
# `result3` for the inspection cells that follow.
logging.info("Retrieve metadata")
args = mfclient.XmlStringWriter("args")
args.add("id", 35422787)
result3 = con.execute("asset.get", args.doc_text())
# Optional debugging (disabled): log the response, its Python attributes,
# or individual values.
# logging.info(result3)
# logging.info(dir(result3))
# logging.info(result3.value("asset/@id"))
# logging.info(result3.attributes)

In [12]:
# Read the `id` attribute of the <asset> element in the response
result3.value("asset/@id")

'35422787'

In [15]:
# Display the whole asset.get response as an XML string
result3.tostring()

'<result><asset id="35422787" version="1" vid="58757764"><type>content/unknown</type><namespace>/projects/proj-demonstration-1128.4.15</namespace><path>/projects/proj-demonstration-1128.4.15/jc_test_2</path><name>jc_test_2</name><creator id="1398"><domain>aaf</domain><user>jchung@unimelb.edu.au</user><name>Jessica Chung</name><email>jchung@unimelb.edu.au</email></creator><ctime dst="true" gmt-offset="10.0" millisec="1513569904840">18-Dec-2017 15:05:04</ctime><mtime dst="true" gmt-offset="10.0" millisec="1513569904840">18-Dec-2017 15:05:04</mtime><stime>58757764</stime><versioned count="1">true</versioned><access><access>true</access><rename>false</rename><modify>true</modify><destroy>true</destroy><access-content>true</access-content><modify-content>true</modify-content></access><meta stime="58757764"><mf-revision-history id="1"><user id="1398"><authority protocol="saml">internalsp</authority><domain>aaf</domain><name>jchung@unimelb.edu.au</name></user><type>create</type></mf-revision-

In [49]:
# Print the <note> element held in the asset's mf-note metadata.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result3.element("asset/meta/mf-note/note"))

<note>this is a test file created from the python api</note>


-----

## Query MF

In [19]:
# Query every asset in the demonstration namespace and ask for each asset's
# path, with no limit on the number of results.
args = mfclient.XmlStringWriter("args")
query_options = (
    ("where", "namespace=/projects/proj-demonstration-1128.4.15"),
    # alternative action: ("action", "get-id")
    ("action", "get-path"),
    ("size", "infinity"),
)
for option_name, option_value in query_options:
    args.add(option_name, option_value)
result4 = con.execute("asset.query", args.doc_text())

In [22]:
# Print the asset.query response (the matching <path> elements).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result4)

<result><path id="29779506" version="1">/projects/proj-demonstration-1128.4.15/3films_vca.csv</path><path id="29870021" version="1">/projects/proj-demonstration-1128.4.15/mexplorer-1.3.0.jar</path><path id="29870022" version="1">/projects/proj-demonstration-1128.4.15/aterm.jar</path><path id="30952443" version="1">/projects/proj-demonstration-1128.4.15/ReadMe_ASIFAEast.rtf</path><path id="31327915" version="1">/projects/proj-demonstration-1128.4.15/data.csvy</path><path id="32417517" version="1">/projects/proj-demonstration-1128.4.15/right_up.png</path><path id="32417518" version="1">/projects/proj-demonstration-1128.4.15/1484286195_tick-circle.png</path><path id="32427338" version="1">/projects/proj-demonstration-1128.4.15/Apium_prostratum_ME.wpvh.jpg</path><path id="32522089" version="4">/projects/proj-demonstration-1128.4.15/junk1.tcl</path><path id="35422605" version="1">/projects/proj-demonstration-1128.4.15/Test_Note_1</path><path id="35422609" version="1">/projects/proj-demonstr

In [23]:
# Collect the text of the <path> elements and their `id` attributes.
# NOTE(review): presumably values() returns one entry per matching element
# (in contrast to value()) - confirm against mfclient.
paths = result4.values("path")
ids = result4.values("path/@id")

In [28]:
# for p in paths:
#     print(p)

# for id in ids:
#     print(id)

In [29]:
# get all elements with element name 'path'
path_elements = result4.elements("path")

# Show the first five <path> elements, each followed by "<id> - <path>".
# The loop variables are deliberately not called `id`/`path`: a module-level
# `id` would shadow the builtin id() for every later cell.
for pe in path_elements[:5]:
    print(pe)
    asset_path = pe.value()     # text inside the 'path' element
    asset_id = pe.value("@id")  # value of the 'id' attribute of the 'path' element
    print("\t" + asset_id + " - " + asset_path)

<path id="29779506" version="1">/projects/proj-demonstration-1128.4.15/3films_vca.csv</path>
	29779506 - /projects/proj-demonstration-1128.4.15/3films_vca.csv
<path id="29870021" version="1">/projects/proj-demonstration-1128.4.15/mexplorer-1.3.0.jar</path>
	29870021 - /projects/proj-demonstration-1128.4.15/mexplorer-1.3.0.jar
<path id="29870022" version="1">/projects/proj-demonstration-1128.4.15/aterm.jar</path>
	29870022 - /projects/proj-demonstration-1128.4.15/aterm.jar
<path id="30952443" version="1">/projects/proj-demonstration-1128.4.15/ReadMe_ASIFAEast.rtf</path>
	30952443 - /projects/proj-demonstration-1128.4.15/ReadMe_ASIFAEast.rtf
<path id="31327915" version="1">/projects/proj-demonstration-1128.4.15/data.csvy</path>
	31327915 - /projects/proj-demonstration-1128.4.15/data.csvy


-----

## Setting metadata

```
# asset.set :id 123 :meta < mf-note < note "Test" > >
# asset.set :id 123 :meta -action [add|merge|remove|replace] <document>

```

In [76]:
# First get all assets in the specified directory that have the test metadata attached

args = mfclient.XmlStringWriter("args")

# Limit files to /projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29
# that have a value in the proj-marsupial_genomics-1128.4.19:test document.
# NOTE: the backslash continues the string literal onto the next line, so that
# line's leading spaces are part of the query text that is sent.
args.add("where", "namespace=/projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29 \
          and proj-marsupial_genomics-1128.4.19:test has value")

# Get path of files matching query
args.add("action", "get-path")

# Get all assets (instead of the first 100)
args.add("size", "infinity")

# Run the query
result = con.execute("asset.query", args.doc_text())

In [77]:
# Print the asset.query response (the matching <path> elements).
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result)

<result><path id="35009187" version="8">/projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29/test</path><path id="35009188" version="8">/projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29/mediaflux.sh</path></result>


In [82]:
# For every asset returned by the query: show the current `id`/`location`
# metadata, overwrite both with random values via asset.set, then read the
# asset again to show the new values.
TEST_DOC_XPATH = "asset/meta/proj-marsupial_genomics-1128.4.19:test"


def _read_test_metadata(asset_id):
    """Call asset.get for ``asset_id`` and return its ``(id, location)``
    values from the proj-marsupial_genomics-1128.4.19:test document."""
    get_args = mfclient.XmlStringWriter("args")
    get_args.add("id", asset_id)
    asset = con.execute("asset.get", get_args.doc_text())
    return (asset.value(TEST_DOC_XPATH + "/id"),
            asset.value(TEST_DOC_XPATH + "/location"))


for path_element in result.elements("path"):
    print(path_element)
    asset_id = path_element.attribute("id")

    # Get existing values
    existing_id, existing_location = _read_test_metadata(asset_id)
    print("id: {}\tloc: {}".format(existing_id, existing_location))

    # Set new values (random, so every run visibly changes the metadata)
    args = mfclient.XmlStringWriter("args")
    args.add("id", asset_id)
    args.push("meta")
    # alternative: args.push("meta", attributes={"action": "add"})
    args.push("proj-marsupial_genomics-1128.4.19:test")
    args.add("id", str(random.randint(1, 10)))
    args.add("location", str(random.randint(1, 1000)))
    args.pop()
    args.pop()
    con.execute("asset.set", args.doc_text())

    # Get new values
    new_id, new_location = _read_test_metadata(asset_id)
    print("id: {}\tloc: {}".format(new_id, new_location))

    print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=")

<path id="35009187" version="8">/projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29/test</path>
id: 2	loc: 954
id: 8	loc: 451
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
<path id="35009188" version="8">/projects/proj-marsupial_genomics-1128.4.19/Test_2017-08-29/mediaflux.sh</path>
id: 1	loc: 151
id: 6	loc: 689
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=


In [None]:
# result5 = con.execute("asset.set", args.doc_text())

-----

## Metadata definitions

```
# aterm
# list definitions in a project
asset.doc.type.list :namespace proj-marsupial_genomics-1128.4.19

# view definition details
asset.doc.type.describe :type proj-marsupial_genomics-1128.4.19:test

```

In [181]:
# Python equivalent of the aterm command:
# asset.doc.type.describe :type proj-marsupial_genomics-1128.4.19:test
args = mfclient.XmlStringWriter("args")
args.add("type", "proj-marsupial_genomics-1128.4.19:test")

# Run the service call; the description of the document type is kept in `result`
result = con.execute("asset.doc.type.describe", args.doc_text())

In [182]:
# Print the asset.doc.type.describe response.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result)

<result><type for="asset" id="241" name="proj-marsupial_genomics-1128.4.19:test" version="5"><label>proj-marsupial_genomics-1128.4.19:test</label><description>test description</description><generated-by>user</generated-by><allow-incomplete>maybe</allow-incomplete><access><administer>true</administer><create>true</create><modify>true</modify><publish>true</publish></access><creator id="1398"><domain>aaf</domain><user>jchung@unimelb.edu.au</user><name>Jessica Chung</name><email>jchung@unimelb.edu.au</email></creator><ctime dst="true" gmt-offset="10.0" time="1515392815328" tz="Australia/Melbourne">08-Jan-2018 17:26:55</ctime><definition><element label="ID" max-occurs="1" name="id" type="string"><description>description for id</description><instructions>instructions for id</instructions></element><element max-occurs="100" min-occurs="0" name="location" type="string" /><element max-occurs="1" name="enum" type="enumeration"><restriction base="enumeration"><value>val1</value><value>val2</valu

In [108]:
# Build the arguments describing the `py_test` document type: four header
# fields followed by a <definition> holding one string element per name.
args = mfclient.XmlStringWriter("args")

header_fields = (
    ("type", "proj-marsupial_genomics-1128.4.19:py_test"),
    ("label", "proj-marsupial_genomics-1128.4.19:py_test"),
    ("description", "test create metadata doc"),
    ("instructions", "instructions for metadata doc"),
)
for field_name, field_text in header_fields:
    args.add(field_name, field_text)

# Metadata definitions
args.push("definition")
for element_name in ("id", "location"):
    args.push("element", attributes={"name": element_name, "max-occurs": 1, "type": "string"})
    args.add("description", "description for " + element_name)
    args.add("instructions", "instructions for " + element_name)
    args.pop()  # close <element>
args.pop()  # close <definition>

print(args.doc_text())

<args><type>proj-marsupial_genomics-1128.4.19:py_test</type><label>proj-marsupial_genomics-1128.4.19:py_test</label><description>test create metadata doc</description><instructions>instructions for metadata doc</instructions><definition><element type="string" name="id" max-occurs="1"><description>description for id</description><instructions>instructions for id</instructions></element><element type="string" name="location" max-occurs="1"><description>description for location</description><instructions>instructions for location</instructions></element></definition></args>


In [109]:
# Send the assembled arguments to the asset.doc.type.create service.
# NOTE(review): this changes server state; behaviour when the cell is re-run
# for an existing type is not visible here - confirm before re-running.
result = con.execute("asset.doc.type.create", args.doc_text())

In [112]:
# Print the asset.doc.type.create response.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result)

<result><proj-marsupial_genomics-1128.4.19:py_test xmlns:proj-marsupial_genomics-1128.4.19="proj-marsupial_genomics-1128.4.19" version="1" /></result>


-----

## MPP metadata test

In [117]:
# Load the default (first) sheet of the metadata spreadsheet into a DataFrame
metadata_elements = pd.read_excel("../metadata_elements.xlsx")

In [118]:
# Preview the first rows of the sheet
metadata_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,id,ID number on animal's ear tag,Usually a four-digit integer. Should not have ...,Mandatory,string,,Each sample should only have one 'general' met...
1,sex,Sex of the animal,"Allowed values: male, female, hermaphrodite, u...",Mandatory,enumeration,,
2,country,The name of the country or major administrativ...,"For this study, only relevant value is Austral...",Mandatory,string,Australia,
3,state_or_region,The name of the next smaller administrative re...,State name written out in full (no abbreviatio...,Mandatory,string,Victoria,
4,taxon_id,ID number for this taxon from the NCBI Taxonom...,ID number results from https://www.ncbi.nlm.ni...,Mandatory,integer,38600,


In [119]:
# Load the sheet at position 2 (positions are zero-based, so the third sheet).
# NOTE(review): presumably the "capture" metadata sheet, going by the variable name.
capture_metadata_elements = pd.read_excel("../metadata_elements.xlsx", sheet_name=2)

In [120]:
# Preview the first rows of the sheet
capture_metadata_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,id,ID number. The animals with these metadata fie...,Usually a four-digit integer. Should not have ...,Mandatory,string,,
1,year,Year of recording,Year of recording details of the animal,Mandatory,integer,,
2,source,Text describing the source population,"E.g. Healesville Sanctuary zoo release, Mount ...",Mandatory,string,,
3,sex,Sex of the animal,"Allowed values: male, female, hermaphrodite, u...",Mandatory,enumeration,,
4,weight,Weight of the animal in grams,"Numeric. Allowed missing values are unknown, n...",Mandatory,double,,Note: columns from weight to other_notes are i...


In [125]:
# List the element names defined in the first sheet
metadata_elements["element"]

0                  id
1                 sex
2             country
3     state_or_region
4            taxon_id
5              phylum
6               class
7               order
8              family
9               genus
10            species
11         subspecies
12        common_name
13     alternative_id
Name: element, dtype: object

In [142]:
# Convert the sheet to a dict keyed by row index, each value being a
# {column name: cell value} dict (transposing first makes to_dict() nest by row).
metadata_elements_dict = metadata_elements.transpose().to_dict()

In [184]:
# Allowed values per enumeration-typed element, keyed by element name. The
# list for "sex" is still empty. For reference, an enumeration element in a
# document type definition looks like:
# <restriction base="enumeration"><value>val1</value><value>val2</value><value>val3</value></restriction><value as="default">val1</value></element>
enumeration_values = dict(sex=[])

In [193]:
# temp change enumeration to string
# (key 1 is the "sex" row, the enumeration-typed element in the head() output).
# NOTE(review): this mutates metadata_elements_dict in place - presumably a
# stop-gap because add_elements() does not write enumeration restrictions; confirm.
metadata_elements_dict[1]["type"] = "string"

In [194]:
def add_elements(args, metadata_dict, max_occurs=1):
    """Append a ``definition`` block with one ``element`` per metadata entry.

    Parameters
    ----------
    args : XML writer
        Object exposing ``push(name, attributes=...)``, ``add(name, value)``
        and ``pop()``; it is modified in place.
    metadata_dict : dict
        Maps a row key to a dict with the keys ``"element"`` (element name),
        ``"type"``, ``"definition"`` and ``"instructions"``. Elements are
        written in the iteration order of this dict.
    max_occurs : optional
        Value of the ``max-occurs`` attribute given to every element
        (default 1, the previously hard-coded value).

    Returns
    -------
    The same ``args`` object, with the ``definition`` block closed again.
    """
    args.push("definition")
    for index, row in metadata_dict.items():
        # Trace which row is being written; print() with a single argument
        # behaves the same on Python 2 and Python 3.
        print(index)
        attr = {"name": row["element"],
                "max-occurs": max_occurs,
                "type": row["type"]}
        args.push("element", attributes=attr)
        args.add("description", row["definition"])
        args.add("instructions", row["instructions"])
        args.pop()  # close <element>
    args.pop()  # close <definition>
    return args

In [195]:
# Build the arguments describing the `mpp_general_test` document type:
# header fields first, then one element per entry of metadata_elements_dict.
args = mfclient.XmlStringWriter("args")

header_fields = (
    ("type", "proj-marsupial_genomics-1128.4.19:mpp_general_test"),
    ("label", "proj-marsupial_genomics-1128.4.19:mpp_general_test"),
    ("description", "test create metadata doc"),
    ("instructions", "instructions for metadata doc"),
)
for field_name, field_text in header_fields:
    args.add(field_name, field_text)

# Metadata definitions
args = add_elements(args=args, metadata_dict=metadata_elements_dict)

0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [None]:
# args.doc_text()

In [197]:
# Send the assembled arguments to the asset.doc.type.create service.
# NOTE(review): this changes server state; behaviour when the cell is re-run
# for an existing type is not visible here - confirm before re-running.
result = con.execute("asset.doc.type.create", args.doc_text())

In [199]:
# Print the asset.doc.type.create response.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(result)

<result><proj-marsupial_genomics-1128.4.19:mpp_general_test xmlns:proj-marsupial_genomics-1128.4.19="proj-marsupial_genomics-1128.4.19" version="1" /></result>


-----

## Close connection to Mediaflux

In [200]:
# Close the connection that was opened with con.open()
con.close()