# MPP Metadata

Input MPP metadata into Mediaflux. Each possum has it's own asset file with metadata attached.

In [1]:
import sys
import logging
from Crypto.Cipher import AES
import pandas as pd
import random
import re
import os
import datetime
import time

In [2]:
sys.path.insert(0, 'python-mfclient/src')
import mfclient

In [4]:
logging.basicConfig(
    filename="mf_2018-02-05.log",
    level=logging.DEBUG,
    filemode="a",
    format='%(asctime)s %(levelname)s - %(message)s',
    datefmt='%m-%d-%Y %H:%M:%S')

-----

## Load files

In [5]:
# Possum data
general_data = pd.read_table("../output/general.tsv")
capture_data = pd.read_table("../output/wild_capture_records.tsv")
special_data = pd.read_table("../output/special_records.tsv")
microsat_data = pd.read_table("../output/microsat_data.tsv")

In [6]:
# Metadata elements
general_elements = pd.read_excel("../metadata_elements.xlsx", sheet_name=0)
capture_elements = pd.read_excel("../metadata_elements.xlsx", sheet_name=1)
special_elements = pd.read_excel("../metadata_elements.xlsx", sheet_name=2)
microsat_elements = pd.read_excel("../metadata_elements.xlsx", sheet_name=3)

-----

## Connect to MF server

In [32]:
with open("keys/key") as f:
    key = f.read().strip()
with open("keys/iv") as f:
    iv = f.read().strip()
obj = AES.new(key, AES.MODE_CFB, iv)

In [33]:
with open("/Users/jess/.ssh/encrypted_pw.txt") as f:
    pw = f.read().strip()

In [34]:
MF_HOST = "mediaflux.vicnode.org.au"
MF_PORT = 443
MF_TRANSPORT = "https"
MF_DOMAIN = "aaf"
MF_USER = "unimelb:jessicac"
MF_PASSWORD = obj.decrypt(pw)

In [78]:
con = mfclient.MFConnection(host=MF_HOST,
                            port=MF_PORT,
                            transport=MF_TRANSPORT,
                            domain=MF_DOMAIN,
                            user=MF_USER,
                            password=MF_PASSWORD)

In [79]:
logging.info("Connecting to mediaflux.")
con.open()
result = con.execute("server.version")

In [80]:
result.tostring()

'<result><ant-version>Apache Ant 1.9.4</ant-version><binary>aserver</binary><build-time>31-Jan-2018 16:49:25 AEDT</build-time><built-by>Arcitecta. Pty. Ltd.</built-by><created-by>1.8.0_111-b14 (Oracle Corporation)</created-by><manifest-version>1.0</manifest-version><target-jvm>1.7</target-jvm><vendor>Arcitecta Pty. Ltd.</vendor><version>4.6.034</version></result>'

-----

## Create asset files

Create an asset file for each possum ID

In [38]:
ids = list(general_data["id"])
len(ids)

757

In [39]:
matches = [re.search("\d+", x) for x in ids]

In [40]:
[i for i, x in enumerate(matches) if x == None]

[747]

In [41]:
ids[747]

'untagged'

In [42]:
# Remove "untagged" value from IDs
ids = [x for x in ids if x != "untagged"]
len(ids)

756

In [43]:
# Find asset files already in directory
asset_directory = "/projects/proj-marsupial_genomics-1128.4.19/Burramys/Possums"

args = mfclient.XmlStringWriter("args")
args.add("where", "namespace={}".format(asset_directory))
args.add("action", "get-path")
args.add("size", "infinity")
result = con.execute("asset.query", args.doc_text())

In [48]:
result.elements("path") == None

True

In [49]:
if result.elements("path"):
    for path_element in result.elements("path")[0:5]:
        path = path_element.value()     # get value inside 'path' element
        id = path_element.value("@id")  # get attribute value of 'id' attribute in 'path' element
        print("\t" + id + " - " + path)

In [50]:
if result.elements("path"):
    ids_in_directory = [os.path.basename(x.value()) for x in result.elements("path")]
else:
    ids_in_directory = []
len(ids_in_directory)

0

In [51]:
# Get list of IDs to create
mpp_ids = ids
ids_to_create = [x for x in mpp_ids if x not in ids_in_directory]
len(ids_to_create)

756

In [52]:
logging.info("Creating asset files with asset.get")
for possum_id in ids_to_create:
    args = mfclient.XmlStringWriter("args")
    args.add("namespace", asset_directory)
    args.add("name", possum_id)
    result = con.execute("asset.create", args.doc_text())
    asset_id = result.value("id")
    logging.info("Possum ID {} asset file created (asset id {})".format(possum_id, asset_id))

-----

## Process metadata and data

- Sanity check columns
- Enumeration values
- Change date format

In [53]:
general_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,id,ID number on animal's ear tag,Usually a four-digit integer. Should not have ...,Mandatory,string,,Each sample should only have one 'general' met...
1,sex,Sex of the animal,"Allowed values: male, female, hermaphrodite, u...",Mandatory,enumeration,,
2,country,The name of the country or major administrativ...,"For this study, only relevant value is Austral...",Mandatory,string,Australia,
3,state_or_region,The name of the next smaller administrative re...,State name written out in full (no abbreviatio...,Mandatory,string,Victoria,
4,taxon_id,ID number for this taxon from the NCBI Taxonom...,ID number results from https://www.ncbi.nlm.ni...,Mandatory,integer,38600,


In [54]:
capture_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,id,ID number on animal's ear tag,Usually a four-digit integer. Should not have ...,Mandatory,string,,Expect most samples to have multiple capture r...
1,date,Date of capture,Date of capture.,Mandatory,date,,
2,location,Text describing the mountain and sub-populatio...,"E.g. Mount Buller, Federation, above 350. Allo...",Mandatory,string,,
3,sex,Sex of the animal,"Allowed values: male, female, hermaphrodite, u...",Mandatory,enumeration,,
4,life_stage,The age class or life stage of the animal at t...,Terms from Uberon ontology http://www.ebi.ac.u...,Optional,enumeration,,"#WIP\nCurrent values are: juvenile, immature a..."


In [55]:
special_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,id,ID number. The animals with these metadata fie...,Usually a four-digit integer. Should not have ...,Mandatory,string,,
1,year,Year of recording,Year of recording details of the animal,Mandatory,integer,,
2,source,Text describing the source population,"E.g. Healesville Sanctuary zoo release, Mount ...",Mandatory,string,,
3,sex,Sex of the animal,"Allowed values: male, female, hermaphrodite, u...",Mandatory,enumeration,,
4,weight,Weight of the animal in grams,"Numeric. Allowed missing values are unknown, n...",Optional,double,,Note: columns from weight to other_notes are i...


In [56]:
microsat_elements.head()

Unnamed: 0,element,definition,instructions,requirement,type,default value,notes
0,index,Unique identifier. Necessary due to assays bei...,Integer in spreadsheet. Incrementally increase...,Mandatory,integer,,
1,id,ID number on animal's ear tag,Usually a four-digit integer. Should not have ...,Mandatory,string,,
2,BP1_FAM_allele1,Size (in base pairs) of the shorter allele obs...,Integer with field left blank for missing valu...,Optional,integer,,
3,BP1_FAM_allele2,Size (in base pairs) of the longer allele obse...,Integer with field left blank for missing valu...,Optional,integer,,
4,BP2_FAM_allele1,Size (in base pairs) of the shorter allele obs...,Integer with field left blank for missing valu...,Optional,integer,,


In [57]:
# Check order of metadata definitions is the same as the data columns
print list(general_elements["element"]) == list(general_data.columns)
print list(capture_elements["element"]) == list(capture_data.columns)
print list(special_elements["element"]) == list(special_data.columns)
print list(microsat_elements["element"]) == list(microsat_data.columns)

True
True
True
True


In [58]:
general_elements_dict = general_elements.transpose().to_dict()
capture_elements_dict = capture_elements.transpose().to_dict()
special_elements_dict = special_elements.transpose().to_dict()
microsat_elements_dict = microsat_elements.transpose().to_dict()

In [59]:
def get_enumeration_elements(metadata_dict):
    enumeration_values = []
    for e in metadata_dict:
        if metadata_dict[e]["type"] == "enumeration":
            element_name = metadata_dict[e]["element"]
            enumeration_values.append(element_name)
    return enumeration_values

In [60]:
enumeration_elements = \
    get_enumeration_elements(metadata_dict=general_elements_dict) + \
    get_enumeration_elements(metadata_dict=capture_elements_dict) + \
    get_enumeration_elements(metadata_dict=special_elements_dict) + \
    get_enumeration_elements(metadata_dict=microsat_elements_dict)
enumeration_elements = list(set(enumeration_elements))

In [61]:
all_data = [general_data, capture_data, special_data, microsat_data]

In [62]:
enumeration_dict = {}
for e in enumeration_elements:
    possible_values = []
    for df in all_data:
        if e in df.columns:
            possible_values += list(set(list(df[e])))
    # Remove NaN values
    possible_values = [x for x in possible_values if type(x) == str or not pd.np.isnan(x)]
    # Convert floats to integers to strings
    possible_values = [str(int(x)) if type(x) == float else x for x in possible_values]
    possible_values = list(set(possible_values))
    possible_values.sort()
    enumeration_dict[e] = possible_values
    print e, possible_values

life_stage ['adult', 'immature adult', 'juvenile']
sex ['female', 'male', 'not determined']
wild_captive ['captive', 'wild']
lactation_status ['advanced post lactating', 'lactating', 'not lactating', 'post lactating']
pregnancy_status ['likely not pregnant', 'likely pregnant', 'not pregnant', 'pregnant']
number_of_pouch_young ['0', '1', '2', '3', '4', 'undetermined']


In [63]:
# Manually re-order values
enumeration_dict = {
    'lactation_status': ['not lactating', 'lactating', 'post lactating', 'advanced post lactating'],
    'life_stage': ['adult', 'immature adult', 'juvenile'],
    'number_of_pouch_young': ['0', '1', '2', '3', '4', 'undetermined'],
    'pregnancy_status': ['not pregnant', 'likely not pregnant', 'likely pregnant', 'pregnant'],
    'sex': ['female', 'male', 'hermaphrodite', 'not determined'],
    'wild_captive': ['captive', 'wild']
}

In [64]:
# Need to change date format from YYYY-MM-DD to DD-MM-YYY
def flip_date(date):
    match = re.match("(\d{4})-(\d{2})-(\d{2})", date).groups()
    # new_format = "{}-{}-{}".format(match[2], match[1], match[0])
    new_format = datetime.date(int(match[0]),int(match[1]),int(match[2])).strftime("%d-%b-%Y")
    return new_format

capture_data.date = [flip_date(x) for x in capture_data.date]

In [65]:
capture_data.date.head()

0    28-Oct-2002
1    06-Nov-2010
2    07-Nov-2010
3    08-Nov-2010
4    09-Nov-2010
Name: date, dtype: object

In [66]:
# Change number_of_pouch_young to string in special_data
special_data.number_of_pouch_young = [str(int(x)) if not pd.np.isnan(x) else x 
                                      for x in special_data.number_of_pouch_young]

In [67]:
# Change microsat data from float to string
microsat_columns = [x for x in microsat_data.columns if re.search("allele\d$", x)]

for col in microsat_columns:
    # print col,
    microsat_data[[col]] = [str(int(x)) if not pd.np.isnan(x) else x for x in microsat_data[col]]

In [68]:
microsat_data.head()

Unnamed: 0,index,id,BP1_FAM_allele1,BP1_FAM_allele2,BP2_FAM_allele1,BP2_FAM_allele2,BP3_NED_allele1,BP3_NED_allele2,BP11_FAM_allele1,BP11_FAM_allele2,...,BC29_NED_allele1,BC29_NED_allele2,BC32_PET_allele1,BC32_PET_allele2,BC34_PET_allele1,BC34_PET_allele2,BC35_PET_allele1,BC35_PET_allele2,BC36_PET_allele1,BC36_PET_allele2
0,1,34,311,311,249,253,297,318,195,203,...,114,117,0,0,0,0,0.0,0.0,177,183
1,2,67,311,311,253,253,0,0,199,201,...,116,116,137,137,137,137,0.0,0.0,177,179
2,3,102,311,311,251,253,0,0,191,199,...,116,116,137,137,131,131,0.0,0.0,179,183
3,4,202,0,0,0,0,303,303,195,195,...,117,117,141,141,137,137,,,177,177
4,5,202,311,311,245,245,303,303,195,195,...,126,126,141,141,137,137,157.0,157.0,177,177


-----

## Create metadata definitions

Four metadata definition documents:
 - general
 - capture
 - special
 - microsat

In [None]:
# # Example of enumeration syntax: proj-marsupial_genomics-1128.4.19:test
# args = mfclient.XmlStringWriter("args")
# args.add("type", "proj-marsupial_genomics-1128.4.19:test")
# result = con.execute("asset.doc.type.describe", args.doc_text())

In [None]:
# print result

In [None]:
# <element max-occurs="1" name="enum" type="enumeration">
#   <restriction base="enumeration">
#     <value>val1</value>
#     <value>val2</value>
#     <value>val3</value>
#   </restriction>
#   <value as="default">val1</value>
# </element>

In [69]:
def add_elements(args, metadata_dict):
    args.push("definition")
    for index in metadata_dict:
        # print index
        if metadata_dict[index]["requirement"] == "Optional":
            min_occurs = 0
        else:
            min_occurs = 1
        attr = {"name": metadata_dict[index]["element"],
                "max-occurs": 1,
                "min-occurs": min_occurs,
                "type": metadata_dict[index]["type"]}
        args.push("element", attributes=attr)
        args.add("description", metadata_dict[index]["definition"])
        args.add("instructions", metadata_dict[index]["instructions"])
        if metadata_dict[index]["type"] == "enumeration":
            args.push("restriction", attributes={"base": "enumeration"})
            for val in enumeration_dict[metadata_dict[index]["element"]]:
                args.add("value", val)
            args.pop() # end restriction
        if str(metadata_dict[index]["default value"]) != "nan":
            args.add("value", metadata_dict[index]["default value"], attributes={"as": "default"})
        args.pop() # end element
    args.pop() # end definition
    return args

#### General document

In [81]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_general"

args = mfclient.XmlStringWriter("args")
args.add("type", "{}:{}".format(project_namespace, document_name))
args.add("label", "{}:{}".format(project_namespace, document_name))
args.add("description", "General possum information")
args.add("instructions", 
         "This metadata document holds the general information of a possum. " \
         "There should be one mmp_general document for each possum.")

# Metadata definitions
args = add_elements(args=args, metadata_dict=general_elements_dict)

In [82]:
# args.doc_text()

In [83]:
logging.info("Creating metadata document: {}:{}".format(project_namespace, document_name))
result = con.execute("asset.doc.type.create", args.doc_text())
logging.info(result)

#### Capture document

In [84]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_capture"

args = mfclient.XmlStringWriter("args")
args.add("type", "{}:{}".format(project_namespace, document_name))
args.add("label", "{}:{}".format(project_namespace, document_name))
args.add("description", "Capture record information")
args.add("instructions", 
         "This metadata document holds a capture record of a possum. " \
         "There can be many mpp_capture documents attached to each possum " \
         "(one for each instance of capture).")

# Metadata definitions
args = add_elements(args=args, metadata_dict=capture_elements_dict)

# Submit
logging.info("Creating metadata document: {}:{}".format(project_namespace, document_name))
result = con.execute("asset.doc.type.create", args.doc_text())
logging.info(result)

#### Special document

In [85]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_special"

args = mfclient.XmlStringWriter("args")
args.add("type", "{}:{}".format(project_namespace, document_name))
args.add("label", "{}:{}".format(project_namespace, document_name))
args.add("description", "Special possum information")
args.add("instructions", 
         "This metadata document holds records of possums which are non-capture " \
         "records. ID number. These possums are either Zoo releases from " \
         "Healesville Sanctuary, males from Hotham, or males from Timms Spur.")

# Metadata definitions
args = add_elements(args=args, metadata_dict=special_elements_dict)

# Submit
logging.info("Creating metadata document: {}:{}".format(project_namespace, document_name))
result = con.execute("asset.doc.type.create", args.doc_text())
logging.info(result)

#### Microsat document

In [86]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_microsat"

args = mfclient.XmlStringWriter("args")
args.add("type", "{}:{}".format(project_namespace, document_name))
args.add("label", "{}:{}".format(project_namespace, document_name))
args.add("description", "Microsatellite information")
args.add("instructions", 
         "This metadata document holds information of microsatellite markers " \
         "for a particular possum. There is usually zero or one documents " \
         "attached to a possum, but there may be multiple instances if a " \
         "possum was genotyped multiple times.")

# Metadata definitions
args = add_elements(args=args, metadata_dict=microsat_elements_dict)

# Submit
logging.info("Creating metadata document: {}:{}".format(project_namespace, document_name))
result = con.execute("asset.doc.type.create", args.doc_text())
logging.info(result)

-----

## Populate metadata

Set metadata for each line in the dataframe

In [87]:
# Get asset IDs of samples and store in dict
asset_directory = "/projects/proj-marsupial_genomics-1128.4.19/Burramys/Possums"

args = mfclient.XmlStringWriter("args")
args.add("where", "namespace={}".format(asset_directory))
args.add("action", "get-path")
args.add("size", "infinity")
result = con.execute("asset.query", args.doc_text())

In [88]:
# Create lookup table for sample id -> asset id
sample_id_asset_id_lookup = {}
for path_element in result.elements("path"):
    asset_id = path_element.value("@id")
    sample_id = os.path.basename(path_element.value())
    sample_id_asset_id_lookup[sample_id] = asset_id

In [89]:
def set_metadata(asset_id, project_namespace, document_name, data_dict,
                 row, action="merge"):
    # action: add|merge|remove|replace
    args = mfclient.XmlStringWriter("args")
    args.add("id", asset_id)
    args.push("meta", attributes={"action": action})
    args.push("{}:{}".format(project_namespace, document_name))
    for key, value in data_dict[row].items():
        if str(value) != "nan":
            args.add(key, value)
    args.pop() # end namespace:metadata_document
    args.pop() # end meta
    logging.info("Setting metadata for asset id " + asset_id)
    result = con.execute("asset.set", args.doc_text())
#     print args.doc_text()

#### General data

In [98]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_general"
data = general_data
data_dict = data.transpose().to_dict()

In [97]:
for row in data_dict:
    possum_id = data_dict[row]["id"]
    try:
        asset_id = sample_id_asset_id_lookup[possum_id]
        set_metadata(asset_id=asset_id, project_namespace=project_namespace,
                    document_name=document_name, data_dict=data_dict, row=row,
                    action="merge")
    except:
        logging.info("Skipping " + possum_id)
    time.sleep(0.5)

#### Capture data

In [100]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_capture"
data = capture_data
data_dict = data.transpose().to_dict()

In [101]:
for row in data_dict:
    possum_id = data_dict[row]["id"]
    try:
        asset_id = sample_id_asset_id_lookup[possum_id]
        set_metadata(asset_id=asset_id, project_namespace=project_namespace,
                    document_name=document_name, data_dict=data_dict, row=row,
                    action="add")
    except:
        logging.info("Skipping " + possum_id)
    time.sleep(0.5)

#### Special data

In [102]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_special"
data = special_data
data_dict = data.transpose().to_dict()

In [103]:
for row in data_dict:
    possum_id = data_dict[row]["id"]
    try:
        asset_id = sample_id_asset_id_lookup[possum_id]
        set_metadata(asset_id=asset_id, project_namespace=project_namespace,
                    document_name=document_name, data_dict=data_dict, row=row,
                    action="merge")
    except:
        logging.info("Skipping " + possum_id)
    time.sleep(0.5)

#### Microsat metadata

In [104]:
project_namespace = "proj-marsupial_genomics-1128.4.19"
document_name = "mpp_microsat"
data = microsat_data
data_dict = data.transpose().to_dict()

In [107]:
for row in data_dict:
    possum_id = data_dict[row]["id"]
    try:
        asset_id = sample_id_asset_id_lookup[possum_id]
        set_metadata(asset_id=asset_id, project_namespace=project_namespace,
                    document_name=document_name, data_dict=data_dict, row=row,
                    action="add")
    except:
        logging.info("Skipping " + possum_id)
    time.sleep(0.5)

-----

## Close connection to Mediaflux

In [108]:
logging.info("Closing connection to mediaflux.")
con.close()