<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Mediaflux-query-dump" data-toc-modified-id="Mediaflux-query-dump-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Mediaflux query dump</a></span></li><li><span><a href="#Connect-to-MF-server" data-toc-modified-id="Connect-to-MF-server-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Connect to MF server</a></span></li><li><span><a href="#Find-assets-in-Mediaflux" data-toc-modified-id="Find-assets-in-Mediaflux-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Find assets in Mediaflux</a></span></li><li><span><a href="#Get-metadata-documents" data-toc-modified-id="Get-metadata-documents-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Get metadata documents</a></span></li><li><span><a href="#For-each-asset,-get-metadata" data-toc-modified-id="For-each-asset,-get-metadata-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>For each asset, get metadata</a></span><ul class="toc-item"><li><span><a href="#Data" data-toc-modified-id="Data-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Data</a></span></li><li><span><a href="#Libraries" data-toc-modified-id="Libraries-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Individuals" data-toc-modified-id="Individuals-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Individuals</a></span></li></ul></li><li><span><a href="#Get-library-archive-size-from-data-dataframe" data-toc-modified-id="Get-library-archive-size-from-data-dataframe-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Get library archive size from data dataframe</a></span></li><li><span><a href="#Output-to-file" data-toc-modified-id="Output-to-file-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Output to file</a></span><ul class="toc-item"><li><span><a href="#Save-to-tsv" data-toc-modified-id="Save-to-tsv-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Save to tsv</a></span></li><li><span><a href="#Save-to-xlsx" 
data-toc-modified-id="Save-to-xlsx-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Save to xlsx</a></span></li></ul></li><li><span><a href="#Save-query-cache" data-toc-modified-id="Save-query-cache-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Save query cache</a></span></li><li><span><a href="#Close-connection-to-Mediaflux" data-toc-modified-id="Close-connection-to-Mediaflux-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Close connection to Mediaflux</a></span></li></ul></div>

In [1]:
# Run date: embedded in the log, cache and output file names used by later cells.
date = "2018-09-12"
# When True, a previously pickled query cache for this date is reloaded if the file exists.
load_cache = True
# NOTE(review): presumably the retry budget for Mediaflux requests that time out — confirm intended meaning.
max_retry = 3

In [2]:
# TODO: Get metadata documents and check max occurrences for each field is 1

-----

# Mediaflux query dump

Get all assets in the `proj-hoffmann_data-1128.4.49` project. Extract all metadata into dataframes and output to files.


In [3]:
import sys
import logging
from Crypto.Cipher import AES
import pandas as pd
import random
import re
import os
import datetime
import time
import collections
import pickle

In [4]:
sys.path.insert(0, '../python-mfclient/src')
import mfclient

In [5]:
# Set up logging: one log file per run date, opened in append mode.
log_settings = {
    "filename": "mf_{}.log".format(date),
    "level": logging.DEBUG,
    "filemode": "a",
    "format": '%(asctime)s %(levelname)s - %(message)s',
    "datefmt": '%m-%d-%Y %H:%M:%S',
}
logging.basicConfig(**log_settings)

In [6]:
# Save all Mediaflux get queries in cache.
# The cache is reloaded from disk only when `load_cache` is set and the cache
# file for this `date` exists; otherwise it starts empty.
cache_filename = "cache_{}.pkl".format(date)
if load_cache and os.path.exists(cache_filename):
    logging.info("Loading query cache from file: {}".format(cache_filename))
    # SECURITY: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    # The context manager closes the file handle, which the bare open() did not.
    with open(cache_filename, "rb") as cache_file:
        query_cache = pickle.load(cache_file)
else:
    query_cache = {}

In [7]:
def get_asset(asset_id, retries=None, retry_delay=1.0):
    """Return the ``asset.get`` result for ``asset_id``, going through the query cache.

    A cached result is returned without contacting the server. Otherwise the
    asset is requested over the module-level connection ``con``; a failed
    request is retried before giving up, and a successful result is stored in
    ``query_cache``.

    Parameters
    ----------
    asset_id : hashable
        Asset ID, used both as the ``id`` argument and as the cache key.
    retries : int, optional
        Number of additional attempts after the first failure. Defaults to the
        notebook-level ``max_retry`` setting.
    retry_delay : float, optional
        Seconds to wait between attempts (default 1.0; 0 disables the wait).

    Returns
    -------
    The object returned by ``con.execute("asset.get", ...)``.

    Raises
    ------
    Exception
        Whatever the last failed ``con.execute`` call raised, once all
        attempts are exhausted. Nothing is cached in that case.
    """
    if asset_id in query_cache:
        logging.info("Retrieving {} from cache".format(asset_id))
        return query_cache[asset_id]

    # Resolve the default lazily so the function can be defined before, and
    # called with an explicit value independently of, the config cell.
    if retries is None:
        retries = max_retry
    attempts = max(0, int(retries)) + 1

    args = mfclient.XmlStringWriter("args")
    args.add("id", asset_id)
    for attempt in range(1, attempts + 1):
        logging.info("Getting asset info for {}.".format(asset_id))
        try:
            query = con.execute("asset.get", args.doc_text())
        except Exception as err:
            # The client's timeout exception type is not visible here, so any
            # request failure is treated as retryable; the final one propagates.
            if attempt == attempts:
                logging.error("Asset {}: giving up after {} attempt(s): {}"
                              .format(asset_id, attempts, err))
                raise
            logging.warning("Asset {}: attempt {} of {} failed ({}). Retrying."
                            .format(asset_id, attempt, attempts, err))
            if retry_delay > 0:
                time.sleep(retry_delay)
        else:
            break
    query_cache[asset_id] = query
    return query

-----

# Connect to MF server

In [8]:
# Read the AES key and initialisation vector from local files (surrounding
# whitespace stripped) and build the CFB-mode cipher that is later used to
# decrypt the stored Mediaflux password.
with open("keys/key") as f:
    key = f.read().strip()
with open("keys/iv") as f:
    iv = f.read().strip()
# NOTE(review): key and iv are read in text mode; presumably the AES library in use accepts str here — confirm.
obj = AES.new(key, AES.MODE_CFB, iv)

In [9]:
# Location of the encrypted Mediaflux password. Defaults to the original
# hard-coded path; set MF_ENCRYPTED_PW_FILE to run the notebook on another
# machine or account without editing this cell.
encrypted_pw_path = os.environ.get("MF_ENCRYPTED_PW_FILE",
                                   "/Users/jess/.ssh/encrypted_pw.txt")
with open(encrypted_pw_path) as f:
    pw = f.read().strip()

In [10]:
# Connection settings passed to mfclient.MFConnection in the next cell.
MF_HOST = "mediaflux.vicnode.org.au"
MF_PORT = 443
MF_TRANSPORT = "https"
MF_DOMAIN = "aaf"
MF_USER = "unimelb:jessicac"
# Decrypt the stored password with the AES cipher (`obj`) built from the key/iv files.
MF_PASSWORD = obj.decrypt(pw)

In [11]:
# Build the connection object from the settings above; it is opened in the next cell.
con = mfclient.MFConnection(host=MF_HOST,
                            port=MF_PORT,
                            transport=MF_TRANSPORT,
                            domain=MF_DOMAIN,
                            user=MF_USER,
                            password=MF_PASSWORD)

In [12]:
# Open the connection and request the server version; the result is displayed in the next cell.
logging.info("Connecting to mediaflux.")
con.open()
result = con.execute("server.version")

In [13]:
result.tostring()

'<result><ant-version>Apache Ant 1.9.4</ant-version><binary>aserver</binary><build-time>02-Jul-2018 14:07:13 AEST</build-time><built-by>Arcitecta. Pty. Ltd.</built-by><created-by>1.8.0_111-b14 (Oracle Corporation)</created-by><manifest-version>1.0</manifest-version><target-jvm>1.8</target-jvm><vendor>Arcitecta Pty. Ltd.</vendor><version>4.7.027</version></result>'

-----

# Find assets in Mediaflux

Find the assets under `data/`, `individuals/`, and `libraries/` on the Mediaflux server and record each asset's ID.

In [14]:
# Query for data directory: list the path of every asset in the data namespace.
data_request = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
    ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/data"),
    ("action", "get-path"),
    ("size", "infinity"),
]:
    data_request.add(arg_name, arg_value)
data_query = con.execute("asset.query", data_request.doc_text())

In [15]:
# Query for individual directory: list the path of every asset in the individuals namespace.
individuals_request = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
    ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/individuals"),
    ("action", "get-path"),
    ("size", "infinity"),
]:
    individuals_request.add(arg_name, arg_value)
individuals_query = con.execute("asset.query", individuals_request.doc_text())

In [16]:
# Query for library directory: list the path of every asset in the libraries namespace.
libraries_request = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
    ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/libraries"),
    ("action", "get-path"),
    ("size", "infinity"),
]:
    libraries_request.add(arg_name, arg_value)
libraries_query = con.execute("asset.query", libraries_request.doc_text())

In [17]:
libraries_query.tostring()[:1000]

'<result><path id="35280914" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P4_CA5YJANXX</path><path id="35280919" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P5_CA5YJANXX</path><path id="35280920" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P6_CA5YJANXX</path><path id="35547226" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL1</path><path id="35547227" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL2</path><path id="35547228" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL3</path><path id="36551235" version="4">/projects/proj-hoffmann_data-1128.4.49/libraries/ASH_LIB_02_BRA</path><path id="36551336" version="3">/projects/proj-hoffmann_data-1128.4.49/libraries/ASH_LIB_01_QLD</path><path id="36609035" version="3">/projects/proj-hoffmann_data-1128.4.49/libraries/RLEM_SA_2018</path><path id="36609036" version="1">/projects/proj-hoffmann_data-1128.4.49/libraries/tom_albo_2018</pat

In [18]:
def create_asset_id_dataframe(query):
    """Build a dataframe listing the assets found in an ``asset.query`` result.

    Parameters
    ----------
    query : object
        Query result exposing ``elements("path")``. Each returned element must
        provide ``value()`` (the asset path) and ``value("@id")`` (the asset ID).

    Returns
    -------
    pandas.DataFrame
        One row per path element with columns ``asset_id``, ``asset_name``
        (the basename of the path) and ``full_path``. A result without any
        path elements yields an empty frame that still has these columns.
    """
    columns = ["asset_id", "asset_name", "full_path"]
    rows = []
    # `or []` covers a result with no path elements, in case the client
    # returns None rather than an empty list for it.
    for path_element in query.elements("path") or []:
        full_path = path_element.value()
        asset_id = path_element.value("@id")
        rows.append([asset_id, os.path.basename(full_path), full_path])
    # Naming the columns in the constructor keeps the schema for zero rows;
    # assigning `.columns` afterwards raised a length mismatch in that case.
    return pd.DataFrame(rows, columns=columns)

In [19]:
# Build one asset-listing dataframe per namespace query.
data_df, individuals_df, libraries_df = [
    create_asset_id_dataframe(namespace_query)
    for namespace_query in (data_query, individuals_query, libraries_query)
]

In [20]:
# Show how many assets were found in each namespace.
for listing in (data_df, individuals_df, libraries_df):
    print(listing.shape)

(43, 3)
(1082, 3)
(29, 3)


-----

# Get metadata documents



In [21]:
# TODO

-----

# For each asset, get metadata



## Data

Get size of each asset in `data/`

In [22]:
asset_id_list = list(data_df["asset_id"])

In [23]:
# Collect, for every data asset, the size element's "h" attribute and its value.
human_size = []
size = []
for asset_id in asset_id_list:
    query = get_asset(asset_id)
    size_element = query.element("asset/content/size")
    human_size.append(size_element.attribute("h"))
    size.append(size_element.value())

In [24]:
data_df["approx_size"] = human_size
data_df["size_in_bytes"] = size

In [25]:
data_df.head()

Unnamed: 0,asset_id,asset_name,full_path,approx_size,size_in_bytes
0,35512059,P4_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P4...,28.992 GB,28992107988
1,35512420,P5_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P5...,28.337 GB,28336907055
2,35515752,P6_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P6...,27.79 GB,27790225800
3,35516931,QA_library9_albo_INC_DPS_BKK.zip,/projects/proj-hoffmann_data-1128.4.49/data/QA...,30.134 GB,30133670228
4,35535007,CairnsL1.zip,/projects/proj-hoffmann_data-1128.4.49/data/Ca...,33.758 GB,33757722857


## Libraries

Get metadata from all assets in `libraries/`

In [26]:
asset_id_list = list(libraries_df["asset_id"])

In [27]:
# Query one asset
query = get_asset(asset_id_list[0])

In [28]:
# Get metadata documents
metadata_documents = [x.name() for x in query.element("asset/meta").elements() if x.name() != "mf-revision-history"]
metadata_documents

['proj-hoffmann_data-1128.4.49:libraries_metadata',
 'proj-hoffmann_data-1128.4.49:libraries_storage']

In [29]:
# # Test asset with multiple metadata documents from MPP project
# args = mfclient.XmlStringWriter("args")
# args.add("id", "35532748")
# query = con.execute("asset.get", args.doc_text())
# query.tostring()
# metadata_documents = [x.name() for x in query.element("asset/meta").elements() if x.name() != "mf-revision-history"]
# metadata_documents

In [30]:
# Flatten every metadata document of each library asset into one record per
# asset, keyed by element name.
libraries_metadata_dicts = {}
for asset_id in asset_id_list:
    d = collections.OrderedDict([("asset_id", asset_id)])
    query = get_asset(asset_id)
    metadata_documents = [x.name() for x in query.element("asset/meta").elements()
                          if x.name() != "mf-revision-history"]
    # Not expecting any metadata document to appear multiple times
    assert len(metadata_documents) == len(set(metadata_documents)), \
        "Asset {}: a metadata document appears more than once".format(asset_id)
    for document in metadata_documents:
        for element in query.element("asset/meta/{}".format(document)).elements():
            if element.name() in d:
                # Value handling is unchanged (the later value still wins); the
                # collision is now logged instead of passing silently, as the
                # individuals loop already does for its duplicates.
                logging.warning("Asset {}: Duplicated element {}. Overwriting earlier value."
                                .format(asset_id, element.name()))
            d[element.name()] = element.value()
    libraries_metadata_dicts[asset_id] = d

In [31]:
libraries_metadata_df = pd.DataFrame(libraries_metadata_dicts).T
libraries_metadata_df.head()

Unnamed: 0,asset_id,bp_sequencing,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,library_freezer,library_freezer_location,...,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,size_select_high,size_select_low,stage_at
35280914,35280914,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,,...,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
35280919,35280919,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,,...,P5_CA5YJANXX_NoIndex_L003_R1.fastq.gz|P5_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
35280920,35280920,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,,...,P6_CA5YJANXX_NoIndex_L004_R1.fastq.gz|P6_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
35547226,35547226,,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,,...,cairnsL1_R1_001.fastq.gz | cairnsL1_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22",Aedes aegypti aegypti,,AGRF,CAGRF11131,,,Sequencing completed
35547227,35547227,,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,,...,cairnsL2_R1_001.fastq.gz | cairnsL2_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,CAGRF11131,,,Sequencing completed


In [32]:
# Merge libraries_df with metadata.
# Only the three listing columns are taken from libraries_df, so re-running
# this cell gives the same result instead of merging the metadata a second
# time (which would produce suffixed duplicate columns).
libraries_df = (libraries_df[["asset_id", "asset_name", "full_path"]]
                .merge(libraries_metadata_df, on="asset_id"))

In [33]:
libraries_df.head()

Unnamed: 0,asset_id,asset_name,full_path,bp_sequencing,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,...,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,size_select_high,size_select_low,stage_at
0,35280914,P4_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,...,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
1,35280919,P5_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,...,P5_CA5YJANXX_NoIndex_L003_R1.fastq.gz|P5_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
2,35280920,P6_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,...,P6_CA5YJANXX_NoIndex_L004_R1.fastq.gz|P6_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,,,Sequencing completed
3,35547226,CairnsL1,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,...,cairnsL1_R1_001.fastq.gz | cairnsL1_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22",Aedes aegypti aegypti,,AGRF,CAGRF11131,,,Sequencing completed
4,35547227,CairnsL2,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,...,cairnsL2_R1_001.fastq.gz | cairnsL2_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,CAGRF11131,,,Sequencing completed


## Individuals

Get metadata from all assets in `individuals/`

In [34]:
asset_id_list = list(individuals_df["asset_id"])

In [35]:
# Query one asset
query = get_asset(asset_id_list[0])

In [36]:
# Get metadata documents
metadata_documents = [x.name() for x in query.element("asset/meta").elements() if x.name() != "mf-revision-history"]
metadata_documents

['proj-hoffmann_data-1128.4.49:individuals_metadata',
 'proj-hoffmann_data-1128.4.49:individuals_storage',
 'proj-hoffmann_data-1128.4.49:individuals_locations',
 'proj-hoffmann_data-1128.4.49:individuals_store-and-retrieve',
 'proj-hoffmann_data-1128.4.49:individuals_name',
 'proj-hoffmann_data-1128.4.49:individuals_assays']

In [37]:
individuals_metadata_dicts = {}
assay_list = []

In [38]:
# Sometimes this times out when getting assets from mediaflux
# TODO: Add retry if timeout
# TODO: Refactor this block
for asset_id in asset_id_list:
    # Skip assets that already have a record (presumably so the cell can be
    # re-run after a timeout without redoing finished assets).
    if asset_id in individuals_metadata_dicts:
        continue
    record = collections.OrderedDict({"asset_id": asset_id})
    query = get_asset(asset_id)
    metadata_documents = [doc.name() for doc in query.element("asset/meta").elements()
                          if doc.name() != "mf-revision-history"]
    # Not expecting any metadata document to appear multiple times
    assert(len(metadata_documents) == len(set(metadata_documents)))
    for document in metadata_documents:
        if document == "proj-hoffmann_data-1128.4.49:individuals_assays":
            # Process individual_assay metadata separately. Each element is another document.
            assay_elements = query.element("asset/meta/proj-hoffmann_data-1128.4.49:individuals_assays").elements()
            for assay in assay_elements:
                assay_record = collections.OrderedDict()
                assay_record["asset_id"] = asset_id
                assay_record["assay_name"] = assay.name()
                for field in assay.elements():
                    if field.name() in assay_record:
                        logging.warning(
                            "Asset {}: Duplicated element {} in {}. Ignoring duplicates."
                            .format(asset_id, field.name(), assay.name()))
                    else:
                        assay_record[field.name()] = field.value()
                assay_list.append(assay_record)
            continue
        # Every other document contributes its elements directly to the asset record;
        # the first value seen for a name is kept.
        for field in query.element("asset/meta/{}".format(document)).elements():
            if field.name() in record:
                logging.warning(
                    "Asset {}: Duplicated element {}. Ignoring duplicates."
                    .format(asset_id, field.name()))
            else:
                record[field.name()] = field.value()
    individuals_metadata_dicts[asset_id] = record

In [39]:
individuals_metadata_df = pd.DataFrame(individuals_metadata_dicts).T
individuals_metadata_df.head()

Unnamed: 0,asset_id,barcode_reference_list,common_name_species,country,date_collection,date_current_update,dev_stage_collected,dev_stage_stored,dna_tube1_conc,dna_tube1_label,...,radseq_library_alias,radseq_library_name,raw_sequence_filename,sampling_scheme,sampling_type,scientific_name_species,sex,specific_location,strain,subregion
35280960,35280960,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,14.6,A001 elution A,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,,Guangzhou City
35281030,35281030,pearg_bc2017,Asian Tiger Mosquito,China,25-Oct-2015,31-Oct-2017,Larva or pupa,Larva or pupa,19.95,A002 elution A,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,,Guangzhou City
35281033,35281033,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,14.69,A003 elution A,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,,Guangzhou City
35281798,35281798,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,25.54,A004 elution A,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,,Guangzhou City
35282328,35282328,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,17.55,A005 elution A,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,,Guangzhou City


In [40]:
# TODO: Change this to get order from the metadata documents, not a particular asset
#       because some assets can be missing fields
# For now the key order of the first asset's record defines the column order.
individuals_metadata_col_order = list(individuals_metadata_dicts[asset_id_list[0]])

In [41]:
# XXX: These columns are missing
missing = [col for col in individuals_metadata_df.columns if col not in individuals_metadata_col_order]
missing

['dna_tube2_conc', 'strain']

In [42]:
individuals_metadata_df = individuals_metadata_df[individuals_metadata_col_order + missing]
individuals_metadata_df.head()

Unnamed: 0,asset_id,date_current_update,person_current_update,scientific_name_species,common_name_species,individual_description,date_collection,person_collection,dev_stage_collected,dev_stage_stored,...,radseq_library_name,radseq_library_alias,raw_sequence_filename,barcode_reference_list,p1_barcode_code_sequence,p2_barcode_code_sequence,individual_code,individual_code_alias,dna_tube2_conc,strain
35280960,35280960,31-Oct-2017,Tom Schmidt,Aedes albopictus,Asian Tiger Mosquito,This individual has been extracted and sequenc...,25-Sep-2015,Dongjing Zhang,Larva or pupa,Larva or pupa,...,P4_CA5YJANXX,P4,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,pearg_bc2017,3 ACGTCA,1 CATGAC,chn_alb_A001,A001,,
35281030,35281030,31-Oct-2017,Tom Schmidt,Aedes albopictus,Asian Tiger Mosquito,This individual has been extracted and sequenc...,25-Oct-2015,Dongjing Zhang,Larva or pupa,Larva or pupa,...,P4_CA5YJANXX,P4,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,pearg_bc2017,9 ATGCTA,1 CATGAC,chn_alb_A002,A002,,
35281033,35281033,31-Oct-2017,Tom Schmidt,Aedes albopictus,Asian Tiger Mosquito,This individual has been extracted and sequenc...,25-Sep-2015,Dongjing Zhang,Larva or pupa,Larva or pupa,...,P4_CA5YJANXX,P4,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,pearg_bc2017,3 ACGTCA,2 TGCAGT,chn_alb_A003,A003,,
35281798,35281798,31-Oct-2017,Tom Schmidt,Aedes albopictus,Asian Tiger Mosquito,This individual has been extracted and sequenc...,25-Sep-2015,Dongjing Zhang,Larva or pupa,Larva or pupa,...,P4_CA5YJANXX,P4,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,pearg_bc2017,15 CGATAC,1 CATGAC,chn_alb_A004,A004,,
35282328,35282328,31-Oct-2017,Tom Schmidt,Aedes albopictus,Asian Tiger Mosquito,This individual has been extracted and sequenc...,25-Sep-2015,Dongjing Zhang,Larva or pupa,Larva or pupa,...,P4_CA5YJANXX,P4,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,pearg_bc2017,9 ATGCTA,2 TGCAGT,chn_alb_A005,A005,,


In [43]:
individuals_metadata_df.shape

(1082, 39)

In [44]:
collections.Counter([x["assay_name"] for x in assay_list])

Counter({'KDR_assay': 1900, 'wolbachia_assay': 200})

In [45]:
kdr_assay_metadata_df = pd.DataFrame([x for x in assay_list if x["assay_name"] == "KDR_assay"])
kdr_assay_metadata_df.head()

Unnamed: 0,asset_id,assay_name,date_current_update,resistance_mutation,resistance_genotype
0,35280960,KDR_assay,21-May-2018,V1023G,
1,35280960,KDR_assay,21-May-2018,F1565C,
2,35280960,KDR_assay,21-May-2018,S996P,
3,35510511,KDR_assay,15-Jan-2018,,
4,36614300,KDR_assay,21-May-2018,V1023G,TT


In [46]:
wolbachia_assay_metadata_df = pd.DataFrame([x for x in assay_list if x["assay_name"] == "wolbachia_assay"])
wolbachia_assay_metadata_df.head()

Unnamed: 0,asset_id,assay_name,date_current_update,wolbachia_infection_assay,wolbachia_strain,infection_status
0,35280960,wolbachia_assay,21-May-2018,Ronald's Lightcycler PCR assay for wMel; see L...,wMel,
1,35510511,wolbachia_assay,15-Jan-2018,Ronald's Lightcycler PCR assay for wMel; see L...,wMel,uninfected
2,35510512,wolbachia_assay,15-Jan-2018,Ronald's Lightcycler PCR assay for wMel; see L...,wMel,no result
3,35510513,wolbachia_assay,15-Jan-2018,Ronald's Lightcycler PCR assay for wMel; see L...,wMel,no result
4,35510514,wolbachia_assay,15-Jan-2018,Ronald's Lightcycler PCR assay for wMel; see L...,wMel,infected


-----

# Get library archive size from data dataframe

Combine the library dataframe and the data dataframe into a single dataframe.

In [47]:
data_df.tail()

Unnamed: 0,asset_id,asset_name,full_path,approx_size,size_in_bytes
38,37125593,QA_first_inc.tar,/projects/proj-hoffmann_data-1128.4.49/data/QA...,13.073 GB,13072970752
39,37125601,braz_indo_viet_OLD.tar,/projects/proj-hoffmann_data-1128.4.49/data/br...,25.512 GB,25511974400
40,38749551,brazil_K_OLD.tar,/projects/proj-hoffmann_data-1128.4.49/data/br...,30.162 GB,30162212352
41,39350581,yang_albo_2018.tar,/projects/proj-hoffmann_data-1128.4.49/data/ya...,41.582 GB,41582463732
42,39350622,tom_albo_2018.tar,/projects/proj-hoffmann_data-1128.4.49/data/to...,41.796 GB,41795581992


In [48]:
libraries_df.tail()

Unnamed: 0,asset_id,asset_name,full_path,bp_sequencing,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,...,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,size_select_high,size_select_low,stage_at
24,39084575,QA_library8_INC_TW_CY,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,21-Aug-2018,12-Apr-2017,HHGG7BBXX,"Quarantine individuals. From Australia:Perth, ...",...,QA_library8_INC_TW_CY,Stored on the Hoffmann mediaflux project: proj...,"Stored on the Hoffmann lab hard drive, Bio21 i...",Aedes aegypti aegypti,,MHTP,QACCGM0605,,,Sequencing completed
25,39084578,QA_library9_albo_INC_DPS_BKK,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,Asian Tiger Mosquito,21-Aug-2018,27-Aug-2017,HLKNTBBXX,"Quarantine individuals. From Australia:Perth, ...",...,QA_library9_albo_INC_DPS_BKK,Stored on the Hoffmann mediaflux project: proj...,"Stored on the Hoffmann lab hard drive, Bio21 i...",Aedes aegypti aegypti,Aedes albopictus,MHTP,QACCGM0651,,,Sequencing completed
26,39084580,QA_first_inc,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,21-Aug-2018,,,First library of quarantine individuals. Incur...,...,QA-gyp_S1_L001_R1_001.fastq | QA-gyp_S1_L001_R...,Stored on the Hoffmann mediaflux project: proj...,"Stored on the Hoffmann lab hard drive, Bio21 i...",Aedes aegypti aegypti,,other,,,,Sequencing completed
27,39350113,sngp_OLD1,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,31-Aug-2018,,,SNGP_OLD2 has the same individuals,...,SNGPlib_S1_L001_R1_001.fastq | SNGPlib_S1_L001...,Stored on the Hoffmann mediaflux project: proj...,"Stored on the Hoffmann lab hard drive, Bio21 i...",Aedes aegypti aegypti,,other,,,,Sequencing completed
28,39350115,sngp_OLD2,/projects/proj-hoffmann_data-1128.4.49/librari...,,Yellow Fever Mosquito,,31-Aug-2018,,,sngp_OLD1 has same individuals,...,SNGPlib_S1_L002_R1_001.fastq | SNGPlib_S1_L002...,Stored on the Hoffmann mediaflux project: proj...,"Stored on the Hoffmann lab hard drive, Bio21 i...",Aedes aegypti aegypti,,unsure,,,,Sequencing completed


In [49]:
# Copy data_df, prefixing every column whose name does not contain "size" with "data_".
tmp = data_df.rename(columns=lambda col: col if "size" in col else "data_{}".format(col))

In [50]:
# Derive the library name from the archive file name by dropping the archive
# extension. The previous pattern ".zip$" left ".tar" archives untouched, so
# they never matched a library when merging on asset_name, and its unescaped
# "." matched any character rather than a literal dot.
tmp["asset_name"] = [re.sub(r"\.(?:zip|tar|tar\.gz|tgz)$", "", x) for x in tmp["data_asset_name"]]

In [51]:
tmp.head()

Unnamed: 0,data_asset_id,data_asset_name,data_full_path,approx_size,size_in_bytes,asset_name
0,35512059,P4_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P4...,28.992 GB,28992107988,P4_CA5YJANXX
1,35512420,P5_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P5...,28.337 GB,28336907055,P5_CA5YJANXX
2,35515752,P6_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P6...,27.79 GB,27790225800,P6_CA5YJANXX
3,35516931,QA_library9_albo_INC_DPS_BKK.zip,/projects/proj-hoffmann_data-1128.4.49/data/QA...,30.134 GB,30133670228,QA_library9_albo_INC_DPS_BKK
4,35535007,CairnsL1.zip,/projects/proj-hoffmann_data-1128.4.49/data/Ca...,33.758 GB,33757722857,CairnsL1


In [52]:
# Merge with library_df
data_libraries_df = pd.merge(left=libraries_df, right=tmp, on="asset_name", how="outer")

-----

# Output to file

Write dataframes to files.

## Save to tsv

In [53]:
data_libraries_df.to_csv("output/data_libraries_{}.tsv".format(date), index=False, sep="\t", na_rep="NA")

In [54]:
individuals_metadata_df.to_csv("output/individuals_{}.tsv".format(date), index=False, sep="\t", na_rep="NA")

In [55]:
kdr_assay_metadata_df.to_csv("output/kdr_assays_{}.tsv".format(date), index=False, sep="\t", na_rep="NA")

In [56]:
wolbachia_assay_metadata_df.to_csv("output/wolbachia_assays_{}.tsv".format(date), index=False, sep="\t", na_rep="NA")

## Save to xlsx

In [57]:
writer = pd.ExcelWriter("output/mediaflux_output_{}.xlsx".format(date))

In [58]:
# sheet_name is passed by keyword: recent pandas deprecates positional arguments after the writer.
data_libraries_df.to_excel(writer, sheet_name="data_libraries", index=False, na_rep="NA")

In [59]:
# Missing values are written as "NA", matching the TSV exports and the data_libraries sheet.
individuals_metadata_df.to_excel(writer, sheet_name="individuals", index=False, na_rep="NA")

In [60]:
# Missing values are written as "NA", matching the TSV exports and the data_libraries sheet.
kdr_assay_metadata_df.to_excel(writer, sheet_name="kdr_assays", index=False, na_rep="NA")

In [61]:
# Missing values are written as "NA", matching the TSV exports and the data_libraries sheet.
wolbachia_assay_metadata_df.to_excel(writer, sheet_name="wolbachia_assays", index=False, na_rep="NA")

In [62]:
# close() writes the workbook and releases the file. The extra save() call
# wrote it twice on older pandas and no longer exists in pandas 2.x.
writer.close()

-----

# Save query cache

In [63]:
# Write the cache back to the file it is loaded from. `cache_filename` is
# defined in the cache-loading cell, so both cells share a single path
# definition instead of formatting it twice.
with open(cache_filename, "wb") as f:
    logging.info("Saving query cache.")
    pickle.dump(query_cache, f, pickle.HIGHEST_PROTOCOL)

-----

# Close connection to Mediaflux

In [64]:
logging.info("Closing connection to mediaflux.")
con.close()