<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Mediaflux-query-dump" data-toc-modified-id="Mediaflux-query-dump-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Mediaflux query dump</a></span></li><li><span><a href="#Connect-to-MF-server" data-toc-modified-id="Connect-to-MF-server-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Connect to MF server</a></span></li><li><span><a href="#Find-assets-in-Mediaflux" data-toc-modified-id="Find-assets-in-Mediaflux-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Find assets in Mediaflux</a></span></li><li><span><a href="#For-each-asset,-get-metadata" data-toc-modified-id="For-each-asset,-get-metadata-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>For each asset, get metadata</a></span><ul class="toc-item"><li><span><a href="#Data" data-toc-modified-id="Data-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Data</a></span></li><li><span><a href="#Libraries" data-toc-modified-id="Libraries-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Individuals" data-toc-modified-id="Individuals-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Individuals</a></span></li></ul></li><li><span><a href="#Get-library-archive-size-from-data-dataframe" data-toc-modified-id="Get-library-archive-size-from-data-dataframe-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Get library archive size from data dataframe</a></span></li><li><span><a href="#Output-to-file" data-toc-modified-id="Output-to-file-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Output to file</a></span><ul class="toc-item"><li><span><a href="#Save-to-tsv" data-toc-modified-id="Save-to-tsv-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Save to tsv</a></span></li><li><span><a href="#Save-to-xlsx" data-toc-modified-id="Save-to-xlsx-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Save to xlsx</a></span></li></ul></li><li><span><a href="#Save-query-cache" 
data-toc-modified-id="Save-query-cache-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Save query cache</a></span></li><li><span><a href="#Close-connection-to-Mediaflux" data-toc-modified-id="Close-connection-to-Mediaflux-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Close connection to Mediaflux</a></span></li></ul></div>

In [1]:
# Run date; it is embedded in the log, cache, and output file names used below.
date = "2018-05-30"
# When True, a previously saved query cache file for this date is reused if it exists.
load_cache = True

-----

# Mediaflux query dump

Get all assets in the `proj-hoffmann_data-1128.4.49` project. Extract all metadata into dataframes and output to files.


In [2]:
import sys
import logging
from Crypto.Cipher import AES
import pandas as pd
import random
import re
import os
import datetime
import time
import collections
import pickle

In [3]:
sys.path.insert(0, 'python-mfclient/src')
import mfclient

In [4]:
# Set up logging: DEBUG and above go to a per-date log file, appended across runs
log_filename = "mf_{}.log".format(date)
logging.basicConfig(
    filename=log_filename,
    filemode="a",
    level=logging.DEBUG,
    datefmt='%m-%d-%Y %H:%M:%S',
    format='%(asctime)s %(levelname)s - %(message)s')

In [5]:
# Cache of Mediaflux `asset.get` results, keyed by asset ID (filled by get_asset).
# A cache file from an earlier run on the same date is reused when load_cache is set.
cache_filename = "cache_{}.pkl".format(date)
if load_cache and os.path.exists(cache_filename):
    logging.info("Loading query cache from file: {}".format(cache_filename))
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    # The `with` block closes the file handle, which the bare open() call leaked.
    with open(cache_filename, "rb") as cache_file:
        query_cache = pickle.load(cache_file)
else:
    query_cache = {}

In [6]:
def get_asset(asset_id):
    """Return the Mediaflux ``asset.get`` result for ``asset_id``.

    Results are memoised in the module-level ``query_cache`` dict: a cached
    entry is returned as-is, otherwise the server is queried through the
    module-level connection ``con`` and the reply is stored before returning.

    Parameters
    ----------
    asset_id
        Asset identifier; used both as the cache key and as the ``id``
        argument of the ``asset.get`` call.

    Returns
    -------
    The object returned by ``con.execute("asset.get", ...)``.
    """
    if asset_id not in query_cache:
        logging.info("Getting asset info for {}.".format(asset_id))
        request = mfclient.XmlStringWriter("args")
        request.add("id", asset_id)
        fetched = con.execute("asset.get", request.doc_text())
        query_cache[asset_id] = fetched
        return fetched

    logging.info("Retrieving {} from cache".format(asset_id))
    return query_cache[asset_id]

-----

# Connect to MF server

In [7]:
# Load the AES key and initialisation vector from the local keys/ directory and
# build the CFB-mode cipher object that is used below to decrypt the password.
with open("keys/key") as key_file, open("keys/iv") as iv_file:
    key = key_file.read().strip()
    iv = iv_file.read().strip()
obj = AES.new(key, AES.MODE_CFB, iv)

In [8]:
# Read the encrypted Mediaflux password; it is decrypted in the next cell.
# NOTE(review): this absolute path is specific to one machine/user.
with open("/Users/jess/.ssh/encrypted_pw.txt") as pw_file:
    encrypted_pw = pw_file.read()
pw = encrypted_pw.strip()

In [9]:
# Connection settings passed to mfclient.MFConnection in the next cell.
MF_HOST = "mediaflux.vicnode.org.au"
MF_PORT = 443
MF_TRANSPORT = "https"
MF_DOMAIN = "aaf"
MF_USER = "unimelb:jessicac"
# Decrypt the stored password with the AES cipher object created above.
MF_PASSWORD = obj.decrypt(pw)

In [10]:
# Build the connection object from the MF_* settings; the session itself is
# opened in the next cell with con.open().
con = mfclient.MFConnection(
    host=MF_HOST, port=MF_PORT, transport=MF_TRANSPORT,
    domain=MF_DOMAIN, user=MF_USER, password=MF_PASSWORD,
)

In [11]:
# Open the session, then ask the server for its version; the reply is
# displayed in the next cell.
logging.info("Connecting to mediaflux.")
con.open()
result = con.execute("server.version")

In [12]:
# Show the server.version reply as an XML string
result.tostring()

'<result><ant-version>Apache Ant 1.9.4</ant-version><binary>aserver</binary><build-time>08-Mar-2018 15:23:48 AEDT</build-time><built-by>Arcitecta. Pty. Ltd.</built-by><created-by>1.8.0_111-b14 (Oracle Corporation)</created-by><manifest-version>1.0</manifest-version><target-jvm>1.7</target-jvm><vendor>Arcitecta Pty. Ltd.</vendor><version>4.6.043</version></result>'

-----

# Find assets in Mediaflux

Find the assets under `data/`, `individuals/`, and `libraries/` on the Mediaflux server and record each asset's ID and path.

In [13]:
# Query for data directory: ask for the path of every asset in the data namespace
args = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
        ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/data"),
        ("action", "get-path"),
        ("size", "infinity")]:
    args.add(arg_name, arg_value)
data_query = con.execute("asset.query", args.doc_text())

In [14]:
# Query for individuals directory: ask for the path of every asset in the individuals namespace
args = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
        ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/individuals"),
        ("action", "get-path"),
        ("size", "infinity")]:
    args.add(arg_name, arg_value)
individuals_query = con.execute("asset.query", args.doc_text())

In [15]:
# Query for libraries directory: ask for the path of every asset in the libraries namespace
args = mfclient.XmlStringWriter("args")
for arg_name, arg_value in [
        ("where", "namespace=/projects/proj-hoffmann_data-1128.4.49/libraries"),
        ("action", "get-path"),
        ("size", "infinity")]:
    args.add(arg_name, arg_value)
libraries_query = con.execute("asset.query", args.doc_text())

In [16]:
# Preview the first 1000 characters of the raw libraries query result
libraries_query.tostring()[:1000]

'<result><path id="35280914" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P4_CA5YJANXX</path><path id="35280919" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P5_CA5YJANXX</path><path id="35280920" version="7">/projects/proj-hoffmann_data-1128.4.49/libraries/P6_CA5YJANXX</path><path id="35547226" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL1</path><path id="35547227" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL2</path><path id="35547228" version="2">/projects/proj-hoffmann_data-1128.4.49/libraries/CairnsL3</path><path id="36551235" version="4">/projects/proj-hoffmann_data-1128.4.49/libraries/ASH_LIB_02_BRA</path><path id="36551336" version="3">/projects/proj-hoffmann_data-1128.4.49/libraries/ASH_LIB_01_QLD</path><path id="36609035" version="3">/projects/proj-hoffmann_data-1128.4.49/libraries/RLEM_SA_2018</path><path id="36609036" version="1">/projects/proj-hoffmann_data-1128.4.49/libraries/tom_albo_2018</pat

In [17]:
def create_asset_id_dataframe(query):
    """Turn an ``asset.query`` (``get-path``) result into a DataFrame.

    Parameters
    ----------
    query
        Query result exposing ``elements("path")``; each returned element
        provides its path via ``value()`` and its asset ID via ``value("@id")``.

    Returns
    -------
    pandas.DataFrame
        One row per ``path`` element with the columns ``asset_id``,
        ``asset_name`` (the last component of the path) and ``full_path``.
        A result without any ``path`` elements gives an empty frame that still
        has these three columns.
    """
    columns = ["asset_id", "asset_name", "full_path"]
    rows = []
    # `or []` also covers a client that reports "no matches" as None
    # (NOTE(review): confirm against mfclient's XmlElement.elements).
    for path_element in query.elements("path") or []:
        path = path_element.value()
        asset_id = path_element.value("@id")
        rows.append([asset_id, os.path.basename(path), path])
    # Passing the column names to the constructor keeps the empty case valid;
    # assigning three names to a zero-column frame afterwards raises ValueError.
    return pd.DataFrame(rows, columns=columns)

In [18]:
# One asset-ID frame per namespace, built in the same order as the queries above
data_df, individuals_df, libraries_df = (
    create_asset_id_dataframe(query_result)
    for query_result in (data_query, individuals_query, libraries_query)
)

In [19]:
# Print (rows, columns) for the data, individuals and libraries frames, in that order
for frame in (data_df, individuals_df, libraries_df):
    print(frame.shape)

(14, 3)
(619, 3)
(11, 3)


-----

# For each asset, get metadata



## Data

Get size of each asset in `data/`

In [20]:
# IDs of the assets in data/, in frame order
asset_id_list = list(data_df.loc[:, "asset_id"])

In [21]:
# Collect two renderings of each data asset's content size, aligned with asset_id_list
human_size, size = [], []
for asset_id in asset_id_list:
    query = get_asset(asset_id)
    size_element = query.element("asset/content/size")
    human_size.append(size_element.attribute("h"))  # readable form, e.g. "28.992 GB"
    size.append(size_element.value())               # the element's own value (size in bytes)

In [22]:
# Attach the collected sizes as two new columns
data_df = data_df.assign(approx_size=human_size, size_in_bytes=size)

In [23]:
# Preview the data frame with its new size columns
data_df.head()

Unnamed: 0,asset_id,asset_name,full_path,approx_size,size_in_bytes
0,35512059,P4_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P4...,28.992 GB,28992107988
1,35512420,P5_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P5...,28.337 GB,28336907055
2,35515752,P6_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P6...,27.79 GB,27790225800
3,35516931,QA_library9_albo_INC_DPS_BKK.zip,/projects/proj-hoffmann_data-1128.4.49/data/QA...,30.134 GB,30133670228
4,35535007,CairnsL1.zip,/projects/proj-hoffmann_data-1128.4.49/data/Ca...,33.758 GB,33757722857


## Libraries

Get metadata from all assets in `libraries/`

In [24]:
# IDs of the assets in libraries/, in frame order
asset_id_list = list(libraries_df.loc[:, "asset_id"])

In [25]:
# Query one asset: fetch the first library asset to inspect its metadata layout
first_asset_id = asset_id_list[0]
query = get_asset(first_asset_id)

In [26]:
# Get metadata documents: names of the children of asset/meta, minus the revision history
metadata_documents = []
for meta_element in query.element("asset/meta").elements():
    if meta_element.name() != "mf-revision-history":
        metadata_documents.append(meta_element.name())
metadata_documents

['proj-hoffmann_data-1128.4.49:libraries_metadata',
 'proj-hoffmann_data-1128.4.49:libraries_storage']

In [27]:
# # Test asset with multiple metadata documents from MPP project
# args = mfclient.XmlStringWriter("args")
# args.add("id", "35532748")
# query = con.execute("asset.get", args.doc_text())
# query.tostring()
# metadata_documents = [x.name() for x in query.element("asset/meta").elements() if x.name() != "mf-revision-history"]
# metadata_documents

In [28]:
# Build one flat record per library asset: asset_id first, then every field of
# every metadata document attached to the asset.
libraries_metadata_dicts = {}
for asset_id in asset_id_list:
    d = collections.OrderedDict()
    d["asset_id"] = asset_id
    query = get_asset(asset_id)
    metadata_documents = [
        name
        for name in (x.name() for x in query.element("asset/meta").elements())
        if name != "mf-revision-history"
    ]
    # Not expecting any metadata document to appear multiple times
    assert len(set(metadata_documents)) == len(metadata_documents)
    for document in metadata_documents:
        document_elements = query.element("asset/meta/{}".format(document)).elements()
        # NOTE(review): a field name that occurs in more than one document (or
        # twice in one document) keeps only the last value seen.
        d.update((element.name(), element.value()) for element in document_elements)
    libraries_metadata_dicts[asset_id] = d

In [29]:
# The dict-of-dicts gives one column per asset; transpose so each asset is a row
libraries_metadata_df = pd.DataFrame(libraries_metadata_dicts).transpose()
libraries_metadata_df.head(5)

Unnamed: 0,asset_id,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,library_freezer,mol_type,number_individuals,...,radseq_library_alias,radseq_library_name,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,stage_at
35280914,35280914,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,genomic DNA,54,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
35280919,35280919,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,genomic DNA,50,...,P5,P5_CA5YJANXX,P5_CA5YJANXX_NoIndex_L003_R1.fastq.gz|P5_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
35280920,35280920,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,genomic DNA,48,...,P6,P6_CA5YJANXX,P6_CA5YJANXX_NoIndex_L004_R1.fastq.gz|P6_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
35547226,35547226,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,genomic DNA,61,...,,cairnsL1,cairnsL1_R1_001.fastq.gz | cairnsL1_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22",Aedes aegypti aegypti,,AGRF,CAGRF11131,Sequencing completed
35547227,35547227,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,genomic DNA,60,...,,cairnsL2,cairnsL2_R1_001.fastq.gz | cairnsL2_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,CAGRF11131,Sequencing completed


In [30]:
# Merge libraries_df with metadata (default inner join on asset_id).
# Re-running this cell merges again, because libraries_df is overwritten.
libraries_df = pd.merge(libraries_df, libraries_metadata_df, on="asset_id")

In [31]:
# Preview the libraries frame after the metadata merge
libraries_df.head()

Unnamed: 0,asset_id,asset_name,full_path,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,library_freezer,...,radseq_library_alias,radseq_library_name,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,stage_at
0,35280914,P4_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
1,35280919,P5_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,...,P5,P5_CA5YJANXX,P5_CA5YJANXX_NoIndex_L003_R1.fastq.gz|P5_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
2,35280920,P6_CA5YJANXX,/projects/proj-hoffmann_data-1128.4.49/librari...,Asian Tiger Mosquito,,31-Oct-2017,14-Oct-2016,CA5YJANXX,Individuals in this library were collected as ...,Library not retained,...,P6,P6_CA5YJANXX,P6_CA5YJANXX_NoIndex_L004_R1.fastq.gz|P6_CA5YJ...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22.",Aedes albopictus,,AGRF,CAGRF13590,Sequencing completed
3,35547226,CairnsL1,/projects/proj-hoffmann_data-1128.4.49/librari...,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,...,,cairnsL1,cairnsL1_R1_001.fastq.gz | cairnsL1_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,"Stored on the Hoffmann Lab hard drive, G22",Aedes aegypti aegypti,,AGRF,CAGRF11131,Sequencing completed
4,35547227,CairnsL2,/projects/proj-hoffmann_data-1128.4.49/librari...,Yellow Fever Mosquito,,16-Feb-2018,,C7NAAANXX,Individuals in this library were collected as ...,Library not retained,...,,cairnsL2,cairnsL2_R1_001.fastq.gz | cairnsL2_R2_001.fas...,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,CAGRF11131,Sequencing completed


## Individuals

Get metadata from all assets in `individuals/`

In [32]:
# IDs of the assets in individuals/, in frame order
asset_id_list = list(individuals_df.loc[:, "asset_id"])

In [33]:
# Query one asset: fetch the first individuals asset to inspect its metadata layout
first_asset_id = asset_id_list[0]
query = get_asset(first_asset_id)

In [34]:
# Get metadata documents: names of the children of asset/meta, minus the revision history
metadata_documents = []
for meta_element in query.element("asset/meta").elements():
    if meta_element.name() != "mf-revision-history":
        metadata_documents.append(meta_element.name())
metadata_documents

['proj-hoffmann_data-1128.4.49:individuals_metadata',
 'proj-hoffmann_data-1128.4.49:individuals_storage',
 'proj-hoffmann_data-1128.4.49:individuals_locations',
 'proj-hoffmann_data-1128.4.49:individuals_store-and-retrieve',
 'proj-hoffmann_data-1128.4.49:individuals_name',
 'proj-hoffmann_data-1128.4.49:individuals_assays']

In [35]:
# Build one flat record per individuals asset: asset_id first, then every field
# of every metadata document attached to the asset.
individuals_metadata_dicts = {}
for asset_id in asset_id_list:
    d = collections.OrderedDict()
    d["asset_id"] = asset_id
    query = get_asset(asset_id)
    metadata_documents = [
        name
        for name in (x.name() for x in query.element("asset/meta").elements())
        if name != "mf-revision-history"
    ]
    # Not expecting any metadata document to appear multiple times
    assert len(set(metadata_documents)) == len(metadata_documents)
    for document in metadata_documents:
        document_elements = query.element("asset/meta/{}".format(document)).elements()
        # NOTE(review): a field name that occurs in more than one document (or
        # twice in one document) keeps only the last value seen.
        d.update((element.name(), element.value()) for element in document_elements)
    individuals_metadata_dicts[asset_id] = d

In [36]:
# The dict-of-dicts gives one column per asset; transpose so each asset is a row
individuals_metadata_df = pd.DataFrame(individuals_metadata_dicts).transpose()
individuals_metadata_df.head(5)

Unnamed: 0,KDR_assay,asset_id,barcode_reference_list,common_name_species,country,date_collection,date_current_update,dev_stage_collected,dev_stage_stored,dna_tube1_conc,...,radseq_library_alias,radseq_library_name,raw_sequence_filename,sampling_scheme,sampling_type,scientific_name_species,sex,specific_location,subregion,wolbachia_assay
35280960,,35280960,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,14.6,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,Guangzhou City,
35281030,,35281030,pearg_bc2017,Asian Tiger Mosquito,China,25-Oct-2015,31-Oct-2017,Larva or pupa,Larva or pupa,19.95,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,Guangzhou City,
35281033,,35281033,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,14.69,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,Guangzhou City,
35281798,,35281798,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,25.54,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,Guangzhou City,
35282328,,35282328,pearg_bc2017,Asian Tiger Mosquito,China,25-Sep-2015,31-Oct-2017,Larva or pupa,Larva or pupa,17.55,...,P4,P4_CA5YJANXX,P4_CA5YJANXX_NoIndex_L002_R1.fastq.gz|P4_CA5YJ...,Collected from natural containers in Guangzhou...,field,Aedes albopictus,female,Jiuwei Village,Guangzhou City,


In [37]:
# (rows, columns) of the individuals metadata frame
individuals_metadata_df.shape

(619, 40)

-----

# Get library archive size from data dataframe

Combine the libraries dataframe and the data dataframe into a single dataframe, so that each library row carries the size of its archive.

In [38]:
# Last rows of the data frame (archive names and sizes)
data_df.tail()

Unnamed: 0,asset_id,asset_name,full_path,approx_size,size_in_bytes
9,36550162,Gv13_1_TTAGGC.zip,/projects/proj-hoffmann_data-1128.4.49/data/Gv...,52.951 GB,52950524835
10,36550166,Gv13_2_TGACCA.zip,/projects/proj-hoffmann_data-1128.4.49/data/Gv...,39.388 GB,39387562304
11,36550499,ASH_LIB_02_BRA.zip,/projects/proj-hoffmann_data-1128.4.49/data/AS...,26.429 GB,26428874424
12,36551436,ASH_LIB_01_QLD.zip,/projects/proj-hoffmann_data-1128.4.49/data/AS...,37.124 GB,37124444681
13,36621405,pearg_bc2017.zip,/projects/proj-hoffmann_data-1128.4.49/data/pe...,672 bytes,672


In [39]:
# Last rows of the libraries frame, for comparison with the data frame above
libraries_df.tail()

Unnamed: 0,asset_id,asset_name,full_path,common_name_species1,common_name_species2,date_current_update,date_sequencing_completion,flowcell_id,library_description,library_freezer,...,radseq_library_alias,radseq_library_name,raw_sequence_filename,raw_sequence_storage1,raw_sequence_storage2,scientific_name_species1,scientific_name_species2,sequencing_facility,sequencing_quote_number,stage_at
6,36551235,ASH_LIB_02_BRA,/projects/proj-hoffmann_data-1128.4.49/librari...,Yellow Fever Mosquito,,23-Mar-2018,01-Jan-0001,,This will be known by Marion Shadbolt. Abdomen...,Library not retained,...,ASH_LIB_02_Brazil,ASH_LIB_02_BRA,Lib_2_C9HBTANXX_NoIndex_L007_R1.fastq.gz | Lib...,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,,Sequencing completed
7,36551336,ASH_LIB_01_QLD,/projects/proj-hoffmann_data-1128.4.49/librari...,Yellow Fever Mosquito,,23-Mar-2018,,,Individuals in this library were collected to ...,Library not retained,...,ASH_LIB_01_Queenslandensis,ASH_LIB_01_QLD,queenslandensis_C9J2FANXX_NoIndex_L007_R1_001....,Stored on the Hoffmann Mediaflux Project: proj...,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,,AGRF,,Sequencing completed
8,36609035,RLEM_SA_2018,/projects/proj-hoffmann_data-1128.4.49/librari...,Redlegged Earth Mite,,17-May-2018,,,resistant mites in South Australia,Library not retained,...,,RLEM_SA_2018,RLEM_SA_2018,Stored as data in the Hoffmann project mediafl...,Stored on the Hoffmann lab hard drive,Halotydeus destructor,,MHTP,QACCGM0764,Sent for sequencing
9,36609036,tom_albo_2018,/projects/proj-hoffmann_data-1128.4.49/librari...,Asian Tiger Mosquito,,17-May-2018,,,Part of a project to investigate genetic struc...,Library not retained,...,,tom_albo_2018,tom_albo_2018,"Stored as DATA in the Hoffmann project, mediaflux",Stored on the Hoffman lab hard drive,Aedes albopictus,,MHTP,QACCGM0764,Sequencing completed
10,36609037,QA_Library10,/projects/proj-hoffmann_data-1128.4.49/librari...,Yellow Fever Mosquito,Asian Tiger Mosquito,17-May-2018,,,Library part of the mosquito incursion biosecu...,Library not retained,...,QA_library10_PacificIs_INC,QA_library10,QA_library10_PacificIs_INC,Stored on Hoffmann Mediaflux Project,Stored on the Hoffmann Lab hard drive,Aedes aegypti aegypti,Aedes albopictus,MHTP,QACCGM0756,Sent for sequencing


In [40]:
# Copy data_df and rename columns: every column whose name does not contain
# "size" gets a "data_" prefix, so it cannot clash with libraries_df columns.
tmp = data_df.rename(
    columns=lambda column: column if re.search("size", column) else "data_{}".format(column)
)

In [41]:
# Derive the library name from the archive name by dropping a trailing ".zip".
# The dot is escaped so that only a literal ".zip" suffix is removed; the
# unescaped pattern ".zip$" also strips any other character followed by "zip".
tmp["asset_name"] = [re.sub(r"\.zip$", "", x) for x in tmp["data_asset_name"]]

In [42]:
# Preview the renamed copy with the derived asset_name join key
tmp.head()

Unnamed: 0,data_asset_id,data_asset_name,data_full_path,approx_size,size_in_bytes,asset_name
0,35512059,P4_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P4...,28.992 GB,28992107988,P4_CA5YJANXX
1,35512420,P5_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P5...,28.337 GB,28336907055,P5_CA5YJANXX
2,35515752,P6_CA5YJANXX.zip,/projects/proj-hoffmann_data-1128.4.49/data/P6...,27.79 GB,27790225800,P6_CA5YJANXX
3,35516931,QA_library9_albo_INC_DPS_BKK.zip,/projects/proj-hoffmann_data-1128.4.49/data/QA...,30.134 GB,30133670228,QA_library9_albo_INC_DPS_BKK
4,35535007,CairnsL1.zip,/projects/proj-hoffmann_data-1128.4.49/data/Ca...,33.758 GB,33757722857,CairnsL1


In [43]:
# Merge with library_df: the outer join keeps libraries without an archive and
# archives without a library; asset_name must match exactly for rows to pair up.
data_libraries_df = libraries_df.merge(tmp, on="asset_name", how="outer")

-----

# Output to file

Write dataframes to files.

## Save to tsv

In [44]:
# Create the output directory on first use so that a fresh checkout can run
# end-to-end; without it the write below fails when "output" does not exist.
if not os.path.isdir("output"):
    os.makedirs("output")
# Tab-separated export; missing values are written as "NA".
data_libraries_df.to_csv("output/data_libraries_{}.tsv".format(date), index=False, sep="\t", na_rep="NA")

In [45]:
# Tab-separated export of the individuals metadata; missing values are written as "NA"
individuals_tsv_path = "output/individuals_{}.tsv".format(date)
individuals_metadata_df.to_csv(individuals_tsv_path, sep="\t", na_rep="NA", index=False)

## Save to xlsx

In [46]:
# One workbook per run date; the sheets are added in the following cells
xlsx_path = "output/mediaflux_output_{}.xlsx".format(date)
writer = pd.ExcelWriter(xlsx_path)

In [47]:
# Write the combined data/libraries frame to its own sheet. sheet_name is passed
# by keyword because newer pandas releases deprecate passing it positionally.
data_libraries_df.to_excel(writer, sheet_name="data_libraries", index=False, na_rep="NA")

In [48]:
# Write the individuals metadata to its own sheet. sheet_name is passed by
# keyword because newer pandas releases deprecate passing it positionally.
# na_rep="NA" matches the TSV export of this frame and the data_libraries
# sheet, which both write missing values as "NA".
individuals_metadata_df.to_excel(writer, sheet_name="individuals", index=False, na_rep="NA")

In [49]:
# close() saves the workbook and releases the file handle. A separate
# writer.save() call is redundant, and ExcelWriter.save() no longer exists in
# pandas 2.0+, where calling it raises AttributeError.
writer.close()

-----

# Save query cache

In [50]:
# Persist every asset.get result gathered in this run so that a later run on
# the same date can load it instead of querying the server again.
cache_path = "cache_{}.pkl".format(date)
with open(cache_path, "wb") as cache_file:
    logging.info("Saving query cache.")
    pickle.dump(query_cache, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

-----

# Close connection to Mediaflux

In [51]:
# Log the shutdown, then end the Mediaflux session opened earlier with con.open().
logging.info("Closing connection to mediaflux.")
con.close()