# HERMES 3.0 PHENOTYPING

## Libraries

In [23]:
import dxpy
import subprocess
import os
import glob
import pandas as pd

## Project and record IDs

In [7]:
dispensed_dataset_id = dxpy.find_one_data_object(typename='Dataset', name='app*.dataset', folder='/', name_mode='glob')['id']
project_id = dxpy.find_one_project()["id"]
dataset = (':').join([project_id, dispensed_dataset_id])
print("Dataset id: " + dataset)

Dataset id: project-GvZyZ20J81vgPJGbJy8pgpyq:record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg


## Get data dictionaries

In [18]:
os.chdir("/opt/notebooks")
cmd = ["dx", "extract_dataset", dataset, "-ddd", "--delimiter", ","]
subprocess.check_call(cmd)

0

In [133]:
path = "/opt/notebooks"
data_dict_csv = glob.glob(os.path.join(path, "*.data_dictionary.csv"))[0]
data_dict_df = pd.read_csv(data_dict_csv, low_memory=False)

codes_to_extract = {
    "eid":               {"name": "eid",       "entity": "participant", "search": "matches"},
    "sex":               {"name": "p31",       "entity": "participant", "search": "matches"},
    "age":               {"name": "p21022",    "entity": "participant", "search": "matches"},
    "ethnicity":         {"name": "p21000",    "entity": "participant", "search": "matches"},
    "genetic_sex":       {"name": "p22001",    "entity": "participant", "search": "matches"},
    "genetic_ethnicity": {"name": "p22006",    "entity": "participant", "search": "matches"},
    "pc1":               {"name": "p22009_a1", "entity": "participant", "search": "matches"},
    "pc2":               {"name": "p22009_a2", "entity": "participant", "search": "matches"},
    "pc3":               {"name": "p22009_a3", "entity": "participant", "search": "matches"},
    "pc4":               {"name": "p22009_a4", "entity": "participant", "search": "matches"},
    "pc5":               {"name": "p22009_a5", "entity": "participant", "search": "matches"},
    "self_rep_ill":      {"name": "p20002",    "entity": "participant", "search": "startswith"},
    "self_rep_ill_year": {"name": "p20008",    "entity": "participant", "search": "startswith"},
    "self_rep_proc":     {"name": "p20004",    "entity": "participant", "search": "startswith"},
    "self_rep_proc_year":{"name": "p20010",    "entity": "participant", "search": "startswith"},
}

filtered_df = pd.DataFrame()

for desc, criteria in codes_to_extract.items():
    df = data_dict_df[data_dict_df["entity"] == criteria["entity"]]
    if criteria["search"] == "matches":
        filtered_df = pd.concat([filtered_df, df[df["name"] == criteria["name"]]])
    elif criteria["search"] == "startswith":
        filtered_df = pd.concat([filtered_df, df[df["name"].str.startswith(criteria["name"])]])
        
filtered_df = filtered_df.reset_index(drop=True)
filtered_df["name"].to_csv("hermes_field_names.txt", index=False, header=False)

fields_file = "hermes_field_names.txt"
cmd = ["dx", "upload", fields_file, "--path", "/"]
try:
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    print("Command Output:")
    print(output.decode())
except subprocess.CalledProcessError as e:
    print(f"Command failed with error code {e.returncode}")
    print(f"Command output: {e.output.decode()}")
    print(f"Command error: {e.stderr.decode()}")

Command Output:
ID                                file-Gz5Jv5jJ81vv928Xy0VvVX5G
Class                             file
Project                           project-GvZyZ20J81vgPJGbJy8pgpyq
Folder                            /
Name                              hermes_field_names.txt
State                             closing
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Fri Mar  7 11:52:55 2025
Created by                        nicholas.sunderland
 via the job                      job-Gz5FZfQJ81vzvj1p12Y171y9
Last modified                     Fri Mar  7 11:52:55 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"



In [136]:
cmd = ["dx", "run", "app-table-exporter", 
       "-idataset_or_cohort_or_dashboard=" + dataset, 
       "-ioutput=data", 
       "-ioutput_format=TSV",
       "-icoding_option=REPLACE",
       "-iheader_style=FIELD-NAME",
       "-ientity=participant",
       " ".join([f"-ifield_names={field}" for field in filtered_df["name"]])]
try:
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    print("Command Output:")
    print(output.decode())
except subprocess.CalledProcessError as e:
    print(f"Command failed with error code {e.returncode}")
    print(f"Command output: {e.output.decode()}")
    print(f"Command error: {e.stderr.decode()}")

Command Output:

Using input JSON:
{
    "output": "data",
    "output_format": "TSV",
    "coding_option": "REPLACE",
    "header_style": "FIELD-NAME",
    "entity": "participant",
    "field_names": [
        "eid -ifield_names=p31 -ifield_names=p21022 -ifield_names=p22001 -ifield_names=p22006 -ifield_names=p22009_a1 -ifield_names=p22009_a2 -ifield_names=p22009_a3 -ifield_names=p22009_a4 -ifield_names=p22009_a5 -ifield_names=p20002_i0_a0 -ifield_names=p20002_i0_a1 -ifield_names=p20002_i0_a2 -ifield_names=p20002_i0_a3 -ifield_names=p20002_i0_a4 -ifield_names=p20002_i0_a5 -ifield_names=p20002_i0_a6 -ifield_names=p20002_i0_a7 -ifield_names=p20002_i0_a8 -ifield_names=p20002_i0_a9 -ifield_names=p20002_i0_a10 -ifield_names=p20002_i0_a11 -ifield_names=p20002_i0_a12 -ifield_names=p20002_i0_a13 -ifield_names=p20002_i0_a14 -ifield_names=p20002_i0_a15 -ifield_names=p20002_i0_a16 -ifield_names=p20002_i0_a17 -ifield_names=p20002_i0_a18 -ifield_names=p20002_i0_a19 -ifield_names=p20002_i0_a20 -ifie

In [137]:
import dxpy
import pyspark

config = pyspark.SparkConf().setAll([('spark.kryoserializer.buffer.max', '128'),('spark.sql.execution.arrow.pyspark.enabled','true')])  
sc = pyspark.SparkContext(conf=config)
spark = pyspark.sql.SparkSession(sc)

RuntimeError: Java gateway process exited before sending its port number

Command Output:

Using input JSON:
{
    "dataset_or_cohort_or_dashboard": {
        "$dnanexus_link": {
            "project": "project-GvZyZ20J81vgPJGbJy8pgpyq",
            "id": "record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg"
        }
    }
}

Calling app-GyY66z89pP7JGx68BX8GB0Xp with output destination
  project-GvZyZ20J81vgPJGbJy8pgpyq:/

Job ID: job-Gz5JjbjJ81vvGG55Kv4g04xY

