# HERMES 3.0 PHENOTYPING

## Libraries

In [23]:
import dxpy
import subprocess
import os
import glob
import pandas as pd

## Project and record IDs

In [7]:
dispensed_dataset_id = dxpy.find_one_data_object(typename='Dataset', name='app*.dataset', folder='/', name_mode='glob')['id']
project_id = dxpy.find_one_project()["id"]
dataset = (':').join([project_id, dispensed_dataset_id])
print("Dataset id: " + dataset)

Dataset id: project-GvZyZ20J81vgPJGbJy8pgpyq:record-Gvb0Bg0Jfxfv0q8Fb2pXqKjg


## Get data dictionaries

In [18]:
os.chdir("/opt/notebooks")
cmd = ["dx", "extract_dataset", dataset, "-ddd", "--delimiter", ","]
subprocess.check_call(cmd)

0

In [92]:
path = "/opt/notebooks"
data_dict_csv = glob.glob(os.path.join(path, "*.data_dictionary.csv"))[0]
data_dict_df = pd.read_csv(data_dict_csv)

codes_to_extract = {
    "eid":               {"name": "eid",       "entity": "participant", "search": "matches"},
    "sex":               {"name": "p31",       "entity": "participant", "search": "matches"},
    "age":               {"name": "p21022",    "entity": "participant", "search": "matches"},
    "ethnicity":         {"name": "p21000",    "entity": "participant", "search": "matches"},
    "genetic_sex":       {"name": "p22001",    "entity": "participant", "search": "matches"},
    "genetic_ethnicity": {"name": "p22006",    "entity": "participant", "search": "matches"},
    "pc1":               {"name": "p22009_a1", "entity": "participant", "search": "matches"},
    "pc2":               {"name": "p22009_a2", "entity": "participant", "search": "matches"},
    "pc3":               {"name": "p22009_a3", "entity": "participant", "search": "matches"},
    "pc4":               {"name": "p22009_a4", "entity": "participant", "search": "matches"},
    "pc5":               {"name": "p22009_a5", "entity": "participant", "search": "matches"},
    "self_rep_ill":      {"name": "p20002",    "entity": "participant", "search": "startswith"},
    "self_rep_ill_year": {"name": "p20008",    "entity": "participant", "search": "startswith"},
    "self_rep_proc":     {"name": "p20004",    "entity": "participant", "search": "startswith"},
    "self_rep_proc_year":{"name": "p20010",    "entity": "participant", "search": "startswith"},
}

filtered_df = pd.DataFrame()

for desc, criteria in codes_to_extract.items():
    df = data_dict_df[data_dict_df["entity"] == criteria["entity"]]
    if criteria["search"] == "matches":
        filtered_df = pd.concat([filtered_df, df[df["name"] == criteria["name"]]])
    elif criteria["search"] == "startswith":
        filtered_df = pd.concat([filtered_df, df[df["name"].str.startswith(criteria["name"])]])
        
filtered_df = filtered_df.reset_index(drop=True)
filtered_df["name"].to_csv("field_name.txt", index=False, header=False)


cmd = "dx run table-exporter -dataset_or_cohort_or_dashboard=" + dataset +
      "-ientity=participant " +
      "-ioutput_format=TSV " +
      "-ifield_names=p31 " +
      "--destination data_participant.tsv"
subprocess.check_output(cmd,
try:
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    print("Command Output:")
    print(output.decode())
except subprocess.CalledProcessError as e:
    print(f"Command failed with error code {e.returncode}")
    print(f"Command output: {e.output.decode()}")
    print(f"Command error: {e.stderr.decode()}")

  data_dict_df = pd.read_csv(data_dict_csv)


Command failed with error code 3
Command output: dxpy.exceptions.DXCLIError: Some inputs (dataset_or_cohort_or_dashboard) are missing, and interactive mode is not available



AttributeError: 'NoneType' object has no attribute 'decode'

In [None]:
import dxpy
import pyspark

config = pyspark.SparkConf().setAll([('spark.kryoserializer.buffer.max', '128'),('spark.sql.execution.arrow.pyspark.enabled','true')])  
sc = pyspark.SparkContext(conf=config)
spark = pyspark.sql.SparkSession(sc)