# setup

## import python modules

In [1]:
import pandas as pd
import os

## define workspace bucket

In [2]:
# Cloud Storage bucket attached to this workspace (None when WORKSPACE_BUCKET is unset).
bucket = os.environ.get('WORKSPACE_BUCKET')
bucket

'gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc'

In [3]:
os.getenv('GOOGLE_PROJECT')

'terra-vpc-sc-f6a2ddef'

## set up dsub

In [4]:
%%writefile ~/aou_dsub.bash

#!/bin/bash

# This shell function passes reasonable defaults for several dsub parameters, while
# allowing the caller to override any of them. It creates a nice folder structure within
# the workspace bucket for dsub log files.

# --[ Parameters ]--
# any valid dsub parameter flag

#--[ Returns ]--
# the job id of the job created by dsub

#--[ Details ]--
# The --provider, --user-project, --project, --network, --subnetwork, and --service-account values below should always be used as-is when running on AoU RWB.

# Feel free to change the values for --user, --regions, --logging, and --image if you like.

# Note that we insert some job data into the logging path.
# https://github.com/DataBiosphere/dsub/blob/main/docs/logging.md#inserting-job-data

function aou_dsub () {

  # Keep only the part of OWNER_EMAIL before the "@"; the shorter user name
  # leaves more characters for the job name.
  local DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

  # For AoU RWB projects the network is named "network" and the subnetwork "subnetwork".
  local AOU_NETWORK=network
  local AOU_SUBNETWORK=subnetwork

  # Values computed at call time: the active gcloud account and a
  # YYYYmmdd/HHMMSS stamp that becomes part of the default log path.
  local AOU_SERVICE_ACCOUNT="$(gcloud config get-value account)"
  local AOU_LOG_STAMP="$(date +'%Y%m%d/%H%M%S')"

  # Defaults come first and the caller's arguments ("$@") last, so callers can
  # repeat a flag to override it.
  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
      --network "${AOU_NETWORK}" \
      --subnetwork "${AOU_SUBNETWORK}" \
      --service-account "${AOU_SERVICE_ACCOUNT}" \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/${AOU_LOG_STAMP}/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

Overwriting /home/jupyter/aou_dsub.bash


# make exome file map

In [None]:
# Zero-padded 10-digit numbers 0000000000 .. 0000020016, the same numbering the
# exome-map tasks table uses for the VCF shard file names.
numbers = [str(shard).zfill(10) for shard in range(20017)]

In [None]:
!gsutil cp exome_vcf_file_map.py ${WORKSPACE_BUCKET}/exome_map/input/

In [5]:
!gsutil cp ${WORKSPACE_BUCKET}/exome_map/input/exome_vcf_file_map.py ./

Copying gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/exome_map/input/exome_vcf_file_map.py...
/ [1 files][  1.1 KiB/  1.1 KiB]                                                
Operation completed over 1 objects/1.1 KiB.                                      


In [None]:
!gsutil ls {bucket}/exome_map/input/

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'exome-map-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input VCF_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in range(0, 20017)],
    '--input SCRIPT': [f"{bucket}/exome_map/input/exome_vcf_file_map.py" for _ in range(20017)],
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 20017)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/exome_map/output/" for _ in range(20017)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/general_python:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/exome_map/" \
  --disk-size 7 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'python ${SCRIPT} \
              --input ${VCF_FILE} \
              --file_number ${FILE_NUM} \
              --output_dir ${OUTPUT_DIR}'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'exome-map---kathleen-cardone--250226-195148-20' \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/exome_map/output/

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/exome_map/output/* | head

# bcftools Split Multiallelic Variants

## check out size of VCF files

In [None]:
!gsutil -u $GOOGLE_PROJECT ls -lh gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/

## command

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-norm-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input INPUT_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in range(0, 18465)],
    '--input INDEX': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz.tbi" for x in range(0, 18465)],
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --min-ram 26 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools norm -m- \
              $INPUT_FILE \
              -Oz -o ${OUTPUT_DIR}/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'RUNNING'

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'SUCCESS' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'FAILURE' | wc -l

In [None]:
%%bash
# Build norm_failed.txt from the FAILURE rows of the dstat table:
#   1. keep the second whitespace-separated column of every line,
#   2. drop lines containing "Name" or "-" and empty lines (header / separator rows),
#   3. subtract 1 from what is left (presumably 1-based task number -> 0-based
#      file number; confirm against the tasks table).
dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'FAILURE' \
  | awk '{print $2}' \
  | grep -v -e 'Name' -e '-' -e '^$' \
  | awk '{print $1 - 1}' \
  > norm_failed.txt
tail norm_failed.txt

In [None]:
import pandas as pd

# norm_failed.txt has no header row; its first column becomes a plain Python list.
failed_df = pd.read_csv('norm_failed.txt', header=None)
failed_list = failed_df.iloc[:, 0].tolist()
print(len(failed_list))
failed_list

## rerun command for tasks that ran out of memory

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-norm-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input INPUT_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in failed_list],
    '--input INDEX': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz.tbi" for x in failed_list],
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(115)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools norm -m- \
              $INPUT_FILE \
              -Oz -o ${OUTPUT_DIR}/exome_v8.${FILE_NUM}.exwas_genes_only.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-214322-30' \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!gsutil cp ${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/bcftools-n--kathleen-cardone--250227-200958-99.90-* .

In [None]:
!cat bcftools-n--kathleen-cardone--250227-200958-99.90-stderr.log

In [None]:
!gsutil -u $GOOGLE_PROJECT ls -lh gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/ | tail

In [None]:
!gsutil ls -lh ${WORKSPACE_BUCKET}/split_multiallelic/ | tail

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/split_multiallelic/ | wc -l

# change filename

## run command

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'norm_change_filename-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(1897, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(16568)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'cp $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.exwas_genes_only.split_multiallelic.vcf.gz $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# bcftools Sites Only & Pass QC

## Command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -G -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'SUCCESS' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'RUNNING' | wc -l

## rerun files that failed on AoU

In [None]:
import pandas as pd

# Expects sites_only_pass_qc.failed_numbers.txt in the working directory, with no
# header row; its first column becomes the list of shard numbers to resubmit.
# NOTE(review): no cell in this notebook section creates that file; confirm where it comes from.
failed_df = pd.read_csv('sites_only_pass_qc.failed_numbers.txt', header=None)
failed_list = failed_df.iloc[:, 0].tolist()
print(len(failed_list))

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(139)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -G -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/sites_only_pass_QC/ | grep '.gz' | wc -l

# bcftools Pass QC Only (for PLINK files)

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# index sites only VCF files for merge

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_sites_only-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/sites_only_pass_QC/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## get status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# index pass QC VCF files for merge

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_pass_qc-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/pass_QC_VCF/*.tbi | head

## redo chunk that failed

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_pass_qc-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in [3307]],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(1)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250304-141347-83" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!gsutil ls ${WORKSPACE_BUCKET}/pass_QC_VCF/*.tbi | wc -l

# merge sites only vcf files by chromosome

## command

In [None]:
!gsutil ls -lh ${WORKSPACE_BUCKET}/sites_only_pass_QC/ | tail

In [None]:
!wc -l AOU_v8.srWGS_exome_vcf.file_map/AOU_v8.srWGS_exome_vcf.chr1.sites_only_pass_qc.merge_list.txt

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'sites_only_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(22)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 3 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools merge -l $BUCKET/sites_only_pass_QC/AOU_v8.srWGS_exome_vcf.chr${CHR}.sites_only_pass_qc.merge_list.txt --force-samples \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "sites-only--kathleen-cardone--250304-190259-66" \
    --users "kathleen-cardone" \
    --status 'FAILURE'

In [None]:
!gsutil cp ${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/sites-only--kathleen-cardone--250304-190259-66.19-* .

In [None]:
!cat sites-only--kathleen-cardone--250304-190259-66.19-stderr.log

In [None]:
!cat sites-only--kathleen-cardone--250304-190259-66.19-stdout.log

In [None]:
!gsutil cp gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/sites_only_pass_QC/exome_v8.0000017069.split_multiallelic.sites_only.pass_qc.vcf.gz .

In [None]:
!zcat exome_v8.0000017069.split_multiallelic.sites_only.pass_qc.vcf.gz | tail

In [None]:
!gzip -t exome_v8.0000017069.split_multiallelic.sites_only.pass_qc.vcf.gz

In [None]:
!gsutil ls gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/sites_only_pass_QC/exome_v8.0000012180*

In [None]:
!gsutil ls gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/sites_only_pass_QC/exome_v8.0000017069*

## redo failed chromosomes

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'sites_only_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,2,3,6,11,12,17,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(8)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 3 \
  --min-ram 15 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools merge -l $BUCKET/sites_only_pass_QC/AOU_v8.srWGS_exome_vcf.chr${CHR}.sites_only_pass_qc.merge_list.txt --force-samples \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "sites-only--kathleen-cardone--250305-150319-50" \
    --users "kathleen-cardone" \
    --status '*'

# merge pass QC only VCF files

## command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'pass_qc_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(22)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 315 \
  --min-ram 300 \
  --min-core 16 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools concat -a -f $BUCKET/pass_QC_VCF/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.merge_list.txt \
              --threads 16 \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250311-142457-14" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250311-142457-14" \
    --users "kathleen-cardone" \
    --status 'FAILURE'

In [None]:
!gsutil cp ${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/pass-qc-me--kathleen-cardone--250311-142457-14.17* .

## redo chromosomes that failed

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'pass_qc_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(2)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 315 \
  --min-ram 400 \
  --min-core 16 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools concat -a -f $BUCKET/pass_QC_VCF/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.merge_list.txt \
              --threads 16 \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250312-152422-07" \
    --users "kathleen-cardone" \
    --status '*'

# bcftools update variant ID in sites only/pass QC files

## command for chromosomes that merged

In [None]:
!gsutil ls -lh ${WORKSPACE_BUCKET}/pass_QC_VCF/*chr*

In [1]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(22)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/sites_only_pass_QC/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

env: USER_NAME=kathleen-cardone
env: JOB_NAME=bcftools-annotate-kathleen-cardone
env: PARAMETER_FILENAME=bcftools-annotate-kathleen-cardone_params.tsv
Job properties:
  job-id: bcftools-a--kathleen-cardone--250321-171640-47
  job-name: bcftools-annotate-kathleen-cardone
  user-id: kathleen-cardone
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/17051743046552730538
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/1408123632464452589
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/7621773929260110014
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/3024405971926822848
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/2217785990500010288
Provider internal-id (operation): projects/540927738276/locations/us-central1/operations/16441849479250758734
Provider internal-id (operation): projects/54

## check job status

In [4]:
# Show the tasks of the dsub job named below, whatever their status ('*').
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250321-171640-47" \
    --users "kathleen-cardone" \
    --status '*'

Job Name           Task  Status    Last Update
---------------  ------  --------  --------------
bcftools-ann...      22  Success   03-21 17:18:53
bcftools-ann...      21  Success   03-21 17:18:52
bcftools-ann...      20  Success   03-21 17:19:05
bcftools-ann...      19  Success   03-21 17:19:15
bcftools-ann...      18  Success   03-21 17:18:41
bcftools-ann...      17  Success   03-21 17:18:51
bcftools-ann...      16  Success   03-21 17:19:02
bcftools-ann...      15  Success   03-21 17:18:51
bcftools-ann...      14  Success   03-21 17:18:57
bcftools-ann...      13  Success   03-21 17:19:07
bcftools-ann...      12  Success   03-21 17:22:21
bcftools-ann...      11  Success   03-21 17:18:52
bcftools-ann...      10  Success   03-21 17:19:05
bcftools-ann...       9  Success   03-21 17:18:56
bcftools-ann...       8  Success   03-21 17:19:05
bcftools-ann...       7  Success   03-21 17:19:02
bcftools-ann...       6  Success   03-21 17:18:50
bcftools-ann...       5  Success  

In [5]:
# List the newID_VCF/ objects whose names contain "sites_only".
!gsutil ls ${WORKSPACE_BUCKET}/newID_VCF/*sites_only*

gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr1.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr10.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr11.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr12.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr13.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr14.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr15.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_VCF/exome_v8.chr16.new_id.split_m

In [None]:
# Copy the re-annotated chr22 sites-only VCF from the bucket into the current directory.
!gsutil cp ${WORKSPACE_BUCKET}/newID_VCF/exome_v8.chr22.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz .

In [None]:
!zcat exome_v8.chr22.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz | grep -v '##' | head

# bcftools update variant ID in pass QC files (for PLINK files)

## command for chromosomes that merged

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [8,9,13,14,15,18,20,21,22],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(9)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 120 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Show the tasks of the dsub job named below, whatever their status ('*').
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-012340-24" \
    --users "kathleen-cardone" \
    --status '*'

## command for chunked files

### chr 1-7

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 7846)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(7846)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

### chr 10-12

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(9271, 12199)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(2928)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 16-17

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(13904, 15681)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(1777)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 19

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(16057, 17397)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(1340)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

# make new PLINK files

## command for merged chromosomes

In [None]:
# Long listing (human-readable sizes) of the newID_VCF/ objects matching *chr*split_multiallelic.pass_qc*.
!gsutil ls -lh ${WORKSPACE_BUCKET}/newID_VCF/*chr*split_multiallelic.pass_qc*

In [None]:
# Long listing (human-readable sizes) of the newID_PLINK/ objects whose names contain "chr".
!gsutil ls -lh ${WORKSPACE_BUCKET}/newID_PLINK/*chr*

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [8,9,13,14,15,18,20,21,22],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(9)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 300 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Show the tasks of the dsub job named below, whatever their status ('*').
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-133518-68" \
    --users "kathleen-cardone" \
    --status '*'

## command for chromosome chunks

### chr 1-7

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 7846)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(7846)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chunks that failed from chr 1-7 (rerun after AoU-side failures)

In [None]:
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | grep -v 'Job' | grep -v '\--' | sed 's/    /,/g' | sed 's/  /,/g' > chr1_7.convert_plink.failed.csv

In [None]:
import pandas as pd

# Load the header-less table of failed tasks; with header=None pandas labels
# the columns 0, 1, 2, ...
failed_df = pd.read_csv('chr1_7.convert_plink.failed.csv', header=None)

# FILE_NUM is column 1 minus one -- presumably mapping the 1-based dsub task id
# onto the 0-based chunk number used in the chunk file names (verify against
# the task table of the original chr 1-7 submission).
failed_df = failed_df.assign(FILE_NUM=failed_df[1] - 1)
failed_list = failed_df['FILE_NUM'].tolist()

# Show the chunk numbers to rerun, followed by how many there are.
print(failed_list, len(failed_list), sep="\n")

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(40)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 10-12

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(9271, 12199)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(2928)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 16-17

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(13904, 15681)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(1777)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 19

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(16057, 17397)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(1340)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [None]:
# Count the lines dstat prints for tasks of the job below that are in FAILURE status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in SUCCESS status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of the job below that are in RUNNING status.
# wc -l counts every printed line, so any table header lines are part of the number.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

# merge chunked plink files

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,2,3,4,5,6,7,10,11,12,16,17,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(13)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 500 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --pmerge-list $BUCKET/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.plink_merge_list.txt \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

In [1]:
# Show the tasks of the dsub job named below, whatever their status ('*').
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-merg--kathleen-cardone--250319-215818-28" \
    --users "kathleen-cardone" \
    --status '*'

Job Name           Task  Status    Last Update
---------------  ------  --------  --------------
plink-merge-...      13  Success   03-20 01:21:59
plink-merge-...      12  Success   03-20 00:28:40
plink-merge-...      11  Success   03-19 23:51:15
plink-merge-...      10  Success   03-20 00:29:56
plink-merge-...       9  Success   03-20 00:39:47
plink-merge-...       8  Success   03-19 23:58:35
plink-merge-...       7  Success   03-20 00:16:37
plink-merge-...       6  Success   03-20 00:36:36
plink-merge-...       5  Success   03-20 00:09:09
plink-merge-...       4  Success   03-19 23:56:03
plink-merge-...       3  Success   03-20 00:36:44
plink-merge-...       2  Success   03-20 01:08:51
plink-merge-...       1  Success   03-20 02:48:39



In [2]:
# List the newID_PLINK/ objects whose names contain "chr".
!gsutil ls ${WORKSPACE_BUCKET}/newID_PLINK/*chr*

gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr1.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr10.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr11.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr12.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr16.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr17.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr19.pass_qc_only.plink_merge_list.txt
gs://fc-secure-5e490ca2-d5ae-40a3-aa5e-7355e31ab9cc/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr2.pass_qc_only.plink_m