# setup

## import python modules

In [None]:
import pandas as pd
import os

## define workspace bucket

In [None]:
# Workspace bucket path from the WORKSPACE_BUCKET environment variable (None if unset).
bucket = os.environ.get('WORKSPACE_BUCKET')
bucket

In [None]:
# Display the GOOGLE_PROJECT environment variable (None if unset).
os.environ.get('GOOGLE_PROJECT')

## set up dsub

In [None]:
%%writefile ~/aou_dsub.bash
#!/bin/bash

# This shell function passes reasonable defaults for several dsub parameters, while
# allowing the caller to override any of them. It creates a nice folder structure within
# the workspace bucket for dsub log files.
#
# Usage (the file is meant to be sourced, it only defines the function):
#   source ~/aou_dsub.bash; aou_dsub --name my-job --command '...'

# --[ Parameters ]--
# any valid dsub parameter flag

#--[ Returns ]--
# the job id of the job created by dsub

#--[ Details ]--
# The first five parameters below should always be those values when running on AoU RWB.

# Feel free to change the values for --user, --regions, --logging, and --image if you like.

# Note that we insert some job data into the logging path.
# https://github.com/DataBiosphere/dsub/blob/main/docs/logging.md#inserting-job-data

# Reads from the environment: OWNER_EMAIL, GOOGLE_PROJECT, WORKSPACE_BUCKET.

function aou_dsub () {

  # Get a shorter username to leave more characters for the job name.
  # (Everything before the "@" of OWNER_EMAIL.)
  local DSUB_USER_NAME="$(echo "${OWNER_EMAIL}" | cut -d@ -f1)"

  # For AoU RWB projects network name is "network".
  local AOU_NETWORK=network
  local AOU_SUBNETWORK=subnetwork

  # Caller-supplied flags ("$@") are appended after the defaults.
  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
      --network "${AOU_NETWORK}" \
      --subnetwork "${AOU_SUBNETWORK}" \
      --service-account "$(gcloud config get-value account)" \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

# identify chromosome, start, and stop positions of chunked exome files and create file maps

## copy script to workspace bucket

In [None]:
# Upload the local helper script so dsub tasks can localize it as --input SCRIPT.
!gsutil cp exome_vcf_file_map.py ${WORKSPACE_BUCKET}/exome_map/input/

## command

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'exome-map-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input VCF_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in range(0, 20017)],
    '--input SCRIPT': [f"{bucket}/exome_map/input/exome_vcf_file_map.py" for _ in range(20017)],
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 20017)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/exome_map/output/" for _ in range(20017)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/general_python:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/exome_map/" \
  --disk-size 7 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'python ${SCRIPT} \
              --input ${VCF_FILE} \
              --file_number ${FILE_NUM} \
              --output_dir ${OUTPUT_DIR}'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Show tasks of the exome-map job in any status ('*') for one user.
# The job ID below is a hard-coded literal, not the JOB_ID set by the submit cell.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'exome-map---kathleen-cardone--250226-195148-20' \
    --users "kathleen-cardone" \
    --status '*'

## check output files

In [None]:
# Preview the first lines of the exome-map output listing.
!gsutil ls ${WORKSPACE_BUCKET}/exome_map/output/* | head

# split multiallelic variants with bcftools

## check out size of VCF files

In [None]:
# Long listing with human-readable sizes (-lh) of the source exome VCF directory;
# -u names $GOOGLE_PROJECT as the project billed for the request.
!gsutil -u $GOOGLE_PROJECT ls -lh gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/

## command

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-norm-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input INPUT_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in range(0, 18465)],
    '--input INDEX': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz.tbi" for x in range(0, 18465)],
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --min-ram 26 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools norm -m- \
              $INPUT_FILE \
              -Oz -o ${OUTPUT_DIR}/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# List tasks of the (hard-coded) norm job that are currently RUNNING, for any user.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'RUNNING'

In [None]:
# Count dstat output lines for SUCCESS tasks of the norm job.
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count dstat output lines for FAILURE tasks of the norm job.
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'FAILURE' | wc -l

## identify jobs that did not complete

In [None]:
%%bash
# Write norm_failed.txt with one number per FAILURE task of the norm job.
# Pipeline stages, in order:
#   awk '{print $2}'      keep the 2nd whitespace-separated field of each dstat line
#   grep -v 'Name'        drop lines containing "Name" (presumably the header row — confirm)
#   grep -v '-'           drop lines containing a dash (presumably the separator row — confirm)
#   grep -v '^$'          drop empty lines
#   awk '{print $1 - 1}'  subtract 1 (presumably 1-based task id -> 0-based file number — confirm)
dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-151649-32' \
    --users "*" \
    --status 'FAILURE' | awk '{print $2}' | grep -v 'Name' | grep -v '-' | grep -v '^$' | awk '{print $1 - 1}' > norm_failed.txt
# Show the last lines of the result as a quick sanity check.
tail norm_failed.txt

In [None]:
import pandas as pd

# norm_failed.txt has no header row; its single column (labelled 0) holds the numbers
# of the chunks to resubmit.
failed_df = pd.read_csv('norm_failed.txt', header=None)
failed_list = failed_df[0].tolist()
print(len(failed_list))
failed_list

## resubmit jobs that ran out of memory

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-norm-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--input INPUT_FILE': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz" for x in failed_list],
    '--input INDEX': [f"gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/exome/vcf/{x:010}.vcf.bgz.tbi" for x in failed_list],
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(115)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools norm -m- \
              $INPUT_FILE \
              -Oz -o ${OUTPUT_DIR}/exome_v8.${FILE_NUM}.exwas_genes_only.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status of redone jobs

In [None]:
# Show tasks of the resubmitted norm job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-n--kathleen-cardone--250227-214322-30' \
    --users "kathleen-cardone" \
    --status '*'

## check output files after all jobs finished

In [None]:
# Show the last lines of the long listing of the split_multiallelic output directory.
!gsutil ls -lh ${WORKSPACE_BUCKET}/split_multiallelic/ | tail

In [None]:
# Count the lines of the split_multiallelic directory listing.
!gsutil ls ${WORKSPACE_BUCKET}/split_multiallelic/ | wc -l

# change filename of output file (remove "exwas_genes_only" from filename- not a relevant name here)

## run command

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'norm_change_filename-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(1897, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/split_multiallelic/" for _ in range(16568)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/norm/" \
  --disk-size 10 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'cp $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.exwas_genes_only.split_multiallelic.vcf.gz $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Count dstat output lines for FAILURE tasks of the rename job (hard-coded job ID).
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the rename job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count dstat output lines for SUCCESS tasks of the rename job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'norm-chang--kathleen-cardone--250303-165225-93' \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# filter to variants that passed QC and remove genotypes, creating sites only VCFs, with bcftools (for VEP annotations)

## Command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -G -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Count dstat output lines for SUCCESS tasks of the sites-only view job (hard-coded job ID).
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count dstat output lines for FAILURE tasks of the sites-only view job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the sites-only view job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250303-193616-32' \
    --users "*" \
    --status 'RUNNING' | wc -l

## resubmit jobs that ran out of memory

In [None]:
import pandas as pd

# Header-less, single-column file of chunk numbers to resubmit.
# NOTE(review): no visible cell in this notebook writes this file — confirm how it is produced.
failed_df = pd.read_csv('sites_only_pass_qc.failed_numbers.txt', header=None)
failed_list = failed_df[0].tolist()
print(len(failed_list))

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(139)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -G -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status of redone jobs

In [None]:
# Count dstat output lines for FAILURE tasks of the resubmitted sites-only view job.
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the resubmitted sites-only view job.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count dstat output lines for SUCCESS tasks of the resubmitted sites-only view job.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs 'bcftools-v--kathleen-cardone--250304-150138-81' \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

## check output files after all jobs finished

In [None]:
# Count listing lines that match the pattern '.gz' in the sites_only_pass_QC directory.
# The pattern is an unanchored regex, so names such as *.vcf.gz.tbi would match as well.
!gsutil ls ${WORKSPACE_BUCKET}/sites_only_pass_QC/ | grep '.gz' | wc -l

# filter to variants that passed QC and keep genotypes with bcftools (for PLINK files)

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-view-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/view/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --min-ram 128 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools view -f .,PASS \
              $BUCKET/split_multiallelic/exome_v8.${FILE_NUM}.split_multiallelic.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Show tasks of the pass-QC view job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
# Count dstat output lines for FAILURE tasks of the pass-QC view job.
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the pass-QC view job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count dstat output lines for SUCCESS tasks of the pass-QC view job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-v--kathleen-cardone--250228-145821-62" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# index sites only VCF files for merge

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_sites_only-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/sites_only_pass_QC/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.sites_only.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Count dstat output lines for FAILURE tasks of the sites-only index job (hard-coded job ID).
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the sites-only index job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count dstat output lines for SUCCESS tasks of the sites-only index job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-site--kathleen-cardone--250304-175527-62" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

# index VCF files with genotypes for merge

## command

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_pass_qc-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 18465)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(18465)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Show tasks of the pass-QC index job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
# Count dstat output lines for FAILURE tasks of the pass-QC index job.
# wc -l counts every line printed, so any non-task lines dstat emits are included.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count dstat output lines for RUNNING tasks of the pass-QC index job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count dstat output lines for SUCCESS tasks of the pass-QC index job.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250311-131326-16" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

## check output files

In [None]:
# Preview the first .tbi index files in the pass_QC_VCF directory.
!gsutil ls ${WORKSPACE_BUCKET}/pass_QC_VCF/*.tbi | head

## redo job that ran out of memory

In [None]:
import pandas as pd
import os
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'index_pass_qc-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in [3307]],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(1)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/index/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools index -t \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz.tbi'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status of redone jobs

In [None]:
# Show tasks of the resubmitted pass-QC index job in any status.
# NOTE(review): the timestamp in this job ID (250304) is earlier than the index job
# checked above (250311) — confirm it refers to the intended resubmission.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "index-pass--kathleen-cardone--250304-141347-83" \
    --users "kathleen-cardone" \
    --status '*'

## check output files after all jobs complete

In [None]:
# Count the .tbi index files present in the pass_QC_VCF directory.
!gsutil ls ${WORKSPACE_BUCKET}/pass_QC_VCF/*.tbi | wc -l

# merge sites only vcf files by chromosome

## command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'sites_only_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(22)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 3 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools merge -l $BUCKET/sites_only_pass_QC/AOU_v8.srWGS_exome_vcf.chr${CHR}.sites_only_pass_qc.merge_list.txt --force-samples \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# List FAILURE tasks of the sites-only merge job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "sites-only--kathleen-cardone--250304-190259-66" \
    --users "kathleen-cardone" \
    --status 'FAILURE'

## redo jobs that ran out of memory

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'sites_only_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,2,3,6,11,12,17,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/sites_only_pass_QC/" for _ in range(8)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 3 \
  --min-ram 15 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools merge -l $BUCKET/sites_only_pass_QC/AOU_v8.srWGS_exome_vcf.chr${CHR}.sites_only_pass_qc.merge_list.txt --force-samples \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status of redone jobs

In [None]:
# Show tasks of the resubmitted sites-only merge job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "sites-only--kathleen-cardone--250305-150319-50" \
    --users "kathleen-cardone" \
    --status '*'

# merge pass QC only VCF files

## command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'pass_qc_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(22)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 315 \
  --min-ram 300 \
  --min-core 16 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools concat -a -f $BUCKET/pass_QC_VCF/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.merge_list.txt \
              --threads 16 \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Show tasks of the pass-QC merge job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250311-142457-14" \
    --users "kathleen-cardone" \
    --status '*'

In [None]:
# List FAILURE tasks of the pass-QC merge job (hard-coded job ID).
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250311-142457-14" \
    --users "kathleen-cardone" \
    --status 'FAILURE'

## redo chromosomes that ran out of memory

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'pass_qc_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/pass_QC_VCF/" for _ in range(2)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 315 \
  --min-ram 400 \
  --min-core 16 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools concat -a -f $BUCKET/pass_QC_VCF/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.merge_list.txt \
              --threads 16 \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check status of redone jobs

In [None]:
# Show tasks of the resubmitted pass-QC merge job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "pass-qc-me--kathleen-cardone--250312-152422-07" \
    --users "kathleen-cardone" \
    --status '*'

# update variant ID format for sites only vcf files with bcftools

## command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': list(range(1,23)),
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(22)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/sites_only_pass_QC/exome_v8.chr${CHR}.split_multiallelic.sites_only.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Show tasks of the sites-only annotate job (hard-coded job ID) in any status.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250321-171640-47" \
    --users "kathleen-cardone" \
    --status '*'

## check output files

In [None]:
# List the re-ID'd sites-only outputs in the newID_VCF directory.
!gsutil ls ${WORKSPACE_BUCKET}/newID_VCF/*sites_only*

## inspect one output file to make sure command worked

In [None]:
# Copy the chr22 output into the current working directory for a spot check.
!gsutil cp ${WORKSPACE_BUCKET}/newID_VCF/exome_v8.chr22.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz .

In [None]:
# Print the first lines after dropping every line that contains '##' (the VCF
# meta-information lines), to eyeball the new ID column.
!zcat exome_v8.chr22.new_id.split_multiallelic.sites_only.pass_qc.vcf.gz | grep -v '##' | head

# reformat variant ID in vcf files with genotypes with bcftools

## command for chromosomes that merged

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [8,9,13,14,15,18,20,21,22],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(9)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 120 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.chr${CHR}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

### check job status

In [None]:
# Report the status of every task in the job below; '*' asks dstat for all statuses.
# The job id and user are hard-coded from one particular submission - update them after resubmitting.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-012340-24" \
    --users "kathleen-cardone" \
    --status '*'

## commands for chunked files

### chr 1-7

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 7846)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(7846)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250314-192422-85" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

### chr 10-12

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(9271, 12199)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(2928)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-130823-95" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 16-17

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(13904, 15681)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(1777)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-160403-39" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 19

In [None]:
import os
import pandas as pd
bucket = os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'bcftools-annotate-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(16057, 17397)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_VCF/" for _ in range(1340)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/bcftools:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/bcftools/annotate/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'bcftools annotate --set-id "%CHROM:%POS:%REF:%FIRST_ALT" \
              $BUCKET/pass_QC_VCF/exome_v8.${FILE_NUM}.split_multiallelic.pass_qc.vcf.gz \
              -Oz -o $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "bcftools-a--kathleen-cardone--250317-194848-23" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

# convert VCF files to PLINK2 files

## command for merged chromosomes

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [8,9,13,14,15,18,20,21,22],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(9)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 300 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

### check job status

In [None]:
# Report the status of every task in the job below; '*' asks dstat for all statuses.
# The job id and user are hard-coded from one particular submission - update them after resubmitting.
!dstat \
    --provider google-cls-v2 \
    --project "${GOOGLE_PROJECT}" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-133518-68" \
    --users "kathleen-cardone" \
    --status '*'

## command for chromosome chunks

### chr 1-7

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(0, 7846)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(7846)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### redo chunks in chr1-7 that ran out of memory

#### identify failed chunks

In [None]:
# Write the FAILURE-state tasks of this hard-coded job to chr1_7.convert_plink.failed.csv.
# The pipeline drops lines containing 'Job' and lines containing '--', then turns each run of
# four spaces, and afterwards each run of two spaces, into a comma.
# NOTE(review): the space-to-comma step presumably relies on dstat's column padding being stable,
# and any task row that itself contains 'Job' or '--' would be dropped - verify the CSV before using it.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-140621-85" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | grep -v 'Job' | grep -v '\--' | sed 's/    /,/g' | sed 's/  /,/g' > chr1_7.convert_plink.failed.csv

In [None]:
import pandas as pd

# Read the failed-task rows written by the dstat pipeline; the file has no header row.
# An empty file (no failed tasks) makes read_csv raise EmptyDataError, so fall back to an
# empty frame that still has the column used below.
try:
    failed_df = pd.read_csv('chr1_7.convert_plink.failed.csv', header=None)
except pd.errors.EmptyDataError:
    failed_df = pd.DataFrame({1: pd.Series(dtype='int64')})

# Column 1 is taken as the task number and shifted down by one to get the chunk file number.
# NOTE(review): this assumes task numbers start at 1 and the chr 1-7 chunk numbers start at 0 -
# it does not hold for chunk ranges that start elsewhere.
failed_df['FILE_NUM'] = failed_df[1] - 1
failed_list = failed_df['FILE_NUM'].tolist()
print(failed_list)
print(len(failed_list))

#### resubmit jobs

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in failed_list],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(40)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-180850-57" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 10-12

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(9271, 12199)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(2928)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250318-191511-35" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 16-17

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(13904, 15681)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(1777)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-125200-07" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

### chr 19

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env FILE_NUM': [f"{i:010}" for i in range(16057, 17397)],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(1340)]
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/" \
  --disk-size 10 \
  --min-ram 128 \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --vcf $BUCKET/newID_VCF/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc.vcf.gz \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.${FILE_NUM}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

#### check job status

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in FAILURE state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'FAILURE' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in SUCCESS state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'SUCCESS' | wc -l

In [None]:
# Count the lines dstat prints for tasks of this hard-coded job that are in RUNNING state.
# NOTE(review): `wc -l` counts every output line, so any header/separator rows are included -
# confirm before reading the number as a task count.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-kath--kathleen-cardone--250319-144112-37" \
    --users "kathleen-cardone" \
    --status 'RUNNING' | wc -l

# merge chunked plink files

## command

In [None]:
import os
import pandas as pd
bucket=os.getenv('WORKSPACE_BUCKET')
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'plink_merge-{USER_NAME}'
%env JOB_NAME={JOB_NAME}

params_df = pd.DataFrame(data={
    '--env CHR': [1,2,3,4,5,6,7,10,11,12,16,17,19],
    '--output-recursive OUTPUT_DIR': [f"{bucket}/newID_PLINK/" for _ in range(13)]
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/ritchie-aou-psom-9015/plink2:latest" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/plink/merge/" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --disk-size 500 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'plink2 --pmerge-list $BUCKET/newID_PLINK/AOU_v8.srWGS_exome_vcf.chr${CHR}.pass_qc_only.plink_merge_list.txt \
              --make-pgen \
              --out $OUTPUT_DIR/exome_v8.chr${CHR}.new_id.split_multiallelic.pass_qc'

print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## check job status

In [None]:
# Report the status of every task in the job below; '*' asks dstat for all statuses.
# The job id and user are hard-coded from one particular submission - update them after resubmitting.
!dstat \
    --provider google-cls-v2 \
    --project "$GOOGLE_PROJECT" \
    --location us-central1 \
    --jobs "plink-merg--kathleen-cardone--250319-215818-28" \
    --users "kathleen-cardone" \
    --status '*'

## check output files

In [None]:
# List objects under newID_PLINK/ in the workspace bucket whose names contain "chr".
!gsutil ls ${WORKSPACE_BUCKET}/newID_PLINK/*chr*