# Update GWAS Catalog dataset


This script fetches the most recent release from the GWAS Catalog FTP server, names the downloaded files according to the release date, and uploads them to a GCP bucket.

**Warning!!** It does not automatically update the config file in the etl repo; that still has to be done manually.

In [14]:
%%bash

# Remote endpoints and the upload destination used by this cell:
#   BASE_URL          root of the GWAS Catalog FTP area
#   RELEASE_INFO_URL  GWAS Catalog stats endpoint (JSON)
#   GCP_TARGET        bucket folder that receives the downloaded files
BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas
RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats
GCP_TARGET=gs://genetics_etl_python_playground/input/v2d/
export BASE_URL RELEASE_INFO_URL GCP_TARGET

# Function to get the most recent date:
get_most_recent(){
   cat $1 | perl -lane 'push @a, $_ if $_ =~ /^\d+$/; END {@a = sort { $a <=> $b} @a; print pop @a }'
}

# Print the date of the most recent GWAS Catalog release as "YEAR MONTH DAY".
#
# Walks ${BASE_URL}/releases/ one directory level at a time (year, then month,
# then day) and keeps the entry selected by get_most_recent at each level.
#
# Globals:   BASE_URL (read)
# Outputs:   a single line "YEAR MONTH DAY" on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if any level of the listing yields no entry
get_release_url(){
    local releases_url="${BASE_URL}/releases"
    local year month day

    year=$(curl -s --list-only "${releases_url}/" | get_most_recent)
    if [[ -z "${year}" ]]; then
        echo "No release year found under ${releases_url}/" >&2
        return 1
    fi

    month=$(curl -s --list-only "${releases_url}/${year}/" | get_most_recent)
    if [[ -z "${month}" ]]; then
        echo "No release month found under ${releases_url}/${year}/" >&2
        return 1
    fi

    day=$(curl -s --list-only "${releases_url}/${year}/${month}/" | get_most_recent)
    if [[ -z "${day}" ]]; then
        echo "No release day found under ${releases_url}/${year}/${month}/" >&2
        return 1
    fi

    printf '%s %s %s\n' "${year}" "${month}" "${day}"
}

# Print the Ensembl build and EFO version reported by the GWAS Catalog stats
# endpoint, i.e. the versions the GWAS data are grounded on.
#
# Globals:   RELEASE_INFO_URL (read) - stats endpoint; the public GWAS Catalog
#            URL is used when it is unset or empty
# Outputs:   one line "<ensemblbuild> <efoversion>" on stdout
# Returns:   exit status of jq (last command of the pipeline)
get_release_info(){
    local stats_url="${RELEASE_INFO_URL:-https://www.ebi.ac.uk/gwas/api/search/stats}"

    # --fail: on an HTTP error emit nothing rather than piping an error page to jq
    curl -s --fail "${stats_url}" | jq -r '"\(.ensemblbuild) \(.efoversion)"'
}

# Capturing release date. `read` returns non-zero when the helper printed
# nothing; that case is reported by the emptiness check below.
read -r YEAR MONTH DAY < <(get_release_url) || true

# Capturing release metadata:
read -r ENSEMBL EFO < <(get_release_info) || true

# Stop before downloading anything if a value that ends up in the FTP path or
# in a file name is missing ("null" is what jq prints for an absent field).
for field in YEAR MONTH DAY ENSEMBL; do
    if [[ -z "${!field:-}" || "${!field}" == null ]]; then
        echo "Could not determine ${field} of the latest GWAS Catalog release." >&2
        exit 1
    fi
done

echo "$YEAR" "$MONTH" "$DAY" "$ENSEMBL" "${EFO:-}"

# Constructing FTP URL to access the most recent release:
RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY}

# Local file names carry the release date (and the Ensembl build for associations):
RELEASE_TAG=r${YEAR}-${MONTH}-${DAY}
ASSOCIATIONS_FILE=gwas_catalog_v1.0.2-associations_e${ENSEMBL}_${RELEASE_TAG}.tsv
STUDIES_FILE=gwas-catalog-v1.0.3-studies-${RELEASE_TAG}.tsv
ANCESTRIES_FILE=gwas-catalog-v1.0.3-ancestries-${RELEASE_TAG}.tsv
HARMONISED_FILE=harmonised_list-${RELEASE_TAG}.txt

# Download URL $1 into local file $2; abort the cell if the download fails.
fetch_file(){
    if ! wget -q "$1" -O "$2"; then
        echo "Download failed: $1" >&2
        # wget -O creates the output file up front, so remove the leftover
        rm -f -- "$2"
        exit 1
    fi
}

# Fetching files while assigning properly dated and annotated names:
echo "Fetching data from ftp..."
fetch_file "${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv" "${ASSOCIATIONS_FILE}"
fetch_file "${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt" "${STUDIES_FILE}"
fetch_file "${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt" "${ANCESTRIES_FILE}"
fetch_file "${BASE_URL}/summary_statistics/harmonised_list.txt" "${HARMONISED_FILE}"

echo "Copying files to GCP..."
for local_file in "${ASSOCIATIONS_FILE}" "${STUDIES_FILE}" "${ANCESTRIES_FILE}" "${HARMONISED_FILE}"; do
    # "/" after $(pwd): without it the path is only valid when run from "/".
    # ${GCP_TARGET%/}: the target already ends in "/", avoid "v2d//<file>".
    if ! gsutil cp "file://$(pwd)/${local_file}" "${GCP_TARGET%/}/"; then
        echo "Upload failed: ${local_file}" >&2
        exit 1
    fi
done



2023 09 11 110 v3.57.0
Copying files to GCP...


bash: line 34: ${datasets.inputs}/v2d/gwas_catalog_v1.0.2-associations_e107_r2022-11-29.tsv: bad substitution
bash: line 35: ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2022-11-29.tsv: bad substitution
bash: line 36: ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2022-11-29.tsv: bad substitution
bash: line 37: ${datasets.inputs}/v2d/harmonised_list.txt: bad substitution
Copying file:///gwas_catalog_v1.0.2-associations_e110_r2023-09-11.tsv [Content-Type=text/tab-separated-values]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installe