## GWAS catalog filtering notebook

The purpose of this notebook is to filter GWAS associations and traits based on the curation comments and our needs

In [1]:
import polars as pl
from pathlib import *

### Setting up paths

In [2]:
base = Path(".")
data = base / "data"
input = data / "input"
output = data / "output"

In [3]:
traits_path = input / "traits_no_measurement.tsv"
traits_df = pl.read_csv(str(traits_path), sep="\t")
traits_df.head(3)

trait,trait_uri,studies,association_counts,papers,Olga's comments
str,str,i64,i64,i64,str
"""clinical treat...","""http://www.ebi...",687,31,687,
"""employment sta...","""http://www.ebi...",687,10,687,
"""cortical thick...","""http://www.ebi...",325,1292,325,


Filter by curators comments and fix typos

In [4]:
comments = traits_df["Olga's comments"].to_list()

curator_comments = (
    pl.col("Olga's comments")
        .str.strip() # steam leading and ending spaces, they are often added accidentally and cause troubles
        .str.to_lowercase()
        .str.replace(" ", "_") # let's go dash instead of spaces
        .str.replace("/", ", ").str.replace("\?",  "") #clean up from some redundant symbols
        .str.replace("muskuloskeletal", "musculoskeletal").str.replace('musculosceletal', "musculoskeletal") # typos correction
        .str.replace("obesiy", 'obesity')
        .str.replace("homeostais","homeostasis")
        .str.replace(", ", "_and_").str.replace(",", "_and_") #let's first use end and then check if we can go further
) # cleaned up Olga's comments column

traits_filtered: pl.DataFrame = traits_df.filter(pl.col("Olga's comments").is_not_null())\
    .with_column(curator_comments).rename({"Olga's comments": "curator_comments"})
traits_filtered.head(10)


trait,trait_uri,studies,association_counts,papers,curator_comments
str,str,i64,i64,i64,str
"""type 2 diabete...","""http://purl.ob...",175,5003,175,"""glucose_homeos..."
"""body mass inde...","""http://www.ebi...",151,8912,151,"""glucose_homeos..."
"""COVID-19""","""http://purl.ob...",123,659,123,"""lung"""
"""asthma""","""http://purl.ob...",112,1475,112,"""lung"""
"""Alzheimer dise...","""http://purl.ob...",89,669,89,"""mental"""
"""bone fracture""","""http://www.ebi...",85,53,85,"""musculoskeleta..."
"""systolic blood...","""http://www.ebi...",73,3435,73,"""cardiovascular..."
"""coronary arter...","""http://www.ebi...",72,2408,72,"""cardiovascular..."
"""diastolic bloo...","""http://www.ebi...",68,2316,68,"""cardiovascular..."
"""osteoarthritis...","""http://purl.ob...",60,99,60,"""musculoskeleta..."


In [5]:
traits_grouped = traits_filtered.groupby("curator_comments").agg(pl.all())

traits = traits_grouped.explode([c for c in traits_grouped.columns if c != "curator_comments"])
traits.head(10)

curator_comments,trait,trait_uri,studies,association_counts,papers
str,str,str,i64,i64,i64
"""glucose_homeos...","""type 2 diabete...","""http://purl.ob...",175,5003,175
"""glucose_homeos...","""body mass inde...","""http://www.ebi...",151,8912,151
"""glucose_homeos...","""fat body mass""","""http://www.ebi...",40,184,40
"""glucose_homeos...","""obesity""","""http://www.ebi...",38,196,38
"""glucose_homeos...","""waist circumfe...","""http://www.ebi...",31,500,31
"""glucose_homeos...","""diabetes melli...","""http://www.ebi...",25,6,25
"""glucose_homeos...","""body fat perce...","""http://www.ebi...",23,527,23
"""glucose_homeos...","""metabolic synd...","""http://www.ebi...",22,233,22
"""glucose_homeos...","""non-alcoholic ...","""http://www.ebi...",22,169,22
"""glucose_homeos...","""insulin resist...","""http://www.ebi...",12,21,12


## Writing results to folders

let's clean the output first

In [6]:
import shutil
if output.exists():
    shutil.rmtree(output)
output.mkdir()

Let's create folders for modules

In [7]:
modules = traits_filtered.select([pl.col("curator_comments").unique()]).to_series().to_list()
modules

['musculoskeletal',
 'cardiovascular',
 'inflammation',
 'mental',
 'metabolic_health_and_obesity',
 'glucose_homeostasis_and_obesity',
 'other',
 'lung',
 'longevity']

In [8]:
df_list = []

for item in modules:           
    item_folder = output / item
    item_folder.mkdir()
    file_name = item + '.tsv'
    file_path = item_folder / file_name
    print(f"writing module {item} to {item_folder} with traits file {file_path}\n")
    module_df = traits.filter(pl.col('curator_comments')==item)

    module_df.write_csv(file = str(file_path), sep='\t')


writing module musculoskeletal to data\output\musculoskeletal with traits file data\output\musculoskeletal\musculoskeletal.tsv

writing module cardiovascular to data\output\cardiovascular with traits file data\output\cardiovascular\cardiovascular.tsv

writing module inflammation to data\output\inflammation with traits file data\output\inflammation\inflammation.tsv

writing module mental to data\output\mental with traits file data\output\mental\mental.tsv

writing module metabolic_health_and_obesity to data\output\metabolic_health_and_obesity with traits file data\output\metabolic_health_and_obesity\metabolic_health_and_obesity.tsv

writing module glucose_homeostasis_and_obesity to data\output\glucose_homeostasis_and_obesity with traits file data\output\glucose_homeostasis_and_obesity\glucose_homeostasis_and_obesity.tsv

writing module other to data\output\other with traits file data\output\other\other.tsv

writing module lung to data\output\lung with traits file data\output\lung\lung.t

dowloading GWASS catalogs to input folder

In [None]:
import urllib.request

# Associations file v1.0.2

associations_link = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative"
local_associations = "gwass_associations.tsv" 
path_associations = input / local_associations

urllib.request.urlretrieve(associations_link, path_associations)


# Studies file v1.0.3

studies_link = "https://www.ebi.ac.uk/gwas/api/search/downloads/studies_new"
local_studies = "gwass_studies.tsv"
path_studies = input / local_studies

urllib.request.urlretrieve(studies_link, path_studies)

In [9]:
# for local work if catalogas already downloaded

local_associations = "gwass_associations.tsv" 
path_associations = input / local_associations

local_studies = "gwass_studies.tsv"
path_studies = input / local_studies

reading gwass catalogues and filtering needed rows

In [10]:
associations_df = pl.read_csv(str(path_associations),{'CHR_ID': utf}, sep="\t") 
associations_df.head(3)

TypeError: argument 'has_header': 'dict' object cannot be converted to 'PyBool'

In [11]:
studies_df = pl.read_csv(str(path_studies), sep="\t")
studies_df.head(3)

DATE ADDED TO CATALOG,PUBMED ID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,PLATFORM [SNPS PASSING QC],ASSOCIATION COUNT,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY,SUMMARY STATS LOCATION,SUBMISSION DATE,STATISTICAL MODEL,BACKGROUND TRAIT,MAPPED BACKGROUND TRAIT,MAPPED BACKGROUND TRAIT URI
str,i64,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str
"""2016-10-14""",26626624,"""Stuart PE""","""2015-11-28""","""Am J Hum Genet...","""www.ncbi.nlm.n...","""Genome-wide As...","""Cutaneous psor...","""1,363 European...","""up to 2,969 Eu...","""Illumina [1153...",10,"""cutaneous psor...","""http://www.ebi...","""GCST003269""","""Genome-wide ge...",,,,,,
"""2016-10-14""",26626624,"""Stuart PE""","""2015-11-28""","""Am J Hum Genet...","""www.ncbi.nlm.n...","""Genome-wide As...","""Psoriasis vulg...","""4,007 European...","""up to 9,075 Eu...","""Illumina [up t...",39,"""psoriasis vulg...","""http://www.ebi...","""GCST003268""","""Genome-wide ge...",,,,,,
"""2016-10-14""",26626624,"""Stuart PE""","""2015-11-28""","""Am J Hum Genet...","""www.ncbi.nlm.n...","""Genome-wide As...","""Psoriatic arth...","""1,946 European...","""up to 2,883 Eu...","""Illumina [1153...",14,"""psoriatic arth...","""http://www.ebi...","""GCST003270""","""Genome-wide ge...",,,,,,


filtering studies

In [20]:
folders = output.iterdir()
folders

<generator object Path.iterdir at 0x000001EA9AF4D430>

In [21]:
for f in folders:
    file_name = f.iterdir()
    df_path = f / file_name
    df = pl.read_csv(df_path, sep="\t")
    df.head(3)
    

TypeError: unsupported operand type(s) for /: 'WindowsPath' and 'generator'