# GWAS catalog filtering notebook

The purpose of this notebook is to filter GWAS Catalog associations and traits based on the curator's comments and our needs.

In [64]:
import polars as pl
from pathlib import *

### Setting up paths

In [82]:
# Directory layout, relative to the notebook's working directory:
#   data/input  - source tables read by the cells below
#   data/output - destination for the files written at the end of the notebook
base = Path(".")
data = base.joinpath("data")
input = data.joinpath("input")  # NOTE: shadows the builtin `input`; name kept because later cells reference it
output = data.joinpath("output")

In [42]:
# Load the tab-separated traits table from the input folder and preview the first rows.
traits_path = input.joinpath("traits_no_measurement.tsv")
traits_df = pl.read_csv(
    str(traits_path),
    sep="\t",
)
traits_df.head(3)

trait,trait_uri,studies,association_counts,papers,Olga's comments
str,str,i64,i64,i64,str
"""clinical treat...","""http://www.ebi...",687,31,687,
"""employment sta...","""http://www.ebi...",687,10,687,
"""cortical thick...","""http://www.ebi...",325,1292,325,


Let's filter by the curator's comments and fix typos.

In [76]:
# Expression that normalises the free-text curator comments into module identifiers.
#
# NOTE: polars `str.replace` substitutes only the FIRST match in each string, so
# every clean-up step uses `str.replace_all`. Separators are converted before the
# remaining whitespace, otherwise "a, b" would become "a,_b" and the separator
# step would no longer produce a clean "a_and_b".
curator_comments = (
    pl.col("Olga's comments")
        .str.replace_all(r"\?", "")  # drop question marks (raw string: "\?" is an invalid escape in a plain literal)
        .str.strip()  # strip leading and trailing spaces, they are often added accidentally and cause trouble
        .str.to_lowercase()
        .str.replace_all(r"\s*[,/]\s*", "_and_")  # "a, b", "a,b" and "a/b" all become "a_and_b"
        .str.replace_all(r"\s+", "_")  # underscores instead of the remaining spaces
        .str.replace_all("muskuloskeletal", "musculoskeletal")  # typo corrections
        .str.replace_all("musculosceletal", "musculoskeletal")
        .str.replace_all("obesiy", "obesity")
        .str.replace_all("homeostais", "homeostasis")
)  # cleaned-up curator comments column

# Keep only the traits the curator commented on, apply the clean-up expression
# (it overwrites the original column in place) and give the column a neutral name.
traits_filtered: pl.DataFrame = (
    traits_df
    .filter(pl.col("Olga's comments").is_not_null())
    .with_column(curator_comments)
    .rename({"Olga's comments": "curator_comments"})
)
traits_filtered.head(10)

trait,trait_uri,studies,association_counts,papers,curator_comments
str,str,i64,i64,i64,str
"""type 2 diabete...","""http://purl.ob...",175,5003,175,"""glucose_homeos..."
"""body mass inde...","""http://www.ebi...",151,8912,151,"""glucose_homeos..."
"""COVID-19""","""http://purl.ob...",123,659,123,"""lung"""
"""asthma""","""http://purl.ob...",112,1475,112,"""lung"""
"""Alzheimer dise...","""http://purl.ob...",89,669,89,"""mental"""
"""bone fracture""","""http://www.ebi...",85,53,85,"""musculoskeleta..."
"""systolic blood...","""http://www.ebi...",73,3435,73,"""cardiovascular..."
"""coronary arter...","""http://www.ebi...",72,2408,72,"""cardiovascular..."
"""diastolic bloo...","""http://www.ebi...",68,2316,68,"""cardiovascular..."
"""osteoarthritis...","""http://purl.ob...",60,99,60,"""musculoskeleta..."


In [77]:
# Collect every column into per-module lists, then explode the lists back into rows.
# The result has `curator_comments` as the first column and the rows of each
# module stored contiguously.
traits_grouped = (
    traits_filtered
    .groupby("curator_comments")
    .agg(pl.all())
)
traits = traits_grouped.explode(
    [name for name in traits_grouped.columns if name != "curator_comments"]
)
traits.head(10)

curator_comments,trait,trait_uri,studies,association_counts,papers
str,str,str,i64,i64,i64
"""other""","""age-related ma...","""http://www.ebi...",31,128,31
"""other""","""age at menopau...","""http://www.ebi...",22,327,22
"""other""","""age-related he...","""http://www.ebi...",13,270,13
"""other""","""menopause""","""http://www.ebi...",4,0,4
"""other""","""response to mT...","""http://www.ebi...",2,13,2
"""other""","""response to mi...","""http://www.ebi...",1,13,1
"""cardiovascular...","""systolic blood...","""http://www.ebi...",73,3435,73
"""cardiovascular...","""coronary arter...","""http://www.ebi...",72,2408,72
"""cardiovascular...","""diastolic bloo...","""http://www.ebi...",68,2316,68
"""cardiovascular...","""stroke""","""http://www.ebi...",46,150,46


## Writing results to folders

Let's clean the output folder first.

In [89]:
import shutil

# Start from an empty output directory so folders written by earlier runs do not linger.
if output.exists():
    shutil.rmtree(output)
# parents=True creates any missing parent directory; exist_ok=True keeps the call safe on re-runs.
output.mkdir(parents=True, exist_ok=True)

Let's create folders for modules

In [83]:
# Distinct cleaned-up curator comments; each one becomes a module (one output folder).
modules = (
    traits_filtered
    .select([pl.col("curator_comments").unique()])
    .to_series()
    .to_list()
)
modules

['longevity',
 'lung',
 'inflammation',
 'other',
 'cardiovascular',
 'glucose_homeostasis_and_obesity',
 'musculoskeletal',
 'mental',
 'metabolic_health_and_obesity']

In [90]:
# Write one folder per module, each containing a TSV with the traits assigned to that module.
for m in modules:
    folder = output / m
    file = folder / (m + ".tsv")
    # exist_ok/parents make this cell re-runnable on its own: a plain mkdir() raises
    # FileExistsError when the folder is left over from a previous execution.
    folder.mkdir(parents=True, exist_ok=True)
    print(f"writing module {m} to {folder} with traits file {file}")
    module_df = traits.filter(pl.col('curator_comments') == m)
    module_df.write_csv(file = str(file), sep='\t')


writing module longevity to data/output/longevity with traits file data/output/longevity/longevity.tsv
writing module lung to data/output/lung with traits file data/output/lung/lung.tsv
writing module inflammation to data/output/inflammation with traits file data/output/inflammation/inflammation.tsv
writing module other to data/output/other with traits file data/output/other/other.tsv
writing module cardiovascular to data/output/cardiovascular with traits file data/output/cardiovascular/cardiovascular.tsv
writing module glucose_homeostasis_and_obesity to data/output/glucose_homeostasis_and_obesity with traits file data/output/glucose_homeostasis_and_obesity/glucose_homeostasis_and_obesity.tsv
writing module musculoskeletal to data/output/musculoskeletal with traits file data/output/musculoskeletal/musculoskeletal.tsv
writing module mental to data/output/mental with traits file data/output/mental/mental.tsv
writing module metabolic_health_and_obesity to data/output/metabolic_health_and_