In [12]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor

# Base URL for the GWAS Catalog
base_url = "https://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/"

# URL for the list of harmonized summary statistics files
list_url = base_url + "harmonised_list.txt"

# Download the list of files
response = requests.get(list_url)
file_list = response.text.splitlines()

# Set the number of files to download
num_files = 2
file_list = file_list[:num_files]

# Set the path where the files should be saved
path = 'C:\\Users\\falty\\Desktop\\geometric-omics\\GWAS-Catalog\\data\\'

def download_file(file_url):
    # Download the file
    response = requests.get(file_url)

    # Get the file name from the file_url
    filename = file_url.split("/")[-1]

    # Create full file path
    file_path = os.path.join(path, filename)

    # Write the data to a file
    with open(file_path, 'wb') as f:
        f.write(response.content)
    
    # Return the file name
    return filename

# Set the number of threads
num_threads = 12

# Create a ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Use the executor to map the download_file function
    # to the list of file URLs
    file_urls = [base_url + file for file in file_list]
    file_names = list(executor.map(download_file, file_urls))

# Print the file names
print("File names:")
for name in file_names:
    print(name)

File names:
35078996-GCST90086737-EFO_0007937.h.tsv.gz
35078996-GCST90086591-EFO_0007937.h.tsv.gz


In [28]:
import pandas as pd
import glob

# Define your path here
path = 'C:\\Users\\falty\\Desktop\\geometric-omics\\GWAS-Catalog\\data\\'

# Get a list of all the files ending in .h.tsv.gz in the specified directory
files = glob.glob(path + "/*.h.tsv.gz")

# Define the column types
column_types = {
    'hm_variant_id': str,
    'hm_rsid': str,
    'hm_chrom': str,
    'hm_pos': int,
    'hm_other_allele': str,
    'hm_effect_allele': str,
    'hm_beta': float,
    'hm_odds_ratio': float,
    'hm_ci_lower': float,
    'hm_ci_upper': float,
    'hm_effect_allele_frequency': float,
    'hm_code': int,
    'variant_id': str,
    'p_value': float,
    'traits': str
}

# Load the summary statistics list into a dataframe
summary_stats_df = pd.read_csv(path + '/list_gwas_summary_statistics.tsv', sep='\t')

# Set the index to be the 'Study Accession' column for easier lookup
summary_stats_df.set_index('Study Accession', inplace=True)

# Create a list to hold dataframes
dfs = []

# Loop over the list of files
for file in files:
    # Extract the filename from the path
    filename = file.split('\\')[-1]

    # Extract the study_accession from the filename
    study_accession = filename.split('-')[1]

    # Get the corresponding traits from the summary statistics list
    traits = summary_stats_df.loc[study_accession, 'Trait(s)']

    # Read the file into a dataframe
    df = pd.read_csv(file, sep='\t', low_memory=False)

    # Convert the 'hm_chrom' column to object type
    df['hm_chrom'] = df['hm_chrom'].astype(object)

    # Add a new column with the traits
    df['traits'] = traits

    # Append the dataframe to the list
    dfs.append(df)

# Concatenate all the dataframes together
result = pd.concat(dfs, ignore_index=True)

# Select only the columns you're interested in
cols = [
    'hm_variant_id', 'hm_rsid', 'hm_chrom', 'hm_pos', 'hm_other_allele',
    'hm_effect_allele', 'hm_beta', 'hm_odds_ratio', 'hm_ci_lower', 'hm_ci_upper',
    'hm_effect_allele_frequency', 'hm_code', 'variant_id', 'p_value', 'traits'
]

result = result[cols]

# Print the final result
print(result.columns)

Index(['hm_variant_id', 'hm_rsid', 'hm_chrom', 'hm_pos', 'hm_other_allele',
       'hm_effect_allele', 'hm_beta', 'hm_odds_ratio', 'hm_ci_lower',
       'hm_ci_upper', 'hm_effect_allele_frequency', 'hm_code', 'variant_id',
       'p_value', 'traits'],
      dtype='object')


In [30]:
result.head()

Unnamed: 0,hm_variant_id,hm_rsid,hm_chrom,hm_pos,hm_other_allele,hm_effect_allele,hm_beta,hm_odds_ratio,hm_ci_lower,hm_ci_upper,hm_effect_allele_frequency,hm_code,variant_id,p_value,traits
0,1_910255_C_T,rs117086422,1,910255.0,C,T,0.017964,,,,0.202,10,rs117086422,0.411336,[blood protein measurement]
1,1_910558_G_A,rs57760052,1,910558.0,G,A,0.016885,,,,0.2036,10,rs57760052,0.438808,[blood protein measurement]
2,1_910698_C_T,rs28612348,1,910698.0,C,T,0.01599,,,,0.1952,10,rs28612348,0.470367,[blood protein measurement]
3,1_911018_G_A,rs58781670,1,911018.0,G,A,0.017412,,,,0.203,10,rs58781670,0.424585,[blood protein measurement]
4,1_911428_C_T,rs4475691,1,911428.0,C,T,0.018561,,,,0.1977,10,rs4475691,0.400094,[blood protein measurement]
