In [114]:
import pandas as pd
import numpy as np

### GWAS summary datasets

In [85]:
bbj_path = "/wynton/scratch/BMI206_NIC/BBJ/hum0197.v3.BBJ.PLT.v1/cl_GWASsummary_PLT_Japanese_SakaueKanai2020.auto.txt"
bbj = pd.read_csv(bbj_path, sep="\t")

In [86]:
page_path = "/wynton/scratch/BMI206_NIC/PAGE_data/cl_WojcikG_PMID_platelets"
page = pd.read_csv(page_path, sep="\t")

In [87]:
# clean bbj and page data to have only rsIDs by themselves
bbj['SNP'] = bbj['SNP'].str.split(':').str[0]
page['SNP'] = page['SNP'].str.split(':').str[0]


### Ref datasets

Not considering EUR because the BBJ is from Asia and PAGE is from AMR

In [4]:
# ref_eur_path = '/wynton/scratch/BMI206_NIC/ref_datasets/1KG_datasets/ldblk_1kg_eur/snpinfo_1kg_hm3'
# ref_eur = pd.read_csv(ref_eur_path, sep="\t", usecols=[1])

In [88]:
ref_amr_path = '/wynton/scratch/BMI206_NIC/ref_datasets/1KG_datasets/ldblk_1kg_amr/snpinfo_1kg_hm3'
ref_amr = pd.read_csv(ref_amr_path, sep="\t", usecols=[1])

In [89]:
ref_eas_path = '/wynton/scratch/BMI206_NIC/ref_datasets/1KG_datasets/ldblk_1kg_eas/snpinfo_1kg_hm3'
ref_eas = pd.read_csv(ref_eas_path, sep="\t", usecols=[1])

### Target datasets

In [90]:
alz_path = '/wynton/scratch/BMI206_NIC/naracGenos-gaw16raw.bim'
alz = pd.read_csv(alz_path, sep="\t", usecols=[1], header=None)
alz.columns = ['SNP']

## Intersect

In [91]:
bbj_set = set(bbj["SNP"])
print('bbj_set complete')

page_set = set(page["SNP"])
print('page_set complete')

# ref_eur_set = set(ref_eur["SNP"])
# print('ref_eur_set complete')

ref_amr_set = set(ref_amr["SNP"])
print('ref_amr_set complete')

ref_eas_set = set(ref_eas["SNP"])
print('ref_eas_set complete')

alz_set = set(alz["SNP"])
print('alz_set complete')


bbj_set complete
page_set complete
ref_amr_set complete
ref_eas_set complete
alz_set complete


In [92]:
interset = list(bbj_set.intersection(page_set, ref_amr_set, ref_eas_set, alz_set))

In [93]:
len(interset)

395406

Ok, so I established that there is an intersection between the three datasets I would need. Yay!

## Saving the file as a csv

In [94]:
# Convert the list to a DataFrame
interset_df = pd.DataFrame(interset, columns=['SNP'])

# Save the DataFrame to a CSV file
interset_df.to_csv('interset_2.csv', index=False)

# Downsampling GWAS summary stats

In order to have even representation across the different SNPs, I need to downsample the SNPs to be sure I am capturing an even number of SNPs.

In [95]:
page_i_alz = page_set.intersection(alz_set)
len(page_i_alz)

520390

In [96]:
page_i_amr = page_set.intersection(ref_amr_set)
len(page_i_amr)

1171405

In [97]:
bbj_i_alz = bbj_set.intersection(alz_set)
len(bbj_i_alz)

442859

In [98]:
bbj_i_eas = bbj_set.intersection(ref_eas_set)
len(bbj_i_eas)

957779

In [99]:
# all SNPs that do not overlap with either alz or amr or eas
page_anti_set = page_set.difference(alz_set, ref_amr_set, ref_eas_set)

In [100]:
# all SNPs that do not overlap with either alz or amr or eas
bbj_anti_set = bbj_set.difference(alz_set, ref_amr_set, ref_eas_set)

In [101]:
len(page_anti_set)

24081415

In [102]:
len(bbj_anti_set)

9402659

This is the set I will sample from!

In [103]:
len(interset)

395406

^ This is the number of SNPs I will downsample to with the remaining GWAS rows

In [59]:
# Sample DataFrame from your description
df = pd.DataFrame({
    'SNP': ['rs28544273', 'rs28527770', 'rs3094315', 'rs3131971', 'rs3115860'],
    'A1': ['T', 'T', 'G', 'T', 'C'],
    'A2': ['A', 'C', 'A', 'C', 'A'],
    'BETA': [-0.002152, -0.002130, 0.001966, 0.002179, 0.002134],
    'SE': [0.004887, 0.004876, 0.004516, 0.004754, 0.004841]
})

### BBJ simulation population

In [104]:
bbj_interset = bbj[bbj['SNP'].isin(interset)]

In [105]:
bbj_diff = bbj[~bbj['SNP'].isin(interset)].sample(len(interset), random_state=42)

In [106]:
bbj_sim_pop = pd.concat([bbj_interset, bbj_diff], axis=0, ignore_index=True)

In [107]:
bbj_sim_pop.shape

(790812, 5)

### PAGE simulation population

In [108]:
page_interset = page[page['SNP'].isin(interset)]

In [109]:
page_diff = page[~page['SNP'].isin(interset)].sample(len(interset), random_state=42)

In [110]:
page_sim_pop = pd.concat([page_interset, page_diff], axis=0, ignore_index=True)

In [111]:
page_sim_pop.shape

(790812, 5)

## Save the simulation GWAS data

In [112]:
bbj_sim_pop.to_csv('simulations/bbj_sim_pop.tsv', sep='\t', index=False, encoding='utf-8')

In [113]:
page_sim_pop.to_csv('simulations/page_sim_pop.tsv', sep='\t', index=False, encoding='utf-8')

## Save the individual GWAS summary data

In [116]:
bbj_diff.to_csv('simulations/bbj_only_pop.tsv', sep='\t', index=False, encoding='utf-8')
bbj_interset.to_csv('simulations/bbj_share_pop.tsv', sep='\t', index=False, encoding='utf-8')
page_diff.to_csv('simulations/page_only_pop.tsv', sep='\t', index=False, encoding='utf-8')
page_interset.to_csv('simulations/page_share_pop.tsv', sep='\t', index=False, encoding='utf-8')

# Supplemental

In [42]:
'4:117161848' in page_set

True

seems to be using GRCh37 cause it matched with rs9999998 here with 4:116240692 (GRCh37)

24/11/26 I am trying to merge the rsID stuff

In [25]:
import requests
import csv
import time


In [26]:

def fetch_rsids(df, retries=3, delay=1):
    results = []
    for index, row in df.iterrows():
        attempts = 0
        while attempts < retries:
            url = f"https://rest.ensembl.org/variation/human/chr{row['chromosome']}:{row['position']}_{row['ref']}/{row['alt']}?content-type=application/json"
            response = requests.get(url)
            if response.status_code == 200:
                data = response.json()
                rsid = data.get('name', 'Not found')
                results.append({
                    'chromosome': row['chromosome'],
                    'position': row['position'],
                    'ref': row['ref'],
                    'alt': row['alt'],
                    'rsid': rsid
                })
                break  # Exit the retry loop on success
            else:
                attempts += 1
                time.sleep(delay)  # Wait before retrying
                print(f"Retry {attempts}/{retries} for position {row['position']}")
        if attempts == retries:
            print(f"Error fetching data after {retries} attempts for position {row['position']}")
            results.append({
                'chromosome': row['chromosome'],
                'position': row['position'],
                'ref': row['ref'],
                'alt': row['alt'],
                'rsid': 'Error fetching data'
            })

    return pd.DataFrame(results)

In [7]:
page_small = page[:10]

In [21]:
# Convert data to a DataFrame
df = pd.DataFrame(page_small[1:], columns=['variant'])  # Skip the header line

# Split the 'variant' column into separate columns
df[['chromosome', 'position', 'ref', 'alt']] = df['variant'].str.split(r'[\s:]', expand=True)

# Drop the original 'variant' column
df.drop(columns=['variant'], inplace=True)

# Display the DataFrame
print(df)

  chromosome position ref alt
1          1    69487   G   A
2          1    69569   T   C
3          1   139853   C   T
4          1   692794  CA   C
5          1   693731   A   G
6          1   707522   G   C
7          1   717587   G   A
8          1   723329   A   T
9          1   730087   T   C


In [27]:
# 
snps_with_rsids = fetch_rsids(df)
for snp in snps_with_rsids:
    print(snp)

Retry 1/3 for position 69487
Retry 2/3 for position 69487
Retry 3/3 for position 69487
Error fetching data after 3 attempts for position 69487
Retry 1/3 for position 69569
Retry 2/3 for position 69569
Retry 3/3 for position 69569
Error fetching data after 3 attempts for position 69569
Retry 1/3 for position 139853
Retry 2/3 for position 139853
Retry 3/3 for position 139853
Error fetching data after 3 attempts for position 139853
Retry 1/3 for position 692794
Retry 2/3 for position 692794
Retry 3/3 for position 692794
Error fetching data after 3 attempts for position 692794
Retry 1/3 for position 693731
Retry 2/3 for position 693731
Retry 3/3 for position 693731
Error fetching data after 3 attempts for position 693731
Retry 1/3 for position 707522
Retry 2/3 for position 707522
Retry 3/3 for position 707522
Error fetching data after 3 attempts for position 707522
Retry 1/3 for position 717587
Retry 2/3 for position 717587
Retry 3/3 for position 717587
Error fetching data after 3 attempts

## UKBB

In [26]:
# Splitting the 'variants' column and keeping only the chromosome and position
page['chromosome_position'] = page['variant'].str.split(':').str[:2].str.join(':')

In [27]:
page

Unnamed: 0,variant,chromosome_position
0,1:15791:C:T,1:15791
1,1:69487:G:A,1:69487
2,1:69569:T:C,1:69569
3,1:139853:C:T,1:139853
4,1:692794:CA:C,1:692794
...,...,...
13791462,X:154929412:C:T,X:154929412
13791463,X:154929637:CT:C,X:154929637
13791464,X:154929952:CAA:C,X:154929952
13791465,X:154930230:A:G,X:154930230


In [28]:
# page.sort_values(by='variant', ascending=False)['variant']

In [29]:
# bbj.sort_values(by='SNP', ascending=False)

## Other

https://hgdownload.soe.ucsc.edu/downloads.html

https://www.ncbi.nlm.nih.gov/snp/?term=4%3A116240692

https://www.biostars.org/p/312369/

https://github.com/milospjanic/chrPos2rsID

https://ftp.ncbi.nih.gov/snp/

https://ftp.ncbi.nih.gov/snp/organisms/