## 1. Imports & File Paths

In [1]:
import os
import pandas as pd
import subprocess

# Root locations for raw inputs, processed outputs, and the PLINK executable.
RAW_DATA_DIR = "../data/raw/"
PROCESSED_DATA_DIR = "../data/processed/EUR/"
PLINK_PATH = "../plink"

# Raw inputs; every path is RAW_DATA_DIR plus a file name.
sample_file = RAW_DATA_DIR + "synthetic_v1.sample"
fam_file = RAW_DATA_DIR + "synthetic_v1.fam"
pheno_file = RAW_DATA_DIR + "synthetic_v1.pheno3"
# Prefix only: the chromosome number is appended where the path is used.
chr_plink_path = RAW_DATA_DIR + "synthetic_v1_chr-"

# Filtered outputs; every path is PROCESSED_DATA_DIR plus a file name.
filtered_fam = PROCESSED_DATA_DIR + "synthetic_v1_filtered.fam"
filtered_pheno = PROCESSED_DATA_DIR + "synthetic_v1_filtered.pheno"
# Prefix only, same convention as chr_plink_path.
filtered_chr_plink_path = PROCESSED_DATA_DIR + "synthetic_v1_filtered_chr-"

# Chromosome numbers 1 through 22 inclusive.
chromosomes = [*range(1, 23)]

## 2. Preprocessing

### 2.2 Load `.sample` and `.fam` files

In [2]:
# Read the ancestry labels: a single whitespace-delimited column, no header row.
# The separator is a raw string so that `\s` is not parsed as a (invalid)
# Python string escape, which emits a warning on recent Python versions.
sample_df = pd.read_csv(sample_file, sep=r'\s+', header=None, names=["Ancestry"])

# Read the .fam file using the six column names below.
fam_cols = ["FID", "IID", "Father", "Mother", "Sex", "Phenotype"]
fam_df = pd.read_csv(fam_file, sep=r'\s+', names=fam_cols)

# IIDs are attached to the ancestry labels purely by row position, so the two
# files must have the same number of rows.
# NOTE(review): this also assumes both files list individuals in the same
# order -- confirm against the data source.
if len(sample_df) != len(fam_df):
    raise ValueError(
        f"Row count mismatch: {sample_file} has {len(sample_df)} rows, "
        f"{fam_file} has {len(fam_df)} rows"
    )
sample_df["IID"] = fam_df["IID"].values

sample_df.head(), fam_df.head()

(  Ancestry   IID
 0      AFR  syn1
 1      AFR  syn2
 2      AFR  syn3
 3      AFR  syn4
 4      AFR  syn5,
     FID   IID  Father  Mother  Sex  Phenotype
 0  syn1  syn1       0       0    0         -9
 1  syn2  syn2       0       0    0         -9
 2  syn3  syn3       0       0    0         -9
 3  syn4  syn4       0       0    0         -9
 4  syn5  syn5       0       0    0         -9)

### 2.3 Filter individuals by ancestry

In [3]:
# Ancestry label of the individuals to keep.
selected_group = "EUR"

# Collect the IIDs whose ancestry label equals the selected group.
in_selected_group = sample_df["Ancestry"].eq(selected_group)
filtered_ids = sample_df.loc[in_selected_group, "IID"].tolist()

# Restrict the .fam table to those IIDs.
keep_rows = fam_df["IID"].isin(filtered_ids)
filtered_fam_df = fam_df[keep_rows]

filtered_fam_df.head()

Unnamed: 0,FID,IID,Father,Mother,Sex,Phenotype
504000,syn504001,syn504001,0,0,0,-9
504001,syn504002,syn504002,0,0,0,-9
504002,syn504003,syn504003,0,0,0,-9
504003,syn504004,syn504004,0,0,0,-9
504004,syn504005,syn504005,0,0,0,-9


### 2.4 Load and merge `.pheno3` (phenotype file)

In [4]:
# Read the phenotype table. `sep=r'\s+'` replaces the deprecated
# `delim_whitespace=True`; the "Sample" column is renamed to "IID" so it can be
# used as the merge key (done without inplace mutation).
pheno_df = pd.read_csv(pheno_file, sep=r'\s+').rename(columns={"Sample": "IID"})

# Rebuild the ancestry-filtered .fam rows from fam_df (rather than reusing
# filtered_fam_df) so that re-running this cell gives the same result, then
# attach the binary phenotype. The inner join keeps only IIDs present in both.
filtered_fam_df = fam_df[fam_df["IID"].isin(filtered_ids)].merge(
    pheno_df[["IID", "Phenotype(binary)"]], on="IID", how="inner"
)

# Recode 0/1 to 1/2 (1 = control, 2 = case). Any value outside {0, 1} would
# map to NaN, turn the column into floats and be written as an empty field;
# code it as -9 (PLINK's default missing-phenotype value) and keep integers.
filtered_fam_df["Phenotype"] = (
    filtered_fam_df["Phenotype(binary)"].map({0: 1, 1: 2}).fillna(-9).astype(int)
)
filtered_fam_df = filtered_fam_df.drop("Phenotype(binary)", axis=1)

# Make sure the output directory exists before writing into it.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Space-separated, no header, no index: FID/IID/Phenotype for the phenotype
# file, and all columns for the filtered .fam file.
filtered_fam_df[["FID", "IID", "Phenotype"]].to_csv(filtered_pheno, sep=" ", index=False, header=False)
filtered_fam_df.to_csv(filtered_fam, sep=" ", index=False, header=False)

print(filtered_fam_df["Phenotype"].value_counts())
filtered_fam_df.head()

  pheno_df = pd.read_csv(pheno_file, delim_whitespace=True)


Phenotype
2    84065
1    83935
Name: count, dtype: int64


Unnamed: 0,FID,IID,Father,Mother,Sex,Phenotype
0,syn504001,syn504001,0,0,0,2
1,syn504002,syn504002,0,0,0,2
2,syn504003,syn504003,0,0,0,1
3,syn504004,syn504004,0,0,0,2
4,syn504005,syn504005,0,0,0,2


### 2.6 Filter genotypes with the filtered `.fam` file & apply QC

In [None]:
# Run PLINK once per chromosome: subset to the kept individuals, attach the
# recoded phenotype, apply variant QC filters, and write a new binary fileset.
for chrom in chromosomes:
    print(f"==== Filtering {chrom}... ====")
    plink_filter_cmd = [
        PLINK_PATH, '--bfile', f'{chr_plink_path}{chrom}',
        '--keep', filtered_fam,        # keep only individuals listed in this file
        '--pheno', filtered_pheno,     # phenotype values read from this file
        '--geno', '0.05',              # variant missing-call-rate threshold
        '--hwe', '1e-6',               # Hardy-Weinberg test p-value threshold
        '--maf', '0.01',               # minor-allele-frequency threshold
        '--allow-no-sex',
        '--make-bed',
        '--out', f'{filtered_chr_plink_path}{chrom}'
    ]
    # check=True raises subprocess.CalledProcessError when PLINK exits with a
    # non-zero status, instead of silently moving on to the next chromosome
    # and leaving missing or partial output behind.
    subprocess.run(plink_filter_cmd, check=True)



==== Filtering 1... ====
PLINK v1.9.0-b.7.7 64-bit (22 Oct 2024)            cog-genomics.org/plink/1.9/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ../data/processed/EUR/synthetic_v1_filtered_chr-1.log.
Options in effect:
  --allow-no-sex
  --bfile ../data/raw/synthetic_v1_chr-1
  --geno 0.05
  --hwe 1e-6
  --keep ../data/processed/EUR/synthetic_v1_filtered.fam
  --maf 0.01
  --make-bed
  --out ../data/processed/EUR/synthetic_v1_filtered_chr-1
  --pheno ../data/processed/EUR/synthetic_v1_filtered.pheno

16384 MB RAM detected; reserving 8192 MB for main workspace.
533532 variants loaded from .bim file.
1008000 people (0 males, 0 females, 1008000 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
../data/processed/EUR/synthetic_v1_filtered_chr-1.nosex .
168000 phenotype values present after --pheno.
--keep: 168000 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 168000 founders and 