## 1. Imports & File Paths

In [None]:
import pandas as pd
import subprocess

# Ancestry label to keep; it also names the output sub-directory below.
selected_group = "EUR"

RAW_DATA_DIR = "../data/raw/"
# Derived from selected_group so that switching groups cannot silently
# write results into another group's folder.
PROCESSED_DATA_DIR = f"../data/processed/{selected_group}/"
PLINK_PATH = "../plink"

# Raw inputs.
sample_file = f"{RAW_DATA_DIR}synthetic_v1.sample"
fam_file = f"{RAW_DATA_DIR}synthetic_v1.fam"
pheno_file = f"{RAW_DATA_DIR}synthetic_v1.pheno3"
# Prefix of the per-chromosome PLINK filesets; the chromosome number is appended.
chr_plink_path = f"{RAW_DATA_DIR}synthetic_v1_chr-"

# Filtered outputs (same naming scheme, written under PROCESSED_DATA_DIR).
filtered_fam = f"{PROCESSED_DATA_DIR}synthetic_v1_filtered.fam"
filtered_pheno = f"{PROCESSED_DATA_DIR}synthetic_v1_filtered.pheno"
filtered_chr_plink_path = f"{PROCESSED_DATA_DIR}synthetic_v1_filtered_chr-"

# Chromosomes 1 through 22.
chromosomes = list(range(1, 23))

## 2. Preprocessing

### 2.2 Load `.sample` and `.fam` files

In [None]:
# Load the headerless, whitespace-delimited .sample file as a single "Ancestry" column.
# Raw strings keep "\s" from being parsed as an (invalid) string escape.
sample_df = pd.read_csv(sample_file, sep=r'\s+', header=None, names=["Ancestry"])

# Load the .fam file, naming its six columns.
fam_cols = ["FID", "IID", "Father", "Mother", "Sex", "Phenotype"]
fam_df = pd.read_csv(fam_file, sep=r'\s+', names=fam_cols)

# Attach individual IDs to the ancestry labels by row position.
# NOTE(review): this assumes .sample and .fam list individuals in the same
# order; a length mismatch raises ValueError here, but a reordering would not.
sample_df["IID"] = fam_df["IID"].values

sample_df.head(), fam_df.head()

### 2.3 Filter individuals by ancestry

In [None]:
# IDs of the individuals whose ancestry label equals the selected group.
filtered_ids = sample_df.loc[sample_df["Ancestry"].eq(selected_group), "IID"].tolist()

# Keep only the .fam rows that belong to those individuals.
filtered_fam_df = fam_df.loc[fam_df["IID"].isin(filtered_ids)]

filtered_fam_df.head()

### 2.4 Load and merge `.pheno3` (phenotype file)

In [None]:
# Read the whitespace-delimited phenotype table; its first row supplies the column names.
# sep=r'\s+' replaces the deprecated delim_whitespace=True and matches the other loaders.
pheno_df = pd.read_csv(pheno_file, sep=r'\s+')

# Rename the ID column so it can be joined against the .fam table.
pheno_df = pheno_df.rename(columns={"Sample": "IID"})

# Restrict the .fam rows to the selected individuals and attach the binary phenotype.
# The inner join drops individuals that have no phenotype record.
filtered_fam_df = fam_df[fam_df["IID"].isin(filtered_ids)].merge(
    pheno_df[["IID", "Phenotype(binary)"]], on="IID", how="inner"
)

# Recode to PLINK's case/control convention: 1 = control, 2 = case.
# Values outside {0, 1} map to NaN; write them as -9 (PLINK's missing code) and
# force integers so the output files never contain "1.0" or "nan".
filtered_fam_df["Phenotype"] = (
    filtered_fam_df["Phenotype(binary)"].map({0: 1, 1: 2}).fillna(-9).astype(int)
)
filtered_fam_df = filtered_fam_df.drop(columns="Phenotype(binary)")

# Space-separated, headerless outputs: a three-column phenotype file and the full .fam table.
filtered_fam_df[["FID", "IID", "Phenotype"]].to_csv(filtered_pheno, sep=" ", index=False, header=False)
filtered_fam_df.to_csv(filtered_fam, sep=" ", index=False, header=False)

print(filtered_fam_df["Phenotype"].value_counts())
filtered_fam_df.head()

### 2.6 Filter based on filtered `.fam` file & QC

In [None]:
# Run PLINK once per chromosome: keep only the filtered individuals, attach the
# recoded phenotype, apply QC thresholds, and write a new binary fileset.
# Failures are recorded instead of being silently ignored; remaining
# chromosomes are still processed.
failed_chromosomes = []
for chrom in chromosomes:
    print(f"==== Filtering {chrom}... ====")
    plink_filter_cmd = [
        PLINK_PATH, '--bfile', f'{chr_plink_path}{chrom}',
        '--keep', filtered_fam,       # individuals to retain (FID/IID taken from the filtered .fam)
        '--pheno', filtered_pheno,    # phenotype values recoded as 1/2
        '--geno', '0.05',             # drop variants with missing call rate above 5%
        '--hwe', '1e-6',              # drop variants with HWE test p-value below 1e-6
        '--maf', '0.01',              # drop variants with minor allele frequency below 1%
        '--allow-no-sex',
        '--make-bed',
        '--out', f'{filtered_chr_plink_path}{chrom}'
    ]
    completed = subprocess.run(plink_filter_cmd)
    if completed.returncode != 0:
        print(f"PLINK exited with code {completed.returncode} for chromosome {chrom}")
        failed_chromosomes.append(chrom)

if failed_chromosomes:
    print(f"PLINK failed for chromosomes: {failed_chromosomes}")