In [1]:
import numpy as np
import pandas as pd

## Load the sample list (with haplotype code appended)

In [2]:
# Path to the sample list: a plain-text file that is read one entry per line.
sample_list_path = "sample_list.txt"

# Read inside a `with` block so the file handle is closed as soon as the list
# is built (a bare open(...).readlines() leaves the handle open until it is
# garbage-collected). Each entry has surrounding whitespace, including the
# trailing newline, stripped.
with open(sample_list_path, "r") as sample_file:
    sample_list = [line.strip() for line in sample_file]

## Load the sample table, which contains population and superpopulation info

In [3]:
# Location of the tab-separated sample metadata table.
sample_table_path = "1kg_sample_table.tsv"

# Parse the TSV into a DataFrame, then preview its first rows as the cell output.
sample_table = pd.read_csv(sample_table_path, sep="\t")
sample_table.head()

Unnamed: 0,Sample name,Sex,Biosample ID,Population code,Population name,Superpopulation code,Superpopulation name,Population elastic ID,Data collections
0,HG00271,male,SAME123417,FIN,Finnish,EUR,European Ancestry,FIN,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
1,HG00276,female,SAME123424,FIN,Finnish,EUR,European Ancestry,FIN,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
2,HG00288,female,SAME1839246,FIN,Finnish,EUR,European Ancestry,FIN,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
3,HG00290,male,SAME1839057,FIN,Finnish,EUR,European Ancestry,FIN,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
4,HG00308,male,SAME124161,FIN,Finnish,EUR,European Ancestry,FIN,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."


## Remove AFR samples from the table

In [4]:
# Boolean mask: True for every row whose superpopulation code is not "AFR".
is_not_afr = sample_table["Superpopulation code"].ne("AFR")

# Select only the rows flagged by the mask; sample_table itself is not modified.
no_afr_table = sample_table.loc[is_not_afr]


## Remove AFR samples from the sample list

In [5]:
# Names of all samples remaining in the filtered table, held in a set so the
# membership test below is a constant-time lookup.
keep_samples = {name for name in no_afr_table["Sample name"].values}

# Keep a sample-list entry when the text before its first underscore is one of
# the retained sample names (presumably entries look like "<sample>_<haplotype>";
# confirm against sample_list.txt).
filtered_list = [
    entry
    for entry in sample_list
    if entry.partition("_")[0] in keep_samples
]

## Sanity checks

In [6]:
# Report half the length of the filtered and of the original sample list,
# one value per line.
print(
    f"{len(filtered_list)//2 = }",
    f"{len(sample_list)//2 = }",
    sep="\n",
)

len(filtered_list)//2 = 2309
len(sample_list)//2 = 3202


In [7]:
# Report the row counts of the filtered and of the original sample table,
# one value per line.
print(
    f"{len(no_afr_table) = }",
    f"{len(sample_table) = }",
    sep="\n",
)

len(no_afr_table) = 2309
len(sample_table) = 3202


## Save to disk

In [9]:
# Write the retained sample-list entries to disk, one entry per line.
with open("no_afr_sample_list.txt", "w") as out_file:
    out_file.writelines(f"{sample}\n" for sample in filtered_list)

In [None]:
# Save the filtered table as TSV. index=False keeps the DataFrame's row index
# out of the file: the table was loaded without an index column (the source
# file's leading column is already present as the "Unnamed: 0" column), so
# writing the index too would add a second, redundant unnamed leading column.
no_afr_table.to_csv("no_afr_sample_table.tsv", sep="\t", index=False)