In [1]:
import numpy as np
import pandas as pd

In [2]:
# Without loss of generality, use Chromosome 21 Sample IDs
#S=pd.read_csv('/scratch/users/magu/deepmix/data/ref_panel_plus_two.txt', 
#              header=None, sep='\t', dtype=str).iloc[:,1].tolist()
S=np.load('/home/magu/deepmix/data/reference_panel/panel_chr21.npz')['S']
#S=np.load('/scratch/users/magu/deepmix/data/panel_chr2.npz')['S']
S=list(set([i[:-3] for i in list(S)])) # remove _S1 _S2 strand encoding
print(len(S))
S[:5]

1382


['HGDP01196', 'HGDP01198', 'HG02854', 'NA18740', 'HG00622']

In [3]:
# Load demographic info
D=pd.read_csv('reference_panel_metadata_w_qs.tsv', sep='\t', index_col='Sample').loc[S,:]
D.columns

Index(['Population code', 'Population', 'Superpopulation code',
       'Superpopulation', 'Source', 'Latitude', 'Longitude', 'Region',
       'Sample Alias', 'Country', 'Town', 'Single_Ancestry', 'k7_EUR',
       'k7_AHG', 'k7_WAS', 'k7_NAT', 'k7_EAS', 'k7_SAS', 'k7_AFR', 'k8_EUR',
       'k8_AFR', 'k8_NAT', 'k8_EAS', 'k8_SAS', 'k8_AHG', 'k8_OCE', 'k8_WAS',
       'k6_AHG', 'k6_EAS', 'k6_NAT', 'k6_AFR', 'k6_EUR', 'k6_SAS', 'Panel'],
      dtype='object')

In [4]:
# Load 1KG family information 
F=pd.read_csv('20130606_g1k.ped.txt', sep='\t', index_col='Individual ID')

# check that everyone has a family
print(F['Family ID'].isnull().sum())
F.head()

0


Unnamed: 0_level_0,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Other Comments
Individual ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
HG01879,BB01,0,0,1,0,ACB,father,0,0,0,0
HG01880,BB01,0,0,2,0,ACB,mother,0,0,0,0
HG01881,BB01,HG01879,HG01880,2,0,ACB,child,0,0,0,0
HG01882,BB02,0,0,1,0,ACB,father,0,0,0,0
HG01883,BB02,0,0,2,0,ACB,mother,0,0,0,0


In [5]:
# Join demographic and family info
X=pd.merge(D, F, left_index=True, right_index=True, how='outer')

# S/HGDP samples don't have FID, so use IID
X['Family ID']=X['Family ID'].fillna(pd.Series(X.index)) 

# check that this worked
print(X['Family ID'].isnull().sum())
X.shape

298


(3799, 45)

In [6]:
# filter down to panel individuals and store the old table
XX=X
X=X.loc[S,:]
X.shape

(1382, 45)

In [7]:
# drop African Hunter-Gatherers (AHG)
X=X[X['Panel']!='AHG']
X.shape

(1361, 45)

In [8]:
# Display population counts
X['Panel'].value_counts()

EAS    494
AFR    382
SAS    171
EUR    155
NAT     75
WAS     66
OCE     16
Name: Panel, dtype: int64

In [9]:
# look at relatedness info
X['Relationship'].value_counts()

unrel                      392
father                     304
mother                     293
mat grandfather             17
mat grandmother             17
pat grandmother             15
child                       13
unrels                      13
pat grandfather             13
not father                   2
pat grandmother; mother      1
husband of Child             1
wife of child                1
mother; child                1
Child                        1
Name: Relationship, dtype: int64

In [10]:
# Check for family relationships (i.e. look for children)
for fid in X['Family ID'].value_counts().index[X['Family ID'].value_counts()>1]:
    if any(['hild' in x for x in X.loc[X['Family ID']==fid,'Relationship']]):
        print(X.loc[X['Family ID']==fid,'Relationship'])

NA12890    pat grandmother; mother
NA12878              mother; child
Name: Relationship, dtype: object


In [11]:
# Assume parental sets are unrelated, so just remove NA12890
X=X.drop('NA12890')
print(X.shape)
X['Panel'].value_counts()

(1360, 45)


EAS    494
AFR    382
SAS    171
EUR    154
NAT     75
WAS     66
OCE     16
Name: Panel, dtype: int64

In [12]:
# Check for siblings
rels=['Siblings', 'Second Order', 'Third Order']

sibs=set()
for rel in rels:
    for inds in set(X[[rel]].to_numpy().flatten().tolist()):
        if str(inds) in ['0','nan']:
            continue
        for ind in inds.split(','):
            if ind in X.index:
                sibs.add(ind)
    print(rel+': '+str(len(sibs)))

X.loc[list(sibs),rels+['Panel','Population code']].sort_values('Population code')

Siblings: 0
Second Order: 0
Third Order: 15


Unnamed: 0,Siblings,Second Order,Third Order,Panel,Population code
HG03352,0,0,"HG03366,HG03343",AFR,ESN
HG03366,0,0,HG03352,AFR,ESN
HG03343,0,0,HG03352,AFR,ESN
HG03372,0,0,HG03301,AFR,ESN
HG03268,0,0,HG03271,AFR,ESN
HG03301,0,0,HG03372,AFR,ESN
HG03271,0,0,HG03268,AFR,ESN
HG02624,0,0,0,AFR,GWD
HG02666,0,0,"HG02666,HG02624",AFR,GWD
HG03469,0,0,0,AFR,MSL


In [13]:
# remove all third cousins
X=X.drop(sibs)
print(X.shape)
X['Panel'].value_counts()

(1345, 45)


EAS    494
AFR    367
SAS    171
EUR    154
NAT     75
WAS     66
OCE     16
Name: Panel, dtype: int64

In [14]:
# how many of each should we get if sampling randomly
(X['Panel'].value_counts()*(200./X.shape[0])).astype(int)

EAS    73
AFR    54
SAS    25
EUR    22
NAT    11
WAS     9
OCE     2
Name: Panel, dtype: int64

In [15]:
# select test set individuals with these sizes
pop_n={'OCE':4, 'WAS':10,'NAT':16,'EUR':24,'SAS':24,'AFR':52,'EAS':70}
test=[]
for pop,n in pop_n.items():
    # reproducibly sample
    np.random.seed(10835412)
    test+=np.random.choice(X[X['Panel']==pop].index, replace=False, size=n).tolist()
X.loc[test,'Panel'].value_counts()

EAS    70
AFR    52
SAS    24
EUR    24
NAT    16
WAS    10
OCE     4
Name: Panel, dtype: int64

In [16]:
# pick a (smaller) dev set in the same way
dev=[]
for pop,n in pop_n.items():
    # reproducibly sample
    np.random.seed(10835412)
    dev+=np.random.choice(X[(X['Panel']==pop) & ~(X.index.isin(test))].index, 
                          replace=False, size=n//2).tolist()
X.loc[dev,'Panel'].value_counts()

EAS    35
AFR    26
SAS    12
EUR    12
NAT     8
WAS     5
OCE     2
Name: Panel, dtype: int64

In [17]:
# train set counts
train=X.index[~(X.index.isin(test) | X.index.isin(dev))].to_numpy()
X.loc[train,'Panel'].value_counts()

EAS    389
AFR    289
SAS    135
EUR    118
NAT     51
WAS     51
OCE     10
Name: Panel, dtype: int64

In [18]:
# write to file -- with a safety
if True: 
    for group,name in zip([train,dev,test],['train','dev','test']):
        group=list(sorted(list(group)))
        with open('split/'+name+'.txt','w') as o:
            o.write('\n'.join(group)+'\n')
        with open('split/'+name+'.strands.txt','w') as o:
            o.write('\n'.join([i+s for i in group for s in ['_S1','_S2']])+'\n')
        if name != 'train':
            with open('split/'+name+'.superpop.txt','w') as o:
                o.write('\n'.join(['\t'.join([i, X.loc[i,'Panel']]) for i in group])+'\n')

In [19]:
pd.concat({'Train':X.loc[train,'Panel'].value_counts(), 
           'Dev':X.loc[dev,'Panel'].value_counts(), 
           'Test':X.loc[test,'Panel'].value_counts()}, axis=1)

Unnamed: 0,Train,Dev,Test
EAS,389,35,70
AFR,289,26,52
SAS,135,12,24
EUR,118,12,24
NAT,51,8,16
WAS,51,5,10
OCE,10,2,4
