In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
# Locate the simulated chr20 dataset and load the two inputs used below.
data_dir='/scratch/users/magu/deepmix/data/simulated_chr20/'
msp_path=data_dir+'vcf/rf_out/dev_10gen.no_OCE_WAS.query.msp.tsv'
label_path=data_dir+'label/dev_10gen.no_OCE_WAS.result.npz'

# Window-level predictions: the file's first line is skipped, so its second line
# supplies the column names.
yhat_raw=pd.read_table(msp_path, skiprows=1)
# Label archive; later cells read its 'S', 'V' and 'L' entries.
y=np.load(label_path)

In [3]:
# match samples
# Rewrite the label archive's haplotype suffixes (_S1/_S2) into the .0/.1 form so the
# names can be compared with the prediction columns of yhat_raw (column 7 onwards).
S=np.array([s.replace('_S1','.0').replace('_S2','.1') for s in y['S']])
pred_samples=yhat_raw.columns[6:].to_numpy()

# Everything downstream pairs truth and prediction by position, so a length or order
# mismatch must stop the notebook instead of only showing up as a smaller count.
if S.shape != pred_samples.shape:
    raise ValueError(
        'label file has %d haplotypes but predictions have %d'
        % (S.size, pred_samples.size)
    )

n_match=int(np.sum(S==pred_samples))
print(S.shape, n_match)

if n_match != S.size:
    raise ValueError(
        'only %d of %d haplotype names line up between labels and predictions'
        % (n_match, S.size)
    )

(200,) 200


In [4]:
# expand rfmix windows into variant-level assignments
#
# Result: yhat, a (variants x haplotypes) integer DataFrame indexed by the '_'-joined
# fields of each y['V'] row, with one column per entry of S.
V=y['V']  # np.load on an .npz is lazy, so read the variant table once and reuse it
V_pos=V[:,1].astype(int)

# Window bounds (2nd and 3rd columns) and per-haplotype calls (7th column onwards) as
# plain arrays, so the loop does no per-row pandas indexing.
win_start=yhat_raw.iloc[:,1].to_numpy()
win_end=yhat_raw.iloc[:,2].to_numpy()
# +1 shifts the window calls, exactly as before -- presumably to line them up with the
# coding used in y['L']; TODO confirm.
win_calls=yhat_raw.iloc[:,6:].to_numpy().astype(int)+1

calls=np.zeros((V_pos.size, win_calls.shape[1]), dtype=int)
covered=np.zeros(V_pos.size, dtype=bool)
for ix in range(win_calls.shape[0]):
    # Both window ends are inclusive; a variant sitting on a shared boundary takes the
    # call of the later window, because windows are applied in file order.
    ids=(win_start[ix] <= V_pos) & (V_pos <= win_end[ix])
    # Broadcasting copies the window's single row of calls onto every selected variant.
    # A window that contains no variants selects nothing and is simply skipped.
    calls[ids]=win_calls[ix]
    covered|=ids

# A variant outside every window has no prediction. Stop here with a clear message
# rather than letting a placeholder value leak into the confusion matrix.
if not covered.all():
    raise ValueError(
        '%d of %d variants are not covered by any window'
        % (int((~covered).sum()), covered.size)
    )

yhat=pd.DataFrame(calls, index=['_'.join(s) for s in V], columns=S)

In [5]:
# confusion matrix over every (haplotype, variant) call
# yhat is variants x haplotypes, so it is transposed before flattening.
# Assumes y['L'] is laid out haplotypes x variants in the same order -- TODO confirm.
truth=y['L'].flatten()
pred=yhat.T.values.flatten().astype(int)
cm=confusion_matrix(truth, pred)

In [6]:
# accuracy: calls on the diagonal (truth == prediction) over all calls
np.trace(cm)/cm.sum()

0.9748963527547354

In [7]:
# cm rows are the true ancestry, columns the predicted ancestry.
# Sensitivity / recall: each row divided by its total, so the diagonal is the fraction
# of true A that was called A_hat.
# Assumes the class order inside cm matches anc_label -- TODO confirm.
anc_label=['AFR', 'EAS', 'EUR', 'NAT', 'SAS']
cm_by_truth=pd.DataFrame(cm, index=anc_label, columns=anc_label)
cm_by_truth.div(cm.sum(axis=1), axis=0)

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,0.989071,0.003264,0.004964,0.000494,0.002206
EAS,0.001201,0.98015,0.011728,0.002715,0.004206
EUR,0.001459,0.008843,0.965348,0.004331,0.020018
NAT,0.00226,0.011493,0.000906,0.985335,6e-06
SAS,0.000932,0.052717,0.042211,0.005568,0.898573


In [8]:
# Precision / positive predictive value (this is not specificity): each column divided
# by its total, so the diagonal is the fraction of calls A_hat that are truly A.
cm_by_call=pd.DataFrame(cm, index=anc_label, columns=anc_label)
cm_by_call.div(cm.sum(axis=0), axis=1)

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,0.996887,0.002462,0.011044,0.001782,0.008804
EAS,0.001608,0.981923,0.034661,0.013006,0.022301
EUR,0.000637,0.002885,0.929281,0.006758,0.034576
NAT,0.000624,0.002374,0.000552,0.973224,6e-06
SAS,0.000245,0.010356,0.024462,0.00523,0.934313


In [9]:
# raw call counts (rows: true ancestry, columns: predicted ancestry)
cm_counts=pd.DataFrame(cm, index=anc_label, columns=anc_label)
cm_counts

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,31020763,102383,155704,15500,69177
EAS,50045,40837419,488652,113126,175228
EUR,19807,120005,13100986,58782,271675
NAT,19415,98739,7782,8465037,48
SAS,7614,430690,344859,45487,7341277
