In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
# Locate the simulated chr20 data, then load the predictions and the labels.
data_dir = '/scratch/users/magu/deepmix/data/simulated_chr20/'

# Window-level prediction table (.msp.tsv). The first line of the file is
# skipped, so the second line supplies the column header.
yhat_raw = pd.read_table(
    data_dir + 'vcf/rf_out/dev_10gen.no_OCE_WAS.query.msp.tsv',
    skiprows=1,
)

# Label archive; later cells read its 'S', 'V' and 'L' entries.
y = np.load(data_dir + 'label/dev_10gen.no_OCE_WAS.result.npz')

In [4]:
# match samples: rewrite the '_S1'/'_S2' suffixes of the label archive's sample
# names to '.0'/'.1' so they can be compared with the rfmix column names
S=np.array([s.replace('_S1','.0').replace('_S2','.1') for s in y['S']])

# sample columns of the rfmix table start at column 6
query_samples=np.asarray(yhat_raw.columns[6:])
# only compare elementwise when the lengths agree; otherwise nothing matches
n_matched=np.sum(S==query_samples) if S.shape==query_samples.shape else 0
print(S.shape, n_matched)

# The window-expansion cell copies rfmix columns into yhat purely by position,
# so a different sample order would silently corrupt every downstream metric.
# Fail loudly here instead of relying on someone reading the printed counts.
if n_matched!=S.shape[0]:
    raise ValueError(
        'sample mismatch: %d of %d label samples line up with the %d rfmix columns'
        % (n_matched, S.shape[0], query_samples.shape[0])
    )

(200,) 200


In [5]:
# expand rfmix windows into variant-level assignments
variants=y['V']                      # read the variant table once and reuse it
V_pos=variants[:,1].astype(int)      # column 1 is used as the variant position

# Pull the window table into plain arrays once, rather than slicing the
# DataFrame row by row inside the loop.
win_start=yhat_raw.iloc[:,1].values
win_end=yhat_raw.iloc[:,2].values
win_calls=yhat_raw.iloc[:,6:].values.astype(int)+1   # same +1 label shift as before

# Integer result buffer (variants x samples), plus a mask recording which
# variants were covered by at least one window.
calls=np.zeros((V_pos.shape[0], len(S)), dtype=int)
covered=np.zeros(V_pos.shape[0], dtype=bool)

# Windows are applied in file order, so where two windows share a boundary
# position the later window wins, exactly as in a row-by-row assignment.
for ix in range(win_calls.shape[0]):
    ids=(win_start[ix] <= V_pos) & (V_pos <= win_end[ix])
    calls[ids]=win_calls[ix]         # broadcast one window's calls over its variants
    covered|=ids

# A variant outside every window has no prediction. Report that here with a
# clear message rather than letting it surface later as a failed int cast.
if not covered.all():
    raise ValueError(
        '%d of %d variants fall outside every rfmix window'
        % ((~covered).sum(), covered.shape[0])
    )

yhat=pd.DataFrame(calls, index=['_'.join(s) for s in variants], columns=S)

In [7]:
# Confusion matrix of true vs. predicted labels. The truth array is flattened
# as-is; yhat (variants x samples) is transposed first so its flattened order
# lines up with the truth array -- presumably samples x variants, TODO confirm.
cm = confusion_matrix(
    y_true=y['L'].flatten(),
    y_pred=yhat.T.values.flatten().astype(int),
)

In [16]:
# Overall accuracy: the diagonal of cm (correct calls) as a share of all entries.
np.trace(cm) / cm.sum()

0.9797275934063595

In [17]:
# cm was built with the truth as its first argument, so rows are the true
# label and columns are the predicted label.
# Sensitivity (row-normalised): each row is divided by its total, so the
# diagonal is the fraction of true A that was called A_hat.
anc_label = ['AFR', 'EAS', 'EUR', 'NAT', 'SAS']
(
    pd.DataFrame(cm, index=anc_label, columns=anc_label)
    .div(cm.sum(axis=1), axis=0)
)

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,0.992882,0.003226,0.002506,0.000625,0.000762
EAS,0.001376,0.981706,0.010637,0.002008,0.004273
EUR,0.001679,0.008379,0.975924,0.001055,0.012963
NAT,0.00225,0.008404,0.000906,0.988354,8.6e-05
SAS,0.001403,0.052143,0.029355,0.000711,0.916388


In [18]:
# Precision / positive predictive value (column-normalised): each column is
# divided by its total, so the diagonal is the fraction of calls A_hat that
# are truly A. (This is precision, not specificity.)
(
    pd.DataFrame(cm, index=anc_label, columns=anc_label)
    .div(cm.sum(axis=0), axis=1)
)

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,0.996451,0.002431,0.005608,0.002275,0.003039
EAS,0.001834,0.982865,0.031624,0.009713,0.022635
EUR,0.000729,0.002732,0.945099,0.001662,0.022367
NAT,0.000619,0.001735,0.000555,0.985677,9.4e-05
SAS,0.000367,0.010237,0.017114,0.000674,0.951865


In [20]:
# Raw confusion counts, labelled on both axes with anc_label.
pd.DataFrame(
    cm,
    index=anc_label,
    columns=anc_label,
)

Unnamed: 0,AFR,EAS,EUR,NAT,SAS
AFR,31140268,101173,78588,19595,23903
EAS,57323,40902264,443180,83667,178036
EUR,22787,113708,13244516,14317,175927
NAT,19332,72203,7782,8490966,738
SAS,11459,426008,239827,5807,7486826
