In [39]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [40]:
import sys

sys.path.append('../../fastai/')

from fastai.structured import *
from fastai.column_data import *
import pandas as pd
from torch.nn import functional as F

In [41]:
pd.set_option('display.max_columns', None)

In [42]:
class MixedInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                 y_range=None, use_bn=False):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont
        
        szs = [n_emb+n_cont] + szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn,self.y_range = use_bn,y_range

    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu6(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        if self.y_range:
            x = F.log_softmax(x) # F.sigmoid(x) for binary classification
            x = x*(self.y_range[1] - self.y_range[0])
            x = x+self.y_range[0]
        return x

In [43]:
class ColumnarDataset(Dataset):
    def __init__(self, cats, conts, y):
        n = len(cats[0]) if cats else len(conts[0])
        self.cats = np.stack(cats, 1).astype(np.int64) if cats else np.zeros((n,1))
        self.conts = np.stack(conts, 1).astype(np.float32) if conts else np.zeros((n,1))
        self.y = np.zeros((n,1)) if y is None else y.values # THIS LINE IS CHANGED FROM y[:, None]
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

    @classmethod
    def from_data_frames(cls, df_cat, df_cont, y=None):
        cat_cols = [c.values for n,c in df_cat.items()]
        cont_cols = [c.values for n,c in df_cont.items()]
        return cls(cat_cols, cont_cols, y)

    @classmethod
    def from_data_frame(cls, df, cat_flds, y=None):
        return cls.from_data_frames(df[cat_flds], df.drop(cat_flds, axis=1), y)

In [44]:
class ColumnarModelData(ModelData):
    def __init__(self, path, trn_ds, val_ds, bs, test_ds=None, shuffle=True):
        test_dl = DataLoader(test_ds, bs, shuffle=False, num_workers=1) if test_ds is not None else None
        super().__init__(path, DataLoader(trn_ds, bs, shuffle=shuffle, num_workers=1),
            DataLoader(val_ds, bs*2, shuffle=False, num_workers=1), test_dl)

    @classmethod
    def from_arrays(cls, path, val_idxs, xs, y, bs=64, test_xs=None, shuffle=True):
        ((val_xs, trn_xs), (val_y, trn_y)) = split_by_idx(val_idxs, xs, y)
        test_ds = PassthruDataset(*(test_xs.T), [0] * len(test_xs)) if test_xs is not None else None
        return cls(path, PassthruDataset(*(trn_xs.T), trn_y), PassthruDataset(*(val_xs.T), val_y),
                   bs=bs, shuffle=shuffle, test_ds=test_ds)

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df=None):
        test_ds = ColumnarDataset.from_data_frame(test_df, cat_flds) if test_df is not None else None
        return cls(path, ColumnarDataset.from_data_frame(trn_df, cat_flds, trn_y),
                    ColumnarDataset.from_data_frame(val_df, cat_flds, val_y), bs, test_ds=test_ds)

    @classmethod
    def from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs, test_df=None):
        ((val_df, trn_df), (val_y, trn_y)) = split_by_idx(val_idxs, df, y)
        return cls.from_data_frames(path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df=test_df)

    def get_learner(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                    y_range=None, use_bn=False, **kwargs):
        model = MixedInputModel(emb_szs, n_cont, emb_drop, out_sz, szs, drops, y_range, use_bn)
        return StructuredLearner(self, StructuredModel(to_gpu(model)), opt_fn=optim.Adam, **kwargs)

In [45]:
PATH = 'data/biaobin/'
data_txt = f'{PATH}/mmm_p15_m8.txt'

In [46]:
df = pd.read_table(data_txt)

In [47]:
df.head()

Unnamed: 0,Lung,Breast,Colon,Central_Nervous_System,Pancreas,Ovary,Prostate,Uterus,Kidney,Head_and_Neck,Stomach,Bladder,Liver,Skin,Thyroid,mut_ABL1,mut_AKT1,mut_AKT2,mut_AKT3,mut_ALK,mut_ALOX12B,mut_AMER1,mut_APC,mut_AR,mut_ARAF,mut_ARID1A,mut_ARID2,mut_ASXL1,mut_ATM,mut_ATR,mut_ATRX,mut_AURKA,mut_AURKB,mut_AXL,mut_BAP1,mut_BARD1,mut_BCL2,mut_BCL6,mut_BCOR,mut_BLM,mut_BRAF,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_BTK,mut_CARD11,mut_CASP8,mut_CBFB,mut_CBL,mut_CCND1,mut_CCND2,mut_CCND3,mut_CCNE1,mut_CD79A,mut_CD79B,mut_CDC73,mut_CDH1,mut_CDK12,mut_CDK4,mut_CDK6,mut_CDK8,mut_CDKN1B,mut_CDKN2A,mut_CDKN2B,mut_CDKN2C,mut_CEBPA,mut_CHEK1,mut_CHEK2,mut_CIC,mut_CREBBP,mut_CRKL,mut_CRLF2,mut_CSF1R,mut_CTCF,mut_CTNNB1,mut_DAXX,mut_DDR2,mut_DIS3,mut_DNMT3A,mut_DOT1L,mut_EGFR,mut_EP300,mut_EPHA3,mut_EPHA5,mut_EPHB1,mut_ERBB2,mut_ERBB3,mut_ERBB4,mut_ERG,mut_ESR1,mut_EZH2,mut_FAM46C,mut_FANCA,mut_FANCC,mut_FBXW7,mut_FGF19,mut_FGF3,mut_FGF4,mut_FGFR1,mut_FGFR2,mut_FGFR3,mut_FGFR4,mut_FIP1L1,mut_FLT1,mut_FLT3,mut_FLT4,mut_FOXL2,mut_GATA1,mut_GATA2,mut_GATA3,mut_GNA11,mut_GNAQ,mut_GNAS,mut_GRIN2A,mut_GSK3B,mut_HGF,mut_HLA_A,mut_HRAS,mut_IDH1,mut_IDH2,mut_IGF1,mut_IGF1R,mut_IGF2,mut_IKBKE,mut_IKZF1,mut_IL7R,mut_INHBA,mut_INSRR,mut_IRF4,mut_IRS2,mut_JAK1,mut_JAK2,mut_JAK3,mut_JUN,mut_KDM5A,mut_KDM5C,mut_KDM6A,mut_KDR,mut_KEAP1,mut_KIT,mut_KMT2A,mut_KMT2D,mut_KRAS,mut_LMO1,mut_MAP2K1,mut_MAP2K2,mut_MAP2K4,mut_MAP3K1,mut_MAP3K13,mut_MCL1,mut_MDM2,mut_MDM4,mut_MED12,mut_MEF2B,mut_MEN1,mut_MET,mut_MITF,mut_MLH1,mut_MPL,mut_MRE11A,mut_MSH2,mut_MSH6,mut_MTOR,mut_MUTYH,mut_MYC,mut_MYCL,mut_MYCN,mut_MYD88,mut_NBN,mut_NCOR1,mut_NF1,mut_NF2,mut_NFE2L2,mut_NFKBIA,mut_NKX2_1,mut_NOTCH1,mut_NOTCH2,mut_NOTCH3,mut_NOTCH4,mut_NPM1,mut_NRAS,mut_NSD1,mut_NTRK1,mut_NTRK2,mut_NTRK3,mut_NUP93,mut_PAK7,mut_PALB2,mut_PARP1,mut_PAX5,mut_PBRM1,mut_PDGFRA,mut_PDGFRB,mut_PDPK1,mut_PIK3C2G,mut_PIK3C3,mut_PIK3CA,mut_PIK3CG,mut_PIK3R1,mut_PIK3R2,mut_PMS2,mut_PNRC1,mut_PPP2R1A,mut_PRDM1,mut_PRKAR1A,mut_PTCH1,mut_PTEN,mut_PTPN11,mut_RAD50,mut_RAD51,mut_RAD51B,mut_RAD51C,mut_RAD51D,mut_RAD52,mut_RAD54L,mut_RAF1,mut_RARA,mut_RB1,mut_REL,mut_RET,mut_RICTOR,mut_RNF43,mut_RPTOR,mut_RUNX1,mut_SETD2,mut_SF3B1,mut_SH2B3,mut_SMAD2,mut_SMAD4,mut_SMARCA4,mut_SMARCB1,mut_SMARCD1,mut_SMO,mut_SOCS1,mut_SOX2,mut_SPEN,mut_SPOP,mut_SRC,mut_STAG2,mut_STK11,mut_SUFU,mut_SYK,mut_TBX3,mut_TET2,mut_TGFBR2,mut_TNFAIP3,mut_TNFRSF14,mut_TOP1,mut_TP53,mut_TSC1,mut_TSC2,mut_TSHR,mut_VHL,mut_WT1,mut_XPO1,cna_ABL1,cna_AKT1,cna_AKT2,cna_AKT3,cna_ALK,cna_ALOX12B,cna_AMER1,cna_APC,cna_AR,cna_ARAF,cna_ARID1A,cna_ARID2,cna_ASXL1,cna_ATM,cna_ATR,cna_ATRX,cna_AURKA,cna_AURKB,cna_AXL,cna_BAP1,cna_BARD1,cna_BCL2,cna_BCL6,cna_BCOR,cna_BLM,cna_BRAF,cna_BRCA1,cna_BRCA2,cna_BRIP1,cna_BTK,cna_CARD11,cna_CASP8,cna_CBFB,cna_CBL,cna_CCND1,cna_CCND2,cna_CCND3,cna_CCNE1,cna_CD79A,cna_CD79B,cna_CDC73,cna_CDH1,cna_CDK12,cna_CDK4,cna_CDK6,cna_CDK8,cna_CDKN1B,cna_CDKN2A,cna_CDKN2B,cna_CDKN2C,cna_CEBPA,cna_CHEK1,cna_CHEK2,cna_CIC,cna_CREBBP,cna_CRKL,cna_CRLF2,cna_CSF1R,cna_CTCF,cna_CTNNB1,cna_DAXX,cna_DDR2,cna_DIS3,cna_DNMT3A,cna_DOT1L,cna_EGFR,cna_EP300,cna_EPHA3,cna_EPHA5,cna_EPHB1,cna_ERBB2,cna_ERBB3,cna_ERBB4,cna_ERG,cna_ESR1,cna_EZH2,cna_FANCA,cna_FANCC,cna_FBXW7,cna_FGF19,cna_FGF3,cna_FGF4,cna_FGFR1,cna_FGFR2,cna_FGFR3,cna_FGFR4,cna_FLT1,cna_FLT3,cna_FLT4,cna_FOXL2,cna_GATA1,cna_GATA2,cna_GATA3,cna_GNA11,cna_GNAQ,cna_GNAS,cna_GRIN2A,cna_GSK3B,cna_HGF,cna_HLA_A,cna_HRAS,cna_IDH1,cna_IDH2,cna_IGF1,cna_IGF1R,cna_IGF2,cna_IKBKE,cna_IKZF1,cna_IL7R,cna_INHA,cna_INHBA,cna_IRF4,cna_IRS2,cna_JAK1,cna_JAK2,cna_JAK3,cna_JUN,cna_KDM5A,cna_KDM5C,cna_KDM6A,cna_KDR,cna_KEAP1,cna_KIT,cna_KMT2A,cna_KMT2D,cna_KRAS,cna_LMO1,cna_MAP2K1,cna_MAP2K2,cna_MAP2K4,cna_MAP3K1,cna_MAP3K13,cna_MCL1,cna_MDM2,cna_MDM4,cna_MED12,cna_MEF2B,cna_MEN1,cna_MET,cna_MITF,cna_MLH1,cna_MPL,cna_MRE11A,cna_MSH2,cna_MSH6,cna_MTOR,cna_MUTYH,cna_MYC,cna_MYCL,cna_MYCN,cna_MYD88,cna_NBN,cna_NCOR1,cna_NF1,cna_NF2,cna_NFE2L2,cna_NFKBIA,cna_NKX2_1,cna_NOTCH1,cna_NOTCH2,cna_NOTCH3,cna_NOTCH4,cna_NPM1,cna_NRAS,cna_NSD1,cna_NTRK1,cna_NTRK2,cna_NTRK3,cna_NUP93,cna_PAK7,cna_PALB2,cna_PARP1,cna_PAX5,cna_PBRM1,cna_PDGFRA,cna_PDGFRB,cna_PDPK1,cna_PIK3C2G,cna_PIK3C3,cna_PIK3CA,cna_PIK3CG,cna_PIK3R1,cna_PIK3R2,cna_PMS2,cna_PNRC1,cna_PPP2R1A,cna_PRDM1,cna_PRKAR1A,cna_PTCH1,cna_PTEN,cna_PTPN11,cna_RAD50,cna_RAD51,cna_RAD51B,cna_RAD51C,cna_RAD51D,cna_RAD52,cna_RAD54L,cna_RAF1,cna_RARA,cna_RB1,cna_REL,cna_RET,cna_RICTOR,cna_RNF43,cna_RPTOR,cna_RUNX1,cna_SETD2,cna_SF3B1,cna_SH2B3,cna_SMAD2,cna_SMAD4,cna_SMARCA4,cna_SMARCB1,cna_SMARCD1,cna_SMO,cna_SOX2,cna_SPEN,cna_SPOP,cna_SRC,cna_STAG2,cna_STK11,cna_SUFU,cna_SYK,cna_TBX3,cna_TET2,cna_TGFBR2,cna_TNFAIP3,cna_TNFRSF14,cna_TOP1,cna_TP53,cna_TSC1,cna_TSC2,cna_TSHR,cna_VHL,cna_WT1,cna_XPO1,CIN,destination
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7


In [48]:
# try dropping some columns
#combined.drop(combined.iloc[:, 30:460], inplace=True, axis=1)
#combined.head()
df = df.drop(columns=['CIN'])

# Abodomen
# df = df[df['destination'] != 1]
# Bone
# df = df[df['destination'] != 2]
# Central Nervous System
# df = df[df['destination'] != 3]
# Chest
# df = df[df['destination'] != 4]
# Liver
# df = df[df['destination'] != 5]
# Lung
# df = df[df['destination'] != 6]
# Lymph Node
# df = df[df['destination'] != 7]
# Soft Tissue
# df = df[df['destination'] != 8]

target = 'Lung'
# +++ only use Lung data
df = df[df[target] == 1]
df.drop(df.iloc[:,0:16], inplace=True, axis=1)
# --- only use Lung data
df.head()

Unnamed: 0,mut_AKT1,mut_AKT2,mut_AKT3,mut_ALK,mut_ALOX12B,mut_AMER1,mut_APC,mut_AR,mut_ARAF,mut_ARID1A,mut_ARID2,mut_ASXL1,mut_ATM,mut_ATR,mut_ATRX,mut_AURKA,mut_AURKB,mut_AXL,mut_BAP1,mut_BARD1,mut_BCL2,mut_BCL6,mut_BCOR,mut_BLM,mut_BRAF,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_BTK,mut_CARD11,mut_CASP8,mut_CBFB,mut_CBL,mut_CCND1,mut_CCND2,mut_CCND3,mut_CCNE1,mut_CD79A,mut_CD79B,mut_CDC73,mut_CDH1,mut_CDK12,mut_CDK4,mut_CDK6,mut_CDK8,mut_CDKN1B,mut_CDKN2A,mut_CDKN2B,mut_CDKN2C,mut_CEBPA,mut_CHEK1,mut_CHEK2,mut_CIC,mut_CREBBP,mut_CRKL,mut_CRLF2,mut_CSF1R,mut_CTCF,mut_CTNNB1,mut_DAXX,mut_DDR2,mut_DIS3,mut_DNMT3A,mut_DOT1L,mut_EGFR,mut_EP300,mut_EPHA3,mut_EPHA5,mut_EPHB1,mut_ERBB2,mut_ERBB3,mut_ERBB4,mut_ERG,mut_ESR1,mut_EZH2,mut_FAM46C,mut_FANCA,mut_FANCC,mut_FBXW7,mut_FGF19,mut_FGF3,mut_FGF4,mut_FGFR1,mut_FGFR2,mut_FGFR3,mut_FGFR4,mut_FIP1L1,mut_FLT1,mut_FLT3,mut_FLT4,mut_FOXL2,mut_GATA1,mut_GATA2,mut_GATA3,mut_GNA11,mut_GNAQ,mut_GNAS,mut_GRIN2A,mut_GSK3B,mut_HGF,mut_HLA_A,mut_HRAS,mut_IDH1,mut_IDH2,mut_IGF1,mut_IGF1R,mut_IGF2,mut_IKBKE,mut_IKZF1,mut_IL7R,mut_INHBA,mut_INSRR,mut_IRF4,mut_IRS2,mut_JAK1,mut_JAK2,mut_JAK3,mut_JUN,mut_KDM5A,mut_KDM5C,mut_KDM6A,mut_KDR,mut_KEAP1,mut_KIT,mut_KMT2A,mut_KMT2D,mut_KRAS,mut_LMO1,mut_MAP2K1,mut_MAP2K2,mut_MAP2K4,mut_MAP3K1,mut_MAP3K13,mut_MCL1,mut_MDM2,mut_MDM4,mut_MED12,mut_MEF2B,mut_MEN1,mut_MET,mut_MITF,mut_MLH1,mut_MPL,mut_MRE11A,mut_MSH2,mut_MSH6,mut_MTOR,mut_MUTYH,mut_MYC,mut_MYCL,mut_MYCN,mut_MYD88,mut_NBN,mut_NCOR1,mut_NF1,mut_NF2,mut_NFE2L2,mut_NFKBIA,mut_NKX2_1,mut_NOTCH1,mut_NOTCH2,mut_NOTCH3,mut_NOTCH4,mut_NPM1,mut_NRAS,mut_NSD1,mut_NTRK1,mut_NTRK2,mut_NTRK3,mut_NUP93,mut_PAK7,mut_PALB2,mut_PARP1,mut_PAX5,mut_PBRM1,mut_PDGFRA,mut_PDGFRB,mut_PDPK1,mut_PIK3C2G,mut_PIK3C3,mut_PIK3CA,mut_PIK3CG,mut_PIK3R1,mut_PIK3R2,mut_PMS2,mut_PNRC1,mut_PPP2R1A,mut_PRDM1,mut_PRKAR1A,mut_PTCH1,mut_PTEN,mut_PTPN11,mut_RAD50,mut_RAD51,mut_RAD51B,mut_RAD51C,mut_RAD51D,mut_RAD52,mut_RAD54L,mut_RAF1,mut_RARA,mut_RB1,mut_REL,mut_RET,mut_RICTOR,mut_RNF43,mut_RPTOR,mut_RUNX1,mut_SETD2,mut_SF3B1,mut_SH2B3,mut_SMAD2,mut_SMAD4,mut_SMARCA4,mut_SMARCB1,mut_SMARCD1,mut_SMO,mut_SOCS1,mut_SOX2,mut_SPEN,mut_SPOP,mut_SRC,mut_STAG2,mut_STK11,mut_SUFU,mut_SYK,mut_TBX3,mut_TET2,mut_TGFBR2,mut_TNFAIP3,mut_TNFRSF14,mut_TOP1,mut_TP53,mut_TSC1,mut_TSC2,mut_TSHR,mut_VHL,mut_WT1,mut_XPO1,cna_ABL1,cna_AKT1,cna_AKT2,cna_AKT3,cna_ALK,cna_ALOX12B,cna_AMER1,cna_APC,cna_AR,cna_ARAF,cna_ARID1A,cna_ARID2,cna_ASXL1,cna_ATM,cna_ATR,cna_ATRX,cna_AURKA,cna_AURKB,cna_AXL,cna_BAP1,cna_BARD1,cna_BCL2,cna_BCL6,cna_BCOR,cna_BLM,cna_BRAF,cna_BRCA1,cna_BRCA2,cna_BRIP1,cna_BTK,cna_CARD11,cna_CASP8,cna_CBFB,cna_CBL,cna_CCND1,cna_CCND2,cna_CCND3,cna_CCNE1,cna_CD79A,cna_CD79B,cna_CDC73,cna_CDH1,cna_CDK12,cna_CDK4,cna_CDK6,cna_CDK8,cna_CDKN1B,cna_CDKN2A,cna_CDKN2B,cna_CDKN2C,cna_CEBPA,cna_CHEK1,cna_CHEK2,cna_CIC,cna_CREBBP,cna_CRKL,cna_CRLF2,cna_CSF1R,cna_CTCF,cna_CTNNB1,cna_DAXX,cna_DDR2,cna_DIS3,cna_DNMT3A,cna_DOT1L,cna_EGFR,cna_EP300,cna_EPHA3,cna_EPHA5,cna_EPHB1,cna_ERBB2,cna_ERBB3,cna_ERBB4,cna_ERG,cna_ESR1,cna_EZH2,cna_FANCA,cna_FANCC,cna_FBXW7,cna_FGF19,cna_FGF3,cna_FGF4,cna_FGFR1,cna_FGFR2,cna_FGFR3,cna_FGFR4,cna_FLT1,cna_FLT3,cna_FLT4,cna_FOXL2,cna_GATA1,cna_GATA2,cna_GATA3,cna_GNA11,cna_GNAQ,cna_GNAS,cna_GRIN2A,cna_GSK3B,cna_HGF,cna_HLA_A,cna_HRAS,cna_IDH1,cna_IDH2,cna_IGF1,cna_IGF1R,cna_IGF2,cna_IKBKE,cna_IKZF1,cna_IL7R,cna_INHA,cna_INHBA,cna_IRF4,cna_IRS2,cna_JAK1,cna_JAK2,cna_JAK3,cna_JUN,cna_KDM5A,cna_KDM5C,cna_KDM6A,cna_KDR,cna_KEAP1,cna_KIT,cna_KMT2A,cna_KMT2D,cna_KRAS,cna_LMO1,cna_MAP2K1,cna_MAP2K2,cna_MAP2K4,cna_MAP3K1,cna_MAP3K13,cna_MCL1,cna_MDM2,cna_MDM4,cna_MED12,cna_MEF2B,cna_MEN1,cna_MET,cna_MITF,cna_MLH1,cna_MPL,cna_MRE11A,cna_MSH2,cna_MSH6,cna_MTOR,cna_MUTYH,cna_MYC,cna_MYCL,cna_MYCN,cna_MYD88,cna_NBN,cna_NCOR1,cna_NF1,cna_NF2,cna_NFE2L2,cna_NFKBIA,cna_NKX2_1,cna_NOTCH1,cna_NOTCH2,cna_NOTCH3,cna_NOTCH4,cna_NPM1,cna_NRAS,cna_NSD1,cna_NTRK1,cna_NTRK2,cna_NTRK3,cna_NUP93,cna_PAK7,cna_PALB2,cna_PARP1,cna_PAX5,cna_PBRM1,cna_PDGFRA,cna_PDGFRB,cna_PDPK1,cna_PIK3C2G,cna_PIK3C3,cna_PIK3CA,cna_PIK3CG,cna_PIK3R1,cna_PIK3R2,cna_PMS2,cna_PNRC1,cna_PPP2R1A,cna_PRDM1,cna_PRKAR1A,cna_PTCH1,cna_PTEN,cna_PTPN11,cna_RAD50,cna_RAD51,cna_RAD51B,cna_RAD51C,cna_RAD51D,cna_RAD52,cna_RAD54L,cna_RAF1,cna_RARA,cna_RB1,cna_REL,cna_RET,cna_RICTOR,cna_RNF43,cna_RPTOR,cna_RUNX1,cna_SETD2,cna_SF3B1,cna_SH2B3,cna_SMAD2,cna_SMAD4,cna_SMARCA4,cna_SMARCB1,cna_SMARCD1,cna_SMO,cna_SOX2,cna_SPEN,cna_SPOP,cna_SRC,cna_STAG2,cna_STK11,cna_SUFU,cna_SYK,cna_TBX3,cna_TET2,cna_TGFBR2,cna_TNFAIP3,cna_TNFRSF14,cna_TOP1,cna_TP53,cna_TSC1,cna_TSC2,cna_TSHR,cna_VHL,cna_WT1,cna_XPO1,destination
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7
29,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,7
38,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7
39,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7
46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4


In [49]:
df = df.sort_values(['destination'])
df.to_pickle(PATH + 'lung.pkl')

In [50]:
len(df[df['destination'] == 1])

40

In [51]:
len(df[df['destination'] == 2])

182

In [52]:
len(df[df['destination'] == 3])

261

In [53]:
len(df[df['destination'] == 4])

487

In [54]:
len(df[df['destination'] == 5])

329

In [55]:
len(df[df['destination'] == 6])

0

In [56]:
len(df[df['destination'] == 7])

778

In [57]:
len(df[df['destination'] == 8])

114

In [123]:
df_1 = df[df['mut_AKT1'] == 1]
df_2 = df[df['mut_AKT3'] == 1]

df_1.head()

Unnamed: 0,mut_AKT1,mut_AKT2,mut_AKT3,mut_ALK,mut_ALOX12B,mut_AMER1,mut_APC,mut_AR,mut_ARAF,mut_ARID1A,mut_ARID2,mut_ASXL1,mut_ATM,mut_ATR,mut_ATRX,mut_AURKA,mut_AURKB,mut_AXL,mut_BAP1,mut_BARD1,mut_BCL2,mut_BCL6,mut_BCOR,mut_BLM,mut_BRAF,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_BTK,mut_CARD11,mut_CASP8,mut_CBFB,mut_CBL,mut_CCND1,mut_CCND2,mut_CCND3,mut_CCNE1,mut_CD79A,mut_CD79B,mut_CDC73,mut_CDH1,mut_CDK12,mut_CDK4,mut_CDK6,mut_CDK8,mut_CDKN1B,mut_CDKN2A,mut_CDKN2B,mut_CDKN2C,mut_CEBPA,mut_CHEK1,mut_CHEK2,mut_CIC,mut_CREBBP,mut_CRKL,mut_CRLF2,mut_CSF1R,mut_CTCF,mut_CTNNB1,mut_DAXX,mut_DDR2,mut_DIS3,mut_DNMT3A,mut_DOT1L,mut_EGFR,mut_EP300,mut_EPHA3,mut_EPHA5,mut_EPHB1,mut_ERBB2,mut_ERBB3,mut_ERBB4,mut_ERG,mut_ESR1,mut_EZH2,mut_FAM46C,mut_FANCA,mut_FANCC,mut_FBXW7,mut_FGF19,mut_FGF3,mut_FGF4,mut_FGFR1,mut_FGFR2,mut_FGFR3,mut_FGFR4,mut_FIP1L1,mut_FLT1,mut_FLT3,mut_FLT4,mut_FOXL2,mut_GATA1,mut_GATA2,mut_GATA3,mut_GNA11,mut_GNAQ,mut_GNAS,mut_GRIN2A,mut_GSK3B,mut_HGF,mut_HLA_A,mut_HRAS,mut_IDH1,mut_IDH2,mut_IGF1,mut_IGF1R,mut_IGF2,mut_IKBKE,mut_IKZF1,mut_IL7R,mut_INHBA,mut_INSRR,mut_IRF4,mut_IRS2,mut_JAK1,mut_JAK2,mut_JAK3,mut_JUN,mut_KDM5A,mut_KDM5C,mut_KDM6A,mut_KDR,mut_KEAP1,mut_KIT,mut_KMT2A,mut_KMT2D,mut_KRAS,mut_LMO1,mut_MAP2K1,mut_MAP2K2,mut_MAP2K4,mut_MAP3K1,mut_MAP3K13,mut_MCL1,mut_MDM2,mut_MDM4,mut_MED12,mut_MEF2B,mut_MEN1,mut_MET,mut_MITF,mut_MLH1,mut_MPL,mut_MRE11A,mut_MSH2,mut_MSH6,mut_MTOR,mut_MUTYH,mut_MYC,mut_MYCL,mut_MYCN,mut_MYD88,mut_NBN,mut_NCOR1,mut_NF1,mut_NF2,mut_NFE2L2,mut_NFKBIA,mut_NKX2_1,mut_NOTCH1,mut_NOTCH2,mut_NOTCH3,mut_NOTCH4,mut_NPM1,mut_NRAS,mut_NSD1,mut_NTRK1,mut_NTRK2,mut_NTRK3,mut_NUP93,mut_PAK7,mut_PALB2,mut_PARP1,mut_PAX5,mut_PBRM1,mut_PDGFRA,mut_PDGFRB,mut_PDPK1,mut_PIK3C2G,mut_PIK3C3,mut_PIK3CA,mut_PIK3CG,mut_PIK3R1,mut_PIK3R2,mut_PMS2,mut_PNRC1,mut_PPP2R1A,mut_PRDM1,mut_PRKAR1A,mut_PTCH1,mut_PTEN,mut_PTPN11,mut_RAD50,mut_RAD51,mut_RAD51B,mut_RAD51C,mut_RAD51D,mut_RAD52,mut_RAD54L,mut_RAF1,mut_RARA,mut_RB1,mut_REL,mut_RET,mut_RICTOR,mut_RNF43,mut_RPTOR,mut_RUNX1,mut_SETD2,mut_SF3B1,mut_SH2B3,mut_SMAD2,mut_SMAD4,mut_SMARCA4,mut_SMARCB1,mut_SMARCD1,mut_SMO,mut_SOCS1,mut_SOX2,mut_SPEN,mut_SPOP,mut_SRC,mut_STAG2,mut_STK11,mut_SUFU,mut_SYK,mut_TBX3,mut_TET2,mut_TGFBR2,mut_TNFAIP3,mut_TNFRSF14,mut_TOP1,mut_TP53,mut_TSC1,mut_TSC2,mut_TSHR,mut_VHL,mut_WT1,mut_XPO1,cna_ABL1,cna_AKT1,cna_AKT2,cna_AKT3,cna_ALK,cna_ALOX12B,cna_AMER1,cna_APC,cna_AR,cna_ARAF,cna_ARID1A,cna_ARID2,cna_ASXL1,cna_ATM,cna_ATR,cna_ATRX,cna_AURKA,cna_AURKB,cna_AXL,cna_BAP1,cna_BARD1,cna_BCL2,cna_BCL6,cna_BCOR,cna_BLM,cna_BRAF,cna_BRCA1,cna_BRCA2,cna_BRIP1,cna_BTK,cna_CARD11,cna_CASP8,cna_CBFB,cna_CBL,cna_CCND1,cna_CCND2,cna_CCND3,cna_CCNE1,cna_CD79A,cna_CD79B,cna_CDC73,cna_CDH1,cna_CDK12,cna_CDK4,cna_CDK6,cna_CDK8,cna_CDKN1B,cna_CDKN2A,cna_CDKN2B,cna_CDKN2C,cna_CEBPA,cna_CHEK1,cna_CHEK2,cna_CIC,cna_CREBBP,cna_CRKL,cna_CRLF2,cna_CSF1R,cna_CTCF,cna_CTNNB1,cna_DAXX,cna_DDR2,cna_DIS3,cna_DNMT3A,cna_DOT1L,cna_EGFR,cna_EP300,cna_EPHA3,cna_EPHA5,cna_EPHB1,cna_ERBB2,cna_ERBB3,cna_ERBB4,cna_ERG,cna_ESR1,cna_EZH2,cna_FANCA,cna_FANCC,cna_FBXW7,cna_FGF19,cna_FGF3,cna_FGF4,cna_FGFR1,cna_FGFR2,cna_FGFR3,cna_FGFR4,cna_FLT1,cna_FLT3,cna_FLT4,cna_FOXL2,cna_GATA1,cna_GATA2,cna_GATA3,cna_GNA11,cna_GNAQ,cna_GNAS,cna_GRIN2A,cna_GSK3B,cna_HGF,cna_HLA_A,cna_HRAS,cna_IDH1,cna_IDH2,cna_IGF1,cna_IGF1R,cna_IGF2,cna_IKBKE,cna_IKZF1,cna_IL7R,cna_INHA,cna_INHBA,cna_IRF4,cna_IRS2,cna_JAK1,cna_JAK2,cna_JAK3,cna_JUN,cna_KDM5A,cna_KDM5C,cna_KDM6A,cna_KDR,cna_KEAP1,cna_KIT,cna_KMT2A,cna_KMT2D,cna_KRAS,cna_LMO1,cna_MAP2K1,cna_MAP2K2,cna_MAP2K4,cna_MAP3K1,cna_MAP3K13,cna_MCL1,cna_MDM2,cna_MDM4,cna_MED12,cna_MEF2B,cna_MEN1,cna_MET,cna_MITF,cna_MLH1,cna_MPL,cna_MRE11A,cna_MSH2,cna_MSH6,cna_MTOR,cna_MUTYH,cna_MYC,cna_MYCL,cna_MYCN,cna_MYD88,cna_NBN,cna_NCOR1,cna_NF1,cna_NF2,cna_NFE2L2,cna_NFKBIA,cna_NKX2_1,cna_NOTCH1,cna_NOTCH2,cna_NOTCH3,cna_NOTCH4,cna_NPM1,cna_NRAS,cna_NSD1,cna_NTRK1,cna_NTRK2,cna_NTRK3,cna_NUP93,cna_PAK7,cna_PALB2,cna_PARP1,cna_PAX5,cna_PBRM1,cna_PDGFRA,cna_PDGFRB,cna_PDPK1,cna_PIK3C2G,cna_PIK3C3,cna_PIK3CA,cna_PIK3CG,cna_PIK3R1,cna_PIK3R2,cna_PMS2,cna_PNRC1,cna_PPP2R1A,cna_PRDM1,cna_PRKAR1A,cna_PTCH1,cna_PTEN,cna_PTPN11,cna_RAD50,cna_RAD51,cna_RAD51B,cna_RAD51C,cna_RAD51D,cna_RAD52,cna_RAD54L,cna_RAF1,cna_RARA,cna_RB1,cna_REL,cna_RET,cna_RICTOR,cna_RNF43,cna_RPTOR,cna_RUNX1,cna_SETD2,cna_SF3B1,cna_SH2B3,cna_SMAD2,cna_SMAD4,cna_SMARCA4,cna_SMARCB1,cna_SMARCD1,cna_SMO,cna_SOX2,cna_SPEN,cna_SPOP,cna_SRC,cna_STAG2,cna_STK11,cna_SUFU,cna_SYK,cna_TBX3,cna_TET2,cna_TGFBR2,cna_TNFAIP3,cna_TNFRSF14,cna_TOP1,cna_TP53,cna_TSC1,cna_TSC2,cna_TSHR,cna_VHL,cna_WT1,cna_XPO1,destination
6598,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
5762,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
5865,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
5563,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
6122,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4


In [124]:
df_2.head()

Unnamed: 0,mut_AKT1,mut_AKT2,mut_AKT3,mut_ALK,mut_ALOX12B,mut_AMER1,mut_APC,mut_AR,mut_ARAF,mut_ARID1A,mut_ARID2,mut_ASXL1,mut_ATM,mut_ATR,mut_ATRX,mut_AURKA,mut_AURKB,mut_AXL,mut_BAP1,mut_BARD1,mut_BCL2,mut_BCL6,mut_BCOR,mut_BLM,mut_BRAF,mut_BRCA1,mut_BRCA2,mut_BRIP1,mut_BTK,mut_CARD11,mut_CASP8,mut_CBFB,mut_CBL,mut_CCND1,mut_CCND2,mut_CCND3,mut_CCNE1,mut_CD79A,mut_CD79B,mut_CDC73,mut_CDH1,mut_CDK12,mut_CDK4,mut_CDK6,mut_CDK8,mut_CDKN1B,mut_CDKN2A,mut_CDKN2B,mut_CDKN2C,mut_CEBPA,mut_CHEK1,mut_CHEK2,mut_CIC,mut_CREBBP,mut_CRKL,mut_CRLF2,mut_CSF1R,mut_CTCF,mut_CTNNB1,mut_DAXX,mut_DDR2,mut_DIS3,mut_DNMT3A,mut_DOT1L,mut_EGFR,mut_EP300,mut_EPHA3,mut_EPHA5,mut_EPHB1,mut_ERBB2,mut_ERBB3,mut_ERBB4,mut_ERG,mut_ESR1,mut_EZH2,mut_FAM46C,mut_FANCA,mut_FANCC,mut_FBXW7,mut_FGF19,mut_FGF3,mut_FGF4,mut_FGFR1,mut_FGFR2,mut_FGFR3,mut_FGFR4,mut_FIP1L1,mut_FLT1,mut_FLT3,mut_FLT4,mut_FOXL2,mut_GATA1,mut_GATA2,mut_GATA3,mut_GNA11,mut_GNAQ,mut_GNAS,mut_GRIN2A,mut_GSK3B,mut_HGF,mut_HLA_A,mut_HRAS,mut_IDH1,mut_IDH2,mut_IGF1,mut_IGF1R,mut_IGF2,mut_IKBKE,mut_IKZF1,mut_IL7R,mut_INHBA,mut_INSRR,mut_IRF4,mut_IRS2,mut_JAK1,mut_JAK2,mut_JAK3,mut_JUN,mut_KDM5A,mut_KDM5C,mut_KDM6A,mut_KDR,mut_KEAP1,mut_KIT,mut_KMT2A,mut_KMT2D,mut_KRAS,mut_LMO1,mut_MAP2K1,mut_MAP2K2,mut_MAP2K4,mut_MAP3K1,mut_MAP3K13,mut_MCL1,mut_MDM2,mut_MDM4,mut_MED12,mut_MEF2B,mut_MEN1,mut_MET,mut_MITF,mut_MLH1,mut_MPL,mut_MRE11A,mut_MSH2,mut_MSH6,mut_MTOR,mut_MUTYH,mut_MYC,mut_MYCL,mut_MYCN,mut_MYD88,mut_NBN,mut_NCOR1,mut_NF1,mut_NF2,mut_NFE2L2,mut_NFKBIA,mut_NKX2_1,mut_NOTCH1,mut_NOTCH2,mut_NOTCH3,mut_NOTCH4,mut_NPM1,mut_NRAS,mut_NSD1,mut_NTRK1,mut_NTRK2,mut_NTRK3,mut_NUP93,mut_PAK7,mut_PALB2,mut_PARP1,mut_PAX5,mut_PBRM1,mut_PDGFRA,mut_PDGFRB,mut_PDPK1,mut_PIK3C2G,mut_PIK3C3,mut_PIK3CA,mut_PIK3CG,mut_PIK3R1,mut_PIK3R2,mut_PMS2,mut_PNRC1,mut_PPP2R1A,mut_PRDM1,mut_PRKAR1A,mut_PTCH1,mut_PTEN,mut_PTPN11,mut_RAD50,mut_RAD51,mut_RAD51B,mut_RAD51C,mut_RAD51D,mut_RAD52,mut_RAD54L,mut_RAF1,mut_RARA,mut_RB1,mut_REL,mut_RET,mut_RICTOR,mut_RNF43,mut_RPTOR,mut_RUNX1,mut_SETD2,mut_SF3B1,mut_SH2B3,mut_SMAD2,mut_SMAD4,mut_SMARCA4,mut_SMARCB1,mut_SMARCD1,mut_SMO,mut_SOCS1,mut_SOX2,mut_SPEN,mut_SPOP,mut_SRC,mut_STAG2,mut_STK11,mut_SUFU,mut_SYK,mut_TBX3,mut_TET2,mut_TGFBR2,mut_TNFAIP3,mut_TNFRSF14,mut_TOP1,mut_TP53,mut_TSC1,mut_TSC2,mut_TSHR,mut_VHL,mut_WT1,mut_XPO1,cna_ABL1,cna_AKT1,cna_AKT2,cna_AKT3,cna_ALK,cna_ALOX12B,cna_AMER1,cna_APC,cna_AR,cna_ARAF,cna_ARID1A,cna_ARID2,cna_ASXL1,cna_ATM,cna_ATR,cna_ATRX,cna_AURKA,cna_AURKB,cna_AXL,cna_BAP1,cna_BARD1,cna_BCL2,cna_BCL6,cna_BCOR,cna_BLM,cna_BRAF,cna_BRCA1,cna_BRCA2,cna_BRIP1,cna_BTK,cna_CARD11,cna_CASP8,cna_CBFB,cna_CBL,cna_CCND1,cna_CCND2,cna_CCND3,cna_CCNE1,cna_CD79A,cna_CD79B,cna_CDC73,cna_CDH1,cna_CDK12,cna_CDK4,cna_CDK6,cna_CDK8,cna_CDKN1B,cna_CDKN2A,cna_CDKN2B,cna_CDKN2C,cna_CEBPA,cna_CHEK1,cna_CHEK2,cna_CIC,cna_CREBBP,cna_CRKL,cna_CRLF2,cna_CSF1R,cna_CTCF,cna_CTNNB1,cna_DAXX,cna_DDR2,cna_DIS3,cna_DNMT3A,cna_DOT1L,cna_EGFR,cna_EP300,cna_EPHA3,cna_EPHA5,cna_EPHB1,cna_ERBB2,cna_ERBB3,cna_ERBB4,cna_ERG,cna_ESR1,cna_EZH2,cna_FANCA,cna_FANCC,cna_FBXW7,cna_FGF19,cna_FGF3,cna_FGF4,cna_FGFR1,cna_FGFR2,cna_FGFR3,cna_FGFR4,cna_FLT1,cna_FLT3,cna_FLT4,cna_FOXL2,cna_GATA1,cna_GATA2,cna_GATA3,cna_GNA11,cna_GNAQ,cna_GNAS,cna_GRIN2A,cna_GSK3B,cna_HGF,cna_HLA_A,cna_HRAS,cna_IDH1,cna_IDH2,cna_IGF1,cna_IGF1R,cna_IGF2,cna_IKBKE,cna_IKZF1,cna_IL7R,cna_INHA,cna_INHBA,cna_IRF4,cna_IRS2,cna_JAK1,cna_JAK2,cna_JAK3,cna_JUN,cna_KDM5A,cna_KDM5C,cna_KDM6A,cna_KDR,cna_KEAP1,cna_KIT,cna_KMT2A,cna_KMT2D,cna_KRAS,cna_LMO1,cna_MAP2K1,cna_MAP2K2,cna_MAP2K4,cna_MAP3K1,cna_MAP3K13,cna_MCL1,cna_MDM2,cna_MDM4,cna_MED12,cna_MEF2B,cna_MEN1,cna_MET,cna_MITF,cna_MLH1,cna_MPL,cna_MRE11A,cna_MSH2,cna_MSH6,cna_MTOR,cna_MUTYH,cna_MYC,cna_MYCL,cna_MYCN,cna_MYD88,cna_NBN,cna_NCOR1,cna_NF1,cna_NF2,cna_NFE2L2,cna_NFKBIA,cna_NKX2_1,cna_NOTCH1,cna_NOTCH2,cna_NOTCH3,cna_NOTCH4,cna_NPM1,cna_NRAS,cna_NSD1,cna_NTRK1,cna_NTRK2,cna_NTRK3,cna_NUP93,cna_PAK7,cna_PALB2,cna_PARP1,cna_PAX5,cna_PBRM1,cna_PDGFRA,cna_PDGFRB,cna_PDPK1,cna_PIK3C2G,cna_PIK3C3,cna_PIK3CA,cna_PIK3CG,cna_PIK3R1,cna_PIK3R2,cna_PMS2,cna_PNRC1,cna_PPP2R1A,cna_PRDM1,cna_PRKAR1A,cna_PTCH1,cna_PTEN,cna_PTPN11,cna_RAD50,cna_RAD51,cna_RAD51B,cna_RAD51C,cna_RAD51D,cna_RAD52,cna_RAD54L,cna_RAF1,cna_RARA,cna_RB1,cna_REL,cna_RET,cna_RICTOR,cna_RNF43,cna_RPTOR,cna_RUNX1,cna_SETD2,cna_SF3B1,cna_SH2B3,cna_SMAD2,cna_SMAD4,cna_SMARCA4,cna_SMARCB1,cna_SMARCD1,cna_SMO,cna_SOX2,cna_SPEN,cna_SPOP,cna_SRC,cna_STAG2,cna_STK11,cna_SUFU,cna_SYK,cna_TBX3,cna_TET2,cna_TGFBR2,cna_TNFAIP3,cna_TNFRSF14,cna_TOP1,cna_TP53,cna_TSC1,cna_TSC2,cna_TSHR,cna_VHL,cna_WT1,cna_XPO1,destination
5683,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
5504,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
5533,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
5455,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
5219,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3


In [125]:
len(df_1)

21

In [126]:
len(df_2)

31

In [134]:
df_merged = pd.merge(df_1, df_2, how='outer')

In [135]:
len(df_merged)

51

In [None]:
df['destination'].unique()

In [None]:
len(df)

# turn this into binary classification or not

In [None]:
df['destination'] = df['destination'] -1

In [None]:
#samp_df = pd.concat([df[df.destination == 1], df[df.destination == 2], df[df.destination == 3], df[df.destination == 4]])
samp_df = df

# turn to 0 based labels

In [None]:
samp_df = samp_df.sample(frac=1)

In [None]:
samp_df.head()

In [None]:
msk = np.random.rand(len(samp_df)) < 0.8
data = samp_df[msk]
test = samp_df[~msk]

In [None]:
len(msk)

In [None]:
len(data)

In [None]:
len(test)

In [None]:
data.head()

In [None]:
test.head()

In [None]:
data.shape

In [None]:
test.shape

In [None]:
# data.head(1).T

In [None]:
combined = pd.concat([data, test])
0 in combined.values

In [None]:
combined['destination'].unique()

In [None]:
if target == 'Lung':
    combined.loc[combined['destination'] == 6, 'destination'] -= 1
    combined.loc[combined['destination'] == 7, 'destination'] -= 1
combined['destination'].unique()

In [None]:
cats = [col for col in combined.columns if col not in ['destination']]

In [None]:
for c in cats:
    combined[c] = combined[c].astype('category').cat.codes

In [None]:
combined.info(memory_usage='deep')

In [None]:
#emb_szs, n_cont, emb_drop, out_sz, szs, drops
cat_sz = [(c, len(combined[c].unique())) for c in cats]

In [None]:
data = combined[:len(data)]
test = combined[len(data):]

In [None]:
data.reset_index()
data.head()

In [None]:
data = data.reset_index(drop=True)
data.to_feather('data/biaobin/data.feather')

In [None]:
test = test.reset_index(drop=True)
test.to_feather('data/biaobin/test.feather')

In [None]:
data.head()

In [None]:
test.head()

# Model and Training

# Parameters

n_cont=0, emb_drop=0, out_sz=out_sz, szs=[500], drops=[0.5]

In [None]:
n_cont = 0
emb_drop = 0
szs=[3000, 2000, 500]
drops=[0.5, 0.5, 0.5]

emb_szs = [(c, min(500, (c+500) // 2)) for _, c in cat_sz]
emb_szs

bs = 256

optim = optim.SGD

use_bn = False

out_sz = combined['destination'].nunique()
print(out_sz)
combined['destination'].unique()

In [None]:
model = MixedInputModel(emb_szs, n_cont=0, emb_drop=emb_drop, out_sz=out_sz, szs=szs, drops=drops, use_bn=use_bn).cuda()

In [None]:
#bm = BasicModel(model, 'binary_classifier')
bm = BasicModel(model, 'multi_classifier')

In [None]:
val_idx = list(range(int(len(data)*0.8), len(data)))

In [None]:
trn_df, trn_y = data[cats].iloc[:val_idx[0]], data.destination[:val_idx[0]]
val_df, val_y = data[cats].iloc[val_idx[0:]], data.destination[val_idx[0:]]

In [None]:
#display(DataFrameSummary(trn_df).summary())

In [None]:
md = ColumnarModelData.from_data_frames('data/biaobin/tmp', trn_df, val_df, 
                                        trn_y.astype('int'), val_y.astype('int'),
                                       cats, bs, test_df=test[cats])

In [None]:
md

In [None]:
metrics=[accuracy]

In [None]:
# you can simply create learner with any custom model and data
# source code is here
class StructuredLearner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)
        self.crit = F.nll_loss


learn = StructuredLearner(md, bm, opt_fn=optim, metrics=metrics)

In [None]:
learn

In [None]:
learn.lr_find()

In [None]:
learn.sched.plot(2)

In [None]:
lr = 0.001
learn.fit(lr, 10, cycle_len=3, cycle_mult=1)

In [None]:
preds = learn.predict_dl(md.test_dl)

In [None]:
test_destination = pd.DataFrame(test.destination, dtype=int)

In [None]:
# softmax
expsums = np.exp(preds).sum(axis=1)
probs = np.exp(preds) / expsums[:, None]

In [None]:
probs = np.clip(probs, 0.05, 0.95)

In [None]:
len(probs)

In [None]:
len(test_destination)

In [None]:
for p in probs:
    print(p)
    max_index = np.where(p==p.max())
    print(max_index)

### 