In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pylab as pl
import scipy as sp
import sys

import os, sys

%matplotlib inline

In [2]:
from box import Box
import glob

In [3]:
TOP = os.getcwd().replace("notebooks/tt", "")
D1 = TOP + 'data/'
figures_dir = TOP + 'manuscript/figures/'

### DATA

In [4]:
CHM1 = pd.read_csv(D1 + 'S7-chem.tsv', sep='\t', low_memory=False)
CHM1 = CHM1.replace(['-'], np.nan)
CHM1.isnull().values.any()
CHM1 = CHM1.dropna()
CHM1
CHM1_nm = CHM1[['DTXSID', 'PREFERRED_NAME']].set_index('DTXSID')
X = CHM1 = CHM1.drop(['PREFERRED_NAME'], axis=1).set_index('DTXSID')

CHM11 = Box()
for i in set([i[0] for i in X.columns.str.split('_') ]):
    CHM11[i] =  X[X.columns[X.columns.str.match(i)]]
CHM11['all'] = X

In [5]:
BIO1 = pd.read_csv(D1+'S2-bio.tsv', sep='\t') 
BIO1 = BIO1.rename(columns = {'dsstox_substance_id':'DTXSID'})

BIO1 = BIO1.drop(['chnm', 'casn'], axis = 1).set_index('DTXSID')
BIO11 = Box()
for i in set([i[0] for i in BIO1.columns.str.split('_')]):
    BIO11[i]=BIO1[BIO1.columns[BIO1.columns.str.match(i)]]
BIO11['all']=BIO1

In [6]:
TOX1 = pd.read_csv(D1+'S1-tox.tsv', sep='\t')

TOX1 = TOX1.rename(columns = {'dsstox_substance_id':'DTXSID'})
TOX1 = TOX1.drop(['preferred_name'], axis=1).set_index('DTXSID')
#set([i[0] for i in TOX1.columns.str.split('_')])
TOX11 = Box()
for i in set([i[0] for i in TOX1.columns.str.split('_')]):
    TOX11[i] = TOX1[TOX1.columns[TOX1.columns.str.match(i)]]
TOX11['all'] = TOX1

In [7]:
DATA = Box(chm=CHM11,bio=BIO11,tox=TOX11)
DATA['bc'] = Box()

for c,b in [(i,j) for i in CHM11.keys() for j in BIO11.keys()]:
    print(b,c)
    DATA['bc']['{}_{}'.format(b,c)] = CHM11[c].merge(BIO11[b],left_index=True,right_index=True)

gene tptr
assay tptr
all tptr
gene mrgn
assay mrgn
all mrgn
gene toxp
assay toxp
all toxp
gene all
assay all
all all


In [8]:
[e for e in TOX1.columns if 'liver' in e]

['chr_liver',
 'dev_liver',
 'dev_premature_delivery',
 'dnt_liver',
 'mgr_liver',
 'neu_liver',
 'oth_liver',
 'rep_liver',
 'sac_liver',
 'sub_liver']

In [9]:
effects = ['chr_liver', 'dev_liver']

### DEF getDATA

In [10]:
def getData(b,dt='chm',st='all',Data=DATA):
    """
    Return an input and output dataframe for analysis
    
    b = toxicity class
    dt= input data type (chm or bio)
    st= inpute subdata type ()
    """
    
    Y = Data.tox.all[b].dropna()
    INT=Data[dt][st].index.intersection(Y.index)
    if len(INT)>0:
        Yf= Y.copy()[INT]
        Yf[Yf>0]=1
        Yb = Yf.copy()
        Yb = Yb.astype(np.int32)
        X  = Data[dt][st].loc[INT].fillna(0)
    else:
        pass
    
    return X,Yb

In [11]:
#getData('chr_liver',dt='chm',st='mrgn')


In [12]:
mydict = {}
for e in effects:
    data = getData(e,dt='chm',st='mrgn')
    mydict[e] = data

In [13]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings

## Balance with RandomUnderSampler

In [14]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.metrics import make_scorer, recall_score, f1_score, accuracy_score, precision_score
from collections import Counter

### Model Validation Using chm descriptors After Undersampling

In [15]:
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB
#from sklearn.tree import DecisionTreeClassifier

from genra.rax.skl.cls import *
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline

In [16]:
def model_cv(d,  label = None):
    Res = []
    Est = [('Random Forest', RandomForestClassifier(random_state=42)),
          ('NB', BernoulliNB()),
          ('LR', LogisticRegression(max_iter=1000)),
          ('Gradient Boosting', GradientBoostingClassifier()),
          ('SVC', SVC(gamma='auto',probability=True)),
          ('KNN',KNeighborsClassifier()),
          ('ANN1',MLPClassifier(solver='sgd')),
          ('GenRA', GenRAPredClass(n_neighbors=10,metric='jaccard'))]
    
    X, Y = d
    for model in Est:
        LR, Clf = model
        steps = [('undersample', RandomUnderSampler(sampling_strategy='majority')), model]
        pipeline = Pipeline(steps=steps)
        score = cross_validate(pipeline, X, Y,
                           cv=StratifiedKFold(n_splits = 5),
                           scoring= {'accuracy': make_scorer(accuracy_score),
                                     'f1': make_scorer(f1_score),
                                     'sensitivity': make_scorer(recall_score),
                                     'specificity': make_scorer(recall_score,pos_label=0),
                                    'precision': make_scorer(precision_score)},
                           n_jobs=-1, verbose=1)
    #elapsed_time = datetime.now() - start_time
        SC = pd.DataFrame(score)
        SC.insert(0,'LR',LR)
        Res.append(SC)
    return Res

In [17]:
def plugin(d, label=None):
    summary = {}
    X, Y = d
    summary['label'] = label
    summary['score'] = model_cv(d)
    
    return summary

In [18]:
chm_summaries = []
for k in list(mydict.keys()):
    try:
        chm_summaries.append(plugin(mydict[k], k))
    except Exception as error:
        print(k)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.6s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.4s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [19]:
myperm2 = {}
myperm = {}
for i in range(len(chm_summaries)):
    k = chm_summaries[i]['label']
    Perf = pd.concat(chm_summaries[i]['score'])
    Perf_agg = Perf.groupby(['LR']).aggregate(dict(test_f1=[np.mean,np.std],
                                    test_sensitivity=[np.mean,np.std],
                                    test_specificity=[np.mean,np.std],
                                    test_accuracy=[np.mean,np.std],
                                    test_precision=[np.mean,np.std],))\
    .round(decimals=3)
    myperm[k] = Perf_agg
    myperm2[k] = Perf       

In [20]:
Perf_mgrn = pd.concat(myperm)    
Perf_mgrn_raw = pd.concat(myperm2)   

In [21]:
Perf_mgrn

Perf_mgrn_raw.to_csv(D1 + '/ML_under_mgrn_all_140823.csv' )
Perf_mgrn.to_csv(D1 + '/ML_under_mgrn_summary_140823.csv' )

## Model Validation Using Bio (gene hitc) Descriptors

In [22]:
bio_dict = {}
for e in effects:
    data = getData(e,dt='bio',st='gene')
    bio_dict[e] = data

In [23]:
bio_summaries = []
for k in list(bio_dict.keys()):
    try:
        bio_summaries.append(plugin(bio_dict[k], k))
    except Exception as error:
        print(k)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [24]:
bio_perm2 = {}
bio_perm = {}
for i in range(len(bio_summaries)):
    k = bio_summaries[i]['label']
    Perf = pd.concat(bio_summaries[i]['score'])
    Perf_agg = Perf.groupby(['LR']).aggregate(dict(test_f1=[np.mean,np.std],
                                    test_sensitivity=[np.mean,np.std],
                                    test_specificity=[np.mean,np.std],
                                    test_accuracy=[np.mean,np.std],
                                    test_precision=[np.mean,np.std],))\
    .round(decimals=3)
    bio_perm[k] = Perf_agg
    bio_perm2[k] = Perf

In [25]:
Perf_bio = pd.concat(bio_perm)  
Perf_bio_raw = pd.concat(bio_perm2)  

In [26]:
Perf_bio_raw.to_csv(D1 + '/ML_under_bio_all_140823.csv' )
Perf_bio.to_csv(D1 + '/ML_under_bio_summary_140823.csv' )        
        

## Model Validation Using Hybrid (chm(mrgn) + bio(hitc)) Descriptors

In [27]:
hybrid_dict = {}
for e in effects:
    data = getData(e,dt='bc',st='gene_mrgn')
    hybrid_dict[e] = data

In [28]:
hybrid_summaries = []
for k in list(hybrid_dict.keys()):
    try:
        hybrid_summaries.append(plugin(hybrid_dict[k], k))
    except Exception as error:
        print(k)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [30]:
myperm2 = {}
myperm = {}
for i in range(len(hybrid_summaries)):
    k = hybrid_summaries[i]['label']
    Perf = pd.concat(hybrid_summaries[i]['score'])
    Perf_agg = Perf.groupby(['LR']).aggregate(dict(test_f1=[np.mean,np.std],
                                    test_sensitivity=[np.mean,np.std],
                                    test_specificity=[np.mean,np.std],
                                    test_accuracy=[np.mean,np.std],
                                    test_precision=[np.mean,np.std],))\
    .round(decimals=3)
    myperm[k] = Perf_agg
    myperm2[k] = Perf

In [31]:
Perf_hybrid = pd.concat(myperm)  
Perf_hybrid_raw = pd.concat(myperm2) 

In [32]:
Perf_hybrid_raw.to_csv(D1 + '/ML_under_hybrid_all_140823.csv' )
Perf_hybrid.to_csv(D1 + '/ML_under_hybrid_summary_140823.csv' )  

## Model Validation Using All Chm (mrgn + tptr + toxp) Descriptors     

In [33]:
ca_dict = {}
for e in effects:
    data = getData(e,dt='chm',st='all')
    ca_dict[e] = data




In [34]:
ca_summaries = []
for k in list(ca_dict.keys()):
    try:
        ca_summaries.append(plugin(ca_dict[k], k))
    except Exception as error:
        print(k)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.2s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.8s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [35]:
ca_perm2 = {}
ca_perm = {}
for i in range(len(ca_summaries)):
    k = ca_summaries[i]['label']
    Perf = pd.concat(ca_summaries[i]['score'])
    Perf_agg = Perf.groupby(['LR']).aggregate(dict(test_f1=[np.mean,np.std],
                                    test_sensitivity=[np.mean,np.std],
                                    test_specificity=[np.mean,np.std],
                                    test_accuracy=[np.mean,np.std],
                                    test_precision=[np.mean,np.std],))\
    .round(decimals=3)
    ca_perm[k] = Perf_agg
    ca_perm2[k] = Perf

In [36]:
Perf_ca = pd.concat(ca_perm)  
Perf_ca_raw = pd.concat(ca_perm2)  

In [37]:
Perf_ca_raw.to_csv(D1 + '/ML_under_ca_all_140823.csv' )
Perf_ca.to_csv(D1 + '/ML_under_ca_summary_140823.csv' )  

## Model Validation Using All Chm + bio (mrgn + tptr + toxp + bio) Descriptors

In [38]:
bc_dict = {}
for e in effects:
    data = getData(e,dt='bc',st='gene_all')
    bc_dict[e] = data

In [39]:
bc_summaries = []
for k in list(bc_dict.keys()):
    try:
        bc_summaries.append(plugin(bc_dict[k], k))
    except Exception as error:
        print(k)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.2s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.1s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [40]:
bc_perm2 = {}
bc_perm = {}
for i in range(len(bc_summaries)):
    k = bc_summaries[i]['label']
    Perf = pd.concat(bc_summaries[i]['score'])
    Perf_agg = Perf.groupby(['LR']).aggregate(dict(test_f1=[np.mean,np.std],
                                    test_sensitivity=[np.mean,np.std],
                                    test_specificity=[np.mean,np.std],
                                    test_accuracy=[np.mean,np.std],
                                    test_precision=[np.mean,np.std],))\
    .round(decimals=3)
    bc_perm[k] = Perf_agg
    bc_perm2[k] = Perf

In [41]:
Perf_bc = pd.concat(bc_perm)  
Perf_bc_raw = pd.concat(bc_perm2)  

In [42]:
Perf_bc_raw.to_csv(D1 + '/ML_under_cba_all_140823.csv' )
Perf_bc.to_csv(D1 + '/ML_under_cba_all_summary_140823.csv' )    

In [43]:
#Perf_bc

In [44]:
data = dict( tox = ['chr_liver', 'chr_liver','chr_liver','chr_liver','chr_liver', 'dev_liver','dev_liver','dev_liver','dev_liver','dev_liver'], dt = ['mrgn', 'gene', 'cb', 'ca', 'cba', 'mrgn', 'gene', 'cb', 'ca', 'cba', ] ,pos = [128,134,128,128,128, 43,47,43,43,43], neg = [128,134,128,128,128,43,47,43,43,43])

In [45]:
df_counts = pd.DataFrame(data)

In [46]:
df_counts['balance'] = 'random'
df_counts['sample'] = 'under'

In [47]:
df_counts

Unnamed: 0,tox,dt,pos,neg,balance,sample
0,chr_liver,mrgn,128,128,random,under
1,chr_liver,gene,134,134,random,under
2,chr_liver,cb,128,128,random,under
3,chr_liver,ca,128,128,random,under
4,chr_liver,cba,128,128,random,under
5,dev_liver,mrgn,43,43,random,under
6,dev_liver,gene,47,47,random,under
7,dev_liver,cb,43,43,random,under
8,dev_liver,ca,43,43,random,under
9,dev_liver,cba,43,43,random,under


In [48]:
Perf_mgrn.reset_index()

Unnamed: 0_level_0,level_0,LR,test_f1,test_f1,test_sensitivity,test_sensitivity,test_specificity,test_specificity,test_accuracy,test_accuracy,test_precision,test_precision
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,chr_liver,ANN1,0.649,0.074,0.618,0.148,0.508,0.204,0.58,0.054,0.708,0.066
1,chr_liver,GenRA,0.589,0.083,0.495,0.127,0.695,0.137,0.566,0.04,0.758,0.047
2,chr_liver,Gradient Boosting,0.628,0.053,0.546,0.083,0.656,0.127,0.585,0.041,0.751,0.063
3,chr_liver,KNN,0.237,0.175,0.157,0.134,0.876,0.113,0.409,0.071,0.676,0.142
4,chr_liver,LR,0.684,0.064,0.627,0.113,0.64,0.098,0.632,0.045,0.764,0.02
5,chr_liver,NB,0.622,0.108,0.52,0.14,0.758,0.082,0.604,0.081,0.797,0.048
6,chr_liver,Random Forest,0.587,0.108,0.491,0.162,0.725,0.169,0.574,0.061,0.78,0.052
7,chr_liver,SVC,0.536,0.156,0.41,0.177,0.868,0.129,0.571,0.092,0.864,0.093
8,dev_liver,ANN1,0.275,0.1,0.536,0.245,0.54,0.185,0.539,0.139,0.191,0.073
9,dev_liver,GenRA,0.229,0.074,0.511,0.227,0.434,0.15,0.446,0.112,0.149,0.044


In [49]:
def pref_files(df, dt, st, bl):
    df = df.reset_index()
    df.columns = ['tox', 'classifier', 'mean_f1', 'std_f1', 'mean_sensitivity', 'std_sensitivity', 'mean_specificity', 'std_specificity', 'mean_accuracy', 'std_accuracy', 'mean_precision', 'std_precision']
    #df = df.drop([0,1])
    df['dt'] = dt 
    df['balance'] = bl
    df['sampling'] = st
    #df = pd.merge(df, df_counts, on = 'tox', how = 'left')
    return df

In [50]:
#pref_files(Perf_mgrn, 'mgrn')

In [51]:
mylst = ['mrgn', 'gene', 'cb', 'ca', 'cba']

In [52]:
mylst2 = list(zip(mylst, [Perf_mgrn, 
Perf_bio, 
Perf_hybrid,
Perf_ca, 
Perf_bc ], ))

In [53]:
mylst3 = []
for e in mylst2:
    a,b = e
    mylst3.append(pref_files(b,a, st = 'under', bl = 'random'))

In [54]:
final_df = pd.concat(mylst3)

In [55]:
final_df

Unnamed: 0,tox,classifier,mean_f1,std_f1,mean_sensitivity,std_sensitivity,mean_specificity,std_specificity,mean_accuracy,std_accuracy,mean_precision,std_precision,dt,balance,sampling
0,chr_liver,ANN1,0.649,0.074,0.618,0.148,0.508,0.204,0.580,0.054,0.708,0.066,mrgn,random,under
1,chr_liver,GenRA,0.589,0.083,0.495,0.127,0.695,0.137,0.566,0.040,0.758,0.047,mrgn,random,under
2,chr_liver,Gradient Boosting,0.628,0.053,0.546,0.083,0.656,0.127,0.585,0.041,0.751,0.063,mrgn,random,under
3,chr_liver,KNN,0.237,0.175,0.157,0.134,0.876,0.113,0.409,0.071,0.676,0.142,mrgn,random,under
4,chr_liver,LR,0.684,0.064,0.627,0.113,0.640,0.098,0.632,0.045,0.764,0.020,mrgn,random,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,dev_liver,KNN,0.220,0.149,0.411,0.303,0.630,0.121,0.596,0.059,0.152,0.098,cba,random,under
12,dev_liver,LR,0.238,0.061,0.539,0.228,0.430,0.169,0.447,0.120,0.155,0.039,cba,random,under
13,dev_liver,NB,0.340,0.092,0.775,0.174,0.424,0.221,0.481,0.177,0.222,0.074,cba,random,under
14,dev_liver,Random Forest,0.343,0.070,0.744,0.182,0.484,0.154,0.527,0.114,0.226,0.053,cba,random,under


In [57]:
writer = pd.ExcelWriter(D1+'random_under_all_140823.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.

final_df.to_excel(writer, sheet_name = 'all')

writer.save()