In [1]:
import random
import math

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,LabelBinarizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score,f1_score,accuracy_score, RocCurveDisplay, PrecisionRecallDisplay, ConfusionMatrixDisplay




In [2]:
#currently not all assays
tox21_all_assays = pd.read_excel("../data/50_assay/assay_list.xls",sheet_name=None)

In [3]:
tox21_all_assays.keys()

dict_keys(['sheet0', 'sheet1', 'sheet2', 'sheet3', 'sheet4', 'sheet5', 'sheet6', 'sheet7', 'sheet8', 'sheet9', 'sheet10', 'sheet11', 'sheet12', 'sheet13', 'sheet14', 'sheet15', 'sheet16', 'sheet17', 'sheet18', 'sheet19', 'sheet20', 'sheet21', 'sheet22', 'sheet23', 'sheet24', 'sheet25', 'sheet26', 'sheet27', 'sheet28', 'sheet29', 'sheet30', 'sheet31', 'sheet32', 'sheet33'])

In [4]:
[tox21_all_assays[i].columns[1] for i in tox21_all_assays.keys()]

['tox21-ahr-p1',
 'tox21-ap1-agonist-p1',
 'tox21-ar-bla-antagonist-p1',
 'tox21-ar-mda-kb2-luc-agonist-p3',
 'tox21-ar-mda-kb2-luc-antagonist-p1',
 'tox21-ar-mda-kb2-luc-antagonist-p2',
 'tox21-are-bla-p1',
 'tox21-aromatase-p1',
 'tox21-car-agonist-p1',
 'tox21-car-antagonist-p1',
 'tox21-elg1-luc-agonist-p1',
 'tox21-er-bla-antagonist-p1',
 'tox21-er-luc-bg1-4e2-agonist-p4',
 'tox21-er-luc-bg1-4e2-antagonist-p1',
 'tox21-er-luc-bg1-4e2-antagonist-p2',
 'tox21-erb-bla-antagonist-p1',
 'tox21-erb-bla-p1',
 'tox21-err-p1',
 'tox21-esre-bla-p1',
 'tox21-fxr-bla-agonist-p2',
 'tox21-fxr-bla-antagonist-p1',
 'tox21-gh3-tre-antagonist-p1',
 'tox21-gr-hela-bla-antagonist-p1',
 'tox21-h2ax-cho-p2',
 'tox21-hdac-p1',
 'tox21-hre-bla-agonist-p1',
 'tox21-hse-bla-p1',
 'tox21-mitotox-p1',
 'tox21-nfkb-bla-agonist-p1',
 'tox21-p53-bla-p1',
 'tox21-pgc-err-p1',
 'tox21-ppard-bla-agonist-p1',
 'tox21-ppard-bla-antagonist-p1',
 'tox21-pparg-bla-antagonist-p1']

In [13]:
RANDOM_SEED = 101
np.random.seed(RANDOM_SEED)
coef_list = []
performance_list = []
for sheet in ['sheet0','sheet1','sheet3','sheet33']:#tox21_all_assays.keys():
    
    # -- Pre-processing
    one_assay = tox21_all_assays[sheet]
    one_assay = one_assay.drop(one_assay.columns[0],axis=1)
    one_assay = one_assay.drop_duplicates()
    
    X = np.array(one_assay.iloc[:,2:])
    y = one_assay.iloc[:,0]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    
    #Use scaler fit to training data to transform testing data
    X_test = scaler.transform(X_test)
    
    # -- Training
    logistic_cv_unbalanced = LogisticRegressionCV(penalty='l2',Cs=10,max_iter=10000,class_weight=None).fit(X_train, y_train)
    logistic_cv_balanced = LogisticRegressionCV(penalty='l2',Cs=10,max_iter=10000,class_weight='balanced').fit(X_train, y_train)
    
    
    # -- Testing & Reporting
    y_pred_unbalanced = logistic_cv_unbalanced.predict(X_test)
    y_pred_balanced = logistic_cv_balanced.predict(X_test)
    
    performance_df = pd.DataFrame([
        {'PROTOCOL_NAME':one_assay.columns[0],
         'model':'unbalanced',
         'accuracy':accuracy_score(y_test,y_pred_unbalanced),
         'balanced accuracy':balanced_accuracy_score(y_test,y_pred_unbalanced),
         'f1_score':f1_score(y_test,y_pred_unbalanced)},
        {'PROTOCOL_NAME':one_assay.columns[0],
         'model':'balanced',
         'accuracy':accuracy_score(y_test,y_pred_balanced),
         'balanced accuracy':balanced_accuracy_score(y_test,y_pred_balanced),
         'f1_score':f1_score(y_test,y_pred_balanced)}])
        
    
    coef_df = pd.DataFrame({'PROTOCOL_NAME':one_assay.columns[0],
                            'Feature':one_assay.columns[2:], 
                            'Unbalanced_coef':logistic_cv_unbalanced.coef_[0],
                            'Balanced_coef':logistic_cv_balanced.coef_[0]}).sort_values(by="Unbalanced_coef")
    
    performance_list.append(performance_df)
    coef_list.append(coef_df)
    

In [14]:
performance_results = pd.concat(performance_list)
coef_results = pd.concat(coef_list)

In [15]:
# Fraction CSP3 coef values and ranks
coef_results['ranks'] = coef_results['Unbalanced_coef'].abs().groupby(coef_results['PROTOCOL_NAME']).rank(ascending=False)
coef_results[coef_results['Feature']=="FractionCSP3"]

Unnamed: 0,PROTOCOL_NAME,Feature,Unbalanced_coef,Balanced_coef,ranks
101,tox21-ahr-p1,FractionCSP3,-0.562335,-1.345111,1.0
101,tox21-ap1-agonist-p1,FractionCSP3,-0.249842,-0.959599,64.0
101,tox21-ar-mda-kb2-luc-agonist-p3,FractionCSP3,-0.51043,-0.709943,1.0
101,tox21-pparg-bla-antagonist-p1,FractionCSP3,-0.500756,-1.039924,1.0


In [16]:
performance_results

Unnamed: 0,PROTOCOL_NAME,model,accuracy,balanced accuracy,f1_score
0,tox21-ahr-p1,unbalanced,0.920933,0.606832,0.322222
1,tox21-ahr-p1,balanced,0.815295,0.78279,0.394904
0,tox21-ap1-agonist-p1,unbalanced,0.949359,0.592773,0.285714
1,tox21-ap1-agonist-p1,balanced,0.827819,0.749616,0.285714
0,tox21-ar-mda-kb2-luc-agonist-p3,unbalanced,0.896226,0.640731,0.407692
1,tox21-ar-mda-kb2-luc-agonist-p3,balanced,0.812668,0.787968,0.483271
0,tox21-pparg-bla-antagonist-p1,unbalanced,0.916999,0.590277,0.287671
1,tox21-pparg-bla-antagonist-p1,balanced,0.830806,0.76346,0.407821
