In [16]:
from pathlib import Path
import pandas as pd
import numpy as np

In [42]:
# All input workbooks are resolved relative to the local data directory.
data_root = Path('data')
tggate_data_path = data_root / 'TGGATE_INHANDS_pub2.xlsx'
# Stop here if the annotation workbook is not where the notebook expects it.
assert tggate_data_path.exists(), 'tggate excel file does not exist'

In [18]:
# Read the 'ALL_data' sheet of the annotation workbook into a DataFrame.
raw_data = pd.read_excel(io=tggate_data_path, sheet_name='ALL_data')

In [19]:
# Study duration to analyse and the column that carries the phenotype label.
time = '29day'
finding_col_name = 'Finding: Final INHANDS nomenclature'
# Keep only the rows whose 'Time' value equals the selected duration.
data_29 = raw_data.loc[raw_data['Time'].eq(time)]

In [20]:
def process_data(data, finding_col_name, min_compounds=5):
    """Keep rows of frequently observed phenotypes plus finding-free compounds.

    Rows with no phenotype are labelled ``'negative'``. The result is the
    concatenation of two row sets:

    * "active" rows: rows whose phenotype is observed in at least
      ``min_compounds`` distinct compounds (the ``'negative'`` label itself
      is never treated as a phenotype);
    * "inactive" rows: all rows of compounds whose only label is
      ``'negative'``.

    Compounds that only show rarer phenotypes are therefore dropped.

    Args:
        data (pd.DataFrame): annotation raw data; needs a ``COMPOUND_NAME``
            column and the ``finding_col_name`` column. It is not modified.
        finding_col_name (str): name of the column where phenotypes (INHAND) are given
        min_compounds (int): minimum number of distinct compounds a phenotype
            must be observed in to be kept. Defaults to 5.

    Returns:
        pd.DataFrame: active rows followed by inactive rows, with missing
        phenotypes replaced by ``'negative'``.
    """
    # Work on a copy: the caller usually passes a slice of a larger frame, and
    # writing into it would both warn and silently change the caller's data.
    data = data.copy()
    # Rows with no phenotype observation are labelled negative.
    data[finding_col_name] = data[finding_col_name].fillna('negative')

    # Number of distinct compounds in which each phenotype is observed.
    compounds_per_finding = data.groupby(by=finding_col_name)['COMPOUND_NAME'].agg(
        lambda names: len(set(names))
    )
    frequent = compounds_per_finding[compounds_per_finding >= min_compounds].index
    # 'negative' may or may not pass the threshold; exclude it either way
    # (list.remove would raise when it is absent).
    which_classes = [finding for finding in frequent if finding != 'negative']
    data_active = data[data[finding_col_name].isin(which_classes)]

    # A compound is neutral when every one of its rows is labelled negative.
    only_negative = (
        data[finding_col_name].eq('negative').groupby(data['COMPOUND_NAME']).all()
    )
    neutral_example_ids = only_negative[only_negative].index.to_list()
    data_inactive = data[data['COMPOUND_NAME'].isin(neutral_example_ids)]

    return pd.concat((data_active, data_inactive))

In [32]:
# Reduce the 29-day records to the rows selected by process_data.
processed_data = process_data(data=data_29, finding_col_name=finding_col_name)
# processed_data = processed_data[~processed_data['COMPOUND.Abbr'].isin(["TCP","CLM","TMD","CMP","PB"])]
processed_data.head()

Unnamed: 0,COMPOUND.Abbr,COMPOUND_NO,COMPOUND_NAME,Dose_Level,Dose,Dose_Unit,Time,Finding: Final INHANDS nomenclature,cell type
6,APAP,1,acetaminophen,High,1000.0,mg/kg,29day,Cytoplasmic alteration (Eosinophilic),(Hepatocyte)
8,APAP,1,acetaminophen,Low,300.0,mg/kg,29day,Cytoplasmic alteration (Eosinophilic),(Hepatocyte)
9,APAP,1,acetaminophen,Middle,600.0,mg/kg,29day,Cytoplasmic alteration (Eosinophilic),(Hepatocyte)
48,INAH,2,isoniazid,High,200.0,mg/kg,29day,Cytoplasmic alteration (Eosinophilic),(Hepatocyte)
83,CCL4,3,carbon tetrachloride,Low,30.0,mg/kg,29day,Fibrosis,0


## Preprocessing Step

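The preprocessing labels a compound DILI-positive if at least one retained phenotype (a phenotype seen in at least five compounds) is observed for it in the 29-day in vivo study, and DILI-negative if no phenotype is recorded for it at all.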

In [33]:
# Convert to a table in which each row is a compound and each column a phenotype
# label (1 = label recorded for the compound, 0 = not recorded).
r = processed_data.groupby("COMPOUND_NAME").agg({finding_col_name: lambda x: list(set(x))})

# Start from an integer zero matrix rather than an empty object-dtype frame that
# is later passed through fillna(0); the result is int64 without depending on
# fillna's object-dtype downcasting.
complete_df = pd.DataFrame(
    0,
    index=r.index,
    columns=processed_data[finding_col_name].unique(),
)
# Flag every label that occurs for a compound.
for compound, findings in r[finding_col_name].items():
    complete_df.loc[compound, findings] = 1

complete_df.head()

Unnamed: 0_level_0,Cytoplasmic alteration (Eosinophilic),Fibrosis,"Hypertrophy, hepatocellular",Cytoplasmic alteration (Basophilic/glycogen depletion),Vacuolation,Pigmentation (pigment deposition),Single Cell Necrosis,Hypertrophy/Hyperplasia,Extramedullary Hematopoiesis,negative
COMPOUND_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
WY-14643,0,0,1,0,0,0,0,0,0,0
acarbose,0,0,0,0,0,0,0,0,0,1
acetamidofluorene,0,0,1,0,0,1,0,0,0,0
acetaminophen,1,0,0,0,0,0,0,0,0,0
acetazolamide,0,0,0,0,0,0,0,0,0,1


In [35]:
# Every compound must be exactly one of: finding-free ("negative") or carrying at
# least one phenotype. The derived "positive" column is excluded so that the
# check stays meaningful if this cell is re-run after that column was added.
has_phenotype = complete_df[
    complete_df.columns.difference(["negative", "positive"])
].any(axis=1)
is_negative = complete_df["negative"].astype(bool)
assert (has_phenotype != is_negative).all(), (
    "there is a record which is both neutral and active, or neither"
)

In [41]:
# Number of compounds with (1) and without (0) the "negative" label.
complete_df['negative'].value_counts()

0    80
1    46
Name: negative, dtype: int64

In [54]:
# "positive" is the complement of "negative": 1 where negative is 0, otherwise 0.
complete_df['positive'] = complete_df['negative'].eq(0).astype(int)

## Comparing DILI profile with DILIrank


In [96]:
# The column names sit on the second sheet row (header=1); rows are indexed by
# the 'Compound Name' column.
dilirank_data = pd.read_excel(
    data_root / 'DILIrank-DILIscore_List.xlsx',
    header=1,
    index_col='Compound Name',
)
dilirank_data.head()

Unnamed: 0_level_0,LTKBID,Severity Class,Label Section,vDILIConcern,Version
Compound Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mercaptopurine,LT00003,8,Warnings and precautions,vMost-DILI-Concern,1
acetaminophen,LT00004,5,Warnings and precautions,vMost-DILI-Concern,2
azathioprine,LT00006,5,Warnings and precautions,vMost-DILI-Concern,1
chlorpheniramine,LT00009,0,No match,vNo-DILI-Concern,2
clofibrate,LT00011,3,Warnings and precautions,vLess-DILI-Concern,1


In [97]:
# Compound names present in both the DILIrank index and the phenotype table
# index (exact label match).
common_compounds_tggate_dilirank = dilirank_data.index.intersection(
    complete_df.index
)
print(f'number of common compounds : {len(common_compounds_tggate_dilirank)}')

number of common compounds : 92


In [98]:
# Attach the DILIrank concern class to the negative/positive flags; the inner
# join keeps only compounds found in both tables.
common_df = (
    complete_df[['negative', 'positive']]
    .join(dilirank_data['vDILIConcern'], how='inner')
    .reset_index()
)
common_df.head()

Unnamed: 0,index,negative,positive,vDILIConcern
0,acarbose,1,0,vMost-DILI-Concern
1,acetaminophen,0,1,vMost-DILI-Concern
2,acetazolamide,1,0,vMost-DILI-Concern
3,allopurinol,1,0,vMost-DILI-Concern
4,amiodarone,0,1,vMost-DILI-Concern


In [99]:
# Per DILIrank concern class: number of negative and positive compounds and the
# set of compound names, written to tab2.csv. The aggregation is named by string
# ('sum') because passing the builtin `sum` to agg is deprecated in recent pandas.
common_df.groupby(by='vDILIConcern').agg(
    {'negative': 'sum', 'positive': 'sum', 'index': lambda x: set(x)}
).to_csv('tab2.csv')

In [72]:
# Inspect the label row of a single compound.
complete_df.loc['chloramphenicol']

Cytoplasmic alteration (Eosinophilic)                     0
Fibrosis                                                  0
Hypertrophy, hepatocellular                               1
Cytoplasmic alteration (Basophilic/glycogen depletion)    0
Vacuolation                                               0
Pigmentation (pigment deposition)                         0
Single Cell Necrosis                                      0
Hypertrophy/Hyperplasia                                   0
Extramedullary Hematopoiesis                              0
negative                                                  0
positive                                                  1
Name: chloramphenicol, dtype: int64

## Comparing with DILIst dataset


In [87]:
# Load the DILIst table indexed by compound name.
# NOTE: column labels in this sheet carry a trailing space
# (e.g. 'DILIst Classification '), so later lookups must include it.
dilist_data = pd.read_excel(
    data_root / 'DILIst Supplementary Table.xlsx',
    index_col='CompoundName',
)
dilist_data.head()

Index(['DILIST_ID', 'DILIst Classification ', 'Routs of Administration '], dtype='object')

In [81]:
# Compound names present in both the DILIst index and the phenotype table index
# (exact label match).
common_compounds_tggate_dilist = dilist_data.index.intersection(
    complete_df.index
)
print(f'number of common compounds : {len(common_compounds_tggate_dilist)}')

number of common compounds : 95


In [88]:
# Attach the DILIst classification to the negative/positive flags; the inner
# join keeps only compounds found in both tables. This rebinds common_df.
common_df = (
    complete_df[['negative', 'positive']]
    .join(dilist_data['DILIst Classification '], how='inner')
    .reset_index()
)
common_df.head()

Unnamed: 0,index,negative,positive,DILIst Classification
0,acarbose,1,0,1
1,acetaminophen,0,1,1
2,acetazolamide,1,0,1
3,allopurinol,1,0,1
4,allyl alcohol,0,1,1


In [89]:
# Per DILIst class: number of negative and positive compounds and the set of
# compound names. The aggregation is named by string ('sum') because passing the
# builtin `sum` to agg is deprecated in recent pandas.
common_df.groupby(by='DILIst Classification ').agg(
    {'negative': 'sum', 'positive': 'sum', 'index': lambda x: set(x)}
)


Unnamed: 0_level_0,negative,positive,index
DILIst Classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,4,"{furosemide, theophylline, chlorpheniramine, f..."
1,28,58,"{chlorpropamide, propylthiouracil, phenacetin,..."


In [94]:
# Same per-class summary as displayed above, written to tab.csv. The aggregation
# is named by string ('sum') because passing the builtin `sum` to agg is
# deprecated in recent pandas.
common_df.groupby(by='DILIst Classification ').agg(
    {'negative': 'sum', 'positive': 'sum', 'index': lambda x: set(x)}
).to_csv('tab.csv')

In [101]:
# Number of distinct compounds recorded for each finding in the 29-day data.
data_29.groupby(by=finding_col_name).agg(
    {'COMPOUND_NAME': lambda names: len(set(names))}
)

Unnamed: 0_level_0,COMPOUND_NAME
Finding: Final INHANDS nomenclature,Unnamed: 1_level_1
Apoptosis,2
Apoptosis/Single cell necrosis,1
"Atypia, nuclear",1
Bile Duct Hyperplasia,4
Cytoplasmic alteration (Basophilic/glycogen depletion),6
Cytoplasmic alteration (Eosinophilic),17
Cytoplasmic alteration (Glycogen accumulation),4
"Degeneration, Hydropic",2
Extramedullary Hematopoiesis,5
Fibrosis,5
