### Descriptive statistics

In [None]:
import pandas as pd

# Read the full table. Column 0 is set aside in `first_col`; every remaining
# column is analysed below.
data = pd.read_csv("data/all4M_data.csv")

first_col = data.iloc[:, 0]
numeric_data = data.iloc[:, 1:]

# Per-column lower and upper quartiles.
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)

# Keep only the columns whose two quartiles differ.
numeric_data = numeric_data.loc[:, Q1.ne(Q3)]

# Flag every value outside [-1e6, 1e6] and compute, per column, the fraction
# of flagged values.
out_of_range = numeric_data.lt(-1e6) | numeric_data.gt(1e6)
out_of_range_proportion = out_of_range.mean()
threshold = 0.5
# Columns with more than `threshold` of their values out of range are dropped.
columns_to_keep = out_of_range_proportion.loc[lambda share: share <= threshold].index
numeric_data = numeric_data[columns_to_keep]

# Blank out the remaining out-of-range values, then drop every row that
# contains at least one missing value.
in_range = numeric_data.ge(-1e6) & numeric_data.le(1e6)
numeric_data = numeric_data.where(in_range).dropna()

In [None]:
# Draw 15 columns at random (fixed seed, so the same columns are picked on
# every run).
random_columns = numeric_data.sample(axis=1, n=15, random_state=42)

# Summary statistics with one row per sampled column, saved to disk.
describe_transposed = random_columns.describe().transpose()
describe_transposed.to_csv('results/random_columns_summary.csv')

### TDC Datasets prediction

In [None]:
import os
import json
import pandas as pd

# Directory with the TDC results (one JSON file per model configuration),
# obtained with smile-to-bert-tdc.
directory = "results_paper/"
# os.listdir returns entries in arbitrary order; sorting makes the column
# order of df_total reproducible across runs and machines.
json_files = sorted(f for f in os.listdir(directory) if f.endswith('.json'))

# Column label -> parsed JSON content of the corresponding file.
data = {}
for json_name in json_files:
    # Build the path from `directory` so the two can never drift apart.
    filename = os.path.join(directory, json_name)
    with open(filename, "r") as file:
        json_data = json.load(file)

    # The underscore-separated tokens of the file stem determine the label.
    list_names = json_name.replace('.json', '').split('_')
    if len(list_names) == 3:
        column_name = f"{list_names[0]} with {list_names[1]} -- {list_names[2]}"
    elif len(list_names) == 2:
        # Two-token names carry no trailing size token; the label uses 256.
        column_name = f"{list_names[1]} with {list_names[0]} -- 256"
    elif len(list_names) >= 4:
        # Tokens 0, 2 and 3 are used; token 1 does not appear in the label.
        column_name = f"Combined {list_names[0]} and Smile-to-Bert with {list_names[2]} -- {list_names[3]}"
    else:
        # A stem without any underscore cannot be mapped to a label; fail
        # with a clear message instead of an IndexError.
        raise ValueError(
            f"Unexpected result file name {json_name!r}: expected at least "
            "two underscore-separated parts"
        )
    data[column_name] = json_data

# One column per result file; the index comes from the keys of each JSON
# object.
df_total = pd.DataFrame(data)

In [None]:
from tdc.metadata import admet_metrics
from tdc.single_pred import ADME
from tdc import utils
import pandas as pd
from tdc.benchmark_group import admet_group

# Names of all datasets in the TDC 'ADMET_Group' benchmark.
adme_datasets = utils.retrieve_benchmark_names('ADMET_Group')

# The benchmark group handle does not depend on the dataset, so it is built
# once here instead of being re-created on every loop iteration.
group = admet_group(path='admet/')

# dataset name -> [task, total number of samples (train_val + test), metric]
adme_info_dict = {}

for dataset_name in adme_datasets:
    metric = admet_metrics[dataset_name]

    # Datasets scored with MAE or Spearman are labelled as regression; every
    # other metric is labelled as classification.
    task = "regression" if metric in {"mae", "spearman"} else "classification"

    benchmark = group.get(dataset_name)
    train, test = benchmark['train_val'], benchmark['test']
    # Only the combined size is needed, so the two splits are not concatenated.
    sample_size = len(train) + len(test)

    adme_info_dict[dataset_name] = [task, sample_size, metric]

In [None]:
import numpy as np
# One row per dataset with its task type, total sample size and metric.
df_adme_info = pd.DataFrame.from_dict(adme_info_dict, orient="index")
df_adme_info.columns = ['Task', 'Sample size', 'Metric']
# Left merge on the index keeps every dataset. The metadata columns come
# first, followed by one column per entry of df_total.
df_final = df_adme_info.merge(df_total, left_index=True, right_index=True, how="left")

# Number of leading metadata columns; everything after them is a result.
n_info_cols = df_adme_info.shape[1]

# Column positions to keep: all metadata columns plus, for every dataset, the
# position of its best-scoring result column.
list_idxs = list(range(n_info_cols))
for _, row in df_final.iterrows():
    # Element 0 of each result cell is the value that gets compared
    # (presumably the mean score -- confirm against the JSON producer).
    values = np.array([val[0] for val in row.iloc[n_info_cols:]])
    # Lower is better for MAE; higher is better for every other metric.
    # argmin/argmax return the first position on ties.
    if row['Metric'] == 'mae':
        best_idx = values.argmin()
    else:
        best_idx = values.argmax()
    list_idxs.append(int(best_idx) + n_info_cols)

# Sets have no guaranteed iteration order; sorting keeps the metadata columns
# in front and the selected result columns in their original order.
unique_values = sorted(set(list_idxs))

In [None]:
# Explicit copy: the selection is modified below, and writing into an
# un-copied slice of df_final raises SettingWithCopyWarning.
df_best = df_final.iloc[:, unique_values].copy()

# The first three columns of df_final are metadata; every other selected
# column holds result cells, which are indexed with [0] and [1] below.
info_columns = set(df_final.columns[:3])
for column in df_best.columns:
    if column in info_columns:
        continue
    # Render each result cell as "<element 0> (<element 1>)" with 3 decimals.
    df_best[column] = df_best[column].map(
        lambda pair: f"{pair[0]:.3f} ({pair[1]:.3f})"
    )

df_best.to_csv('results/tdc_results.csv')
df_final.to_csv('results/alltdc_results.csv')