In [8]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tabulate import tabulate

# Read the data

In [None]:
# Location of the preprocessed workbook, relative to this notebook.
all_data_path = "../../data/preprocessed_data/preprocessed_dataset_XO.xlsx"

# Only the 'Preprocessed_dataset' sheet is loaded.
dataset = pd.read_excel(io=all_data_path, sheet_name="Preprocessed_dataset")

# Number of rows in the loaded frame (displayed as the cell output).
dataset.shape[0]

483

In [13]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Unnamed: 0,CID,SMILES,IC50(nM),aid,Type,Substructure
0,190,C1=NC2=NC=NC(=C2N1)N,10890.0,287937,inactive,3
1,471,C1=CC(=C(C=C1C2C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,100000.0,399340,inactive,14
2,675,CC1=CC2=C(C=C1C)N=CN2,200000.0,287937,inactive,16
3,938,C1=CC(=CN=C1)C(=O)O,518230.0,1444598,inactive,16
4,4947,CCCOC(=O)C1=CC(=C(C(=C1)O)O)O,628000.0,378145,inactive,2


# 1. Profile of the data

In [39]:
def check_activity_distribution(dataset: pd.DataFrame, col_name: str) -> None:
    """Print a table with the count and percentage of active/inactive rows.

    Rows are classified by exact comparison of ``dataset[col_name]`` with the
    strings ``"active"`` and ``"inactive"``. Rows holding any other value are
    counted in neither column but still contribute to the percentage
    denominator, which is the total number of rows in ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to profile.
    col_name : str
        Name of the column holding the activity label.

    Returns
    -------
    None
        The table is printed with ``tabulate`` using the ``fancy_grid`` format.
    """
    labels = dataset[col_name]
    n_active = int((labels == "active").sum())
    n_inactive = int((labels == "inactive").sum())
    total = len(dataset)

    def _percentage(count):
        # An empty frame has no meaningful share: report 0.0 instead of
        # raising ZeroDivisionError.
        return count / total * 100 if total else 0.0

    table = [
        ['', 'Active', 'Inactive'],
        ['Number', n_active, n_inactive],
        ['Percentage (%)', _percentage(n_active), _percentage(n_inactive)],
    ]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [40]:
# Report the overall size first, then the active/inactive breakdown.
total_rows = len(dataset)
print('All dataset size: ', total_rows)
print("All dataset activity distribution:")
check_activity_distribution(dataset, 'Type')

All dataset size:  483
All dataset activity distribution:
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 217      │   266      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  44.9275 │    55.0725 │
╘════════════════╧══════════╧════════════╛


# 2. Train / test / validation split

In [None]:
# Destination of the split workbook (used by the "Write to file" cell).
train_test_validation_data_path = "../../data/train_test_validation_data/"
train_test_validation_file_name = "XO_train_test_validation_data.xlsx"

# Target shares of the *whole* dataset; the remaining rows form the training set.
VALIDATION_FRACTION = 0.15
TEST_FRACTION = 0.15
SPLIT_SEED = 42  # single seed shared by both stages so the split is reproducible

labels = dataset['Type']

# Stage 1: hold out the validation set, stratified on the activity label.
train_test_data, validation_dataset, train_test_labels, validation_labels = train_test_split(
    dataset,
    labels,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
    stratify=labels,
)

# Stage 2: carve the test set out of what is left. The fraction is rescaled
# because it now applies to the remaining (1 - VALIDATION_FRACTION) of the data:
# 0.15 / 0.85 ~= 0.1765, the value that was previously hard-coded here.
train_dataset, test_dataset, train_labels, test_labels = train_test_split(
    train_test_data,
    train_test_labels,
    test_size=TEST_FRACTION / (1 - VALIDATION_FRACTION),
    random_state=SPLIT_SEED,
    stratify=train_test_labels,
)

# Invariant: the three splits together account for every row exactly once.
assert len(train_dataset) + len(test_dataset) + len(validation_dataset) == len(dataset)

In [30]:
# Row counts in the order: train, validation, test.
split_sizes = [len(part) for part in (train_dataset, validation_dataset, test_dataset)]
print(*split_sizes)

337 73 73


In [31]:
# Preview the first five rows of the training split.
train_dataset.head(n=5)

Unnamed: 0,CID,SMILES,IC50(nM),aid,Type,Substructure
21,65275,COC1=NC(=NC2=C1NC=N2)N,200000.0,287937,inactive,3
77,5318255,C1=CC(=C(C=C1C2=CC(=O)C3=C(O2)C(=C(C=C3O)O)O[C...,100000.0,1453375,inactive,11
295,137660639,COC1=C2C(=C(C=C1)/C=C/C(=O)N[C@H](CC3=CNC4=CC=...,60000.0,1433308,inactive,2
397,156780235,CC(C)CCN1C=C(C2=C1C=CC(=C2)C3=NC=CC(=O)N3)C#N,600.0,1888100,active,3
20,40634,CC1=C(C2=C(CCC(O2)(C)C(=O)O)C(=C1O)C)C,1000000.0,295041,inactive,16


In [41]:
# Print the active/inactive breakdown of each split, one table per split.
split_reports = (
    ("Train dataset activity distribution:", train_dataset),
    ("Test dataset activity distribution:", test_dataset),
    ("Validation dataset activity distribution:", validation_dataset),
)
for heading, split in split_reports:
    print(heading)
    check_activity_distribution(split, 'Type')
# Trailing blank line after the last table.
print()

Train dataset activity distribution:
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │ 151      │   186      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  44.8071 │    55.1929 │
╘════════════════╧══════════╧════════════╛
Test dataset activity distribution:
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  33      │    40      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  45.2055 │    54.7945 │
╘════════════════╧══════════╧════════════╛
Validation dataset activity distribution:
╒════════════════╤══════════╤════════════╕
│                │   Active │   Inactive │
╞════════════════╪══════════╪════════════╡
│ Number         │  33      │    40      │
├────────────────┼──────────┼────────────┤
│ Percentage (%) │  45.2055 │    54.7945 │
╘════════════════╧═══════

In [None]:
# Write to file
# Join directory and file name with pathlib so the result does not depend on
# the directory string ending in a path separator.
output_path = Path(train_test_validation_data_path) / train_test_validation_file_name
# Create the target directory when it is missing; otherwise the write fails.
output_path.parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    # One sheet per split; index=False keeps the pandas row index out of the workbook.
    for sheet_name, split in (
        ('train_dataset', train_dataset),
        ('test_dataset', test_dataset),
        ('validation_dataset', validation_dataset),
    ):
        split.to_excel(writer, sheet_name=sheet_name, index=False)