In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import gc
import glob
import sys
import random
import string
import tqdm
import json
import time
import sqlite3
import warnings
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.spe2vec import Corpus

from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

from multiprocessing import Pool

from fastai import *
from fastai.text import *
#from utils import *
import torch

# The helper modules live outside the notebook folder, so their directory is
# appended to the module search path before `supp_utils` is imported.
supp_script_path = '../../supp_scripts'
sys.path.append(supp_script_path)
import supp_utils as su

# Default compute device: the GPU when CUDA is usable, otherwise the CPU.
# The trailing tuple is the cell's displayed value.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device,torch.cuda.is_available()

(device(type='cuda'), True)

In [5]:
# Silence Python warnings: first the deprecation notices, then every other
# category (`Warning` is the category filterwarnings uses by default).
for ignored_category in (DeprecationWarning, Warning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Raise the RDKit logger threshold so only critical messages are printed
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [6]:
# JSON file that holds every user-tunable setting of this run
parameter_filename = "parameters_molpmofit.json" 

In [7]:
# Whole section is to read parameters from the parameter file.
# The context manager guarantees the file handle is released even if the
# JSON is malformed.
with open(parameter_filename) as parameter_file:
    parameters = json.load(parameter_file)

# User inputs
input_file_train = parameters["input_file_train"] # input file
input_file_test = parameters["input_file_test"] # input file

trial = parameters["trial"] # setting False saves the output files else not saved

# The run folder is needed even in trial mode: it is created below and later
# cells use it as the working directory for the data bunches and model files.
# For a real run the key stays mandatory; a trial run falls back to a scratch
# folder when the key is absent.
if not trial:
    run_folder = parameters["run_folder"]
else:
    run_folder = parameters.get("run_folder", "trial_run")

# GPU selection. The id is converted only after the missing/null check, so an
# absent or null "gpu_id" falls back to GPU 0 instead of failing in int().
raw_gpu_id = parameters.get("gpu_id")
gpu_id = 0 if raw_gpu_id is None else int(raw_gpu_id)
if torch.cuda.is_available():
    device = "cuda:" + str(gpu_id)
    torch.cuda.set_device(gpu_id)
else:
    # No CUDA device visible: calling set_device would raise, stay on the CPU
    device = "cpu"

k_fold_value = int(parameters["k_fold_value"]) # Number of folds

label_wise_augmentation = parameters["augmentation"]["label_wise_augmentation"]
number_of_augmentation = int(parameters["augmentation"]["number_of_augmentation"])
iteration = int(parameters["augmentation"]["iteration"])

tokenization = parameters["tokens"]["tokenization"] # options are SPE,atomwise,vocab_file
if tokenization == "SPE":
    spe_token_path = parameters["tokens"]["spe_token_path"]
else:
    spe_token_path = ""

#####################
# Network parameters#
#####################
load_model = parameters["pretrained_model"]["load_model"]
#if load_model is True set the path for pretrained_model_path
pretrained_model_path = parameters["pretrained_model"]["pretrained_model_path"]
pretraining_new_wt = parameters["pretrained_model"]["pretraining_new_wt"]
pretraining_new_vocab = parameters["pretrained_model"]["pretraining_new_vocab"]

batch_size = int(parameters["network_parameters"]["batch_size"])
enable_class_weight = parameters["network_parameters"]["enable_class_weight"]

Number_of_workers = int(parameters["Number_of_workers"])


##################
### Do not edit###
##################
# Create the run folder without going through a shell: this is safe for paths
# containing spaces, portable, and a no-op when the folder already exists.
os.makedirs(str(run_folder), exist_ok=True)

# Tokenization mode flags derived from the `tokenization` setting
train_SPE = (tokenization == "SPE")
atomwise_tokenization = not train_SPE

# Keep a copy of the settings next to the outputs of a real (non-trial) run
if not trial:
    with open(str(run_folder) + "/network_parameters.txt","w",1) as network_parameter_output:
        for parameter in parameters:
            network_parameter_output.write(str(parameter) + " = " + str(parameters[parameter]) + "\n")

In [8]:
def read_smiles_label_file(filename):
    """Read a whitespace-separated ``SMILES label`` file into a dict.

    The first two fields of every line are used as key (SMILES) and value
    (label); blank lines are skipped. When a SMILES occurs more than once the
    last label wins. The returned dict is ordered by label value.
    """
    smiles_label = {}
    # `with` closes the file; the original left the handle open
    with open(filename, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # tolerate empty lines (e.g. a trailing newline at end of file)
                continue
            smiles_label[fields[0]] = fields[1]
    return dict(sorted(smiles_label.items(), key=lambda item: item[1]))


# Reading train_valid and test splits from the file and make it to a dataframe
smiles_label_test = read_smiles_label_file(input_file_test)
smiles_label_train = read_smiles_label_file(input_file_train)

train_valid_df = su.dict_to_label(smiles_label_train)
test_df = su.dict_to_label(smiles_label_test)
# Shuffle the test rows (unseeded, so the order differs between runs)
test_df = test_df.sample(frac=1).reset_index(drop=True)

# Working directory handed to the fastai data bunches / learners
data_path = Path(run_folder)
#name = 'classification_new'
path = data_path
path.mkdir(exist_ok=True, parents=True)

# Release any memory left over from earlier cells / runs
gc.collect()
torch.cuda.empty_cache()

def get_accuracy(yhat,y):
    """Return the fraction of rows of `yhat` whose arg-max equals the label in `y`.

    `yhat` is a 2-D tensor of per-class scores (one row per sample) and `y` a
    tensor of integer class labels. The scores are exponentiated before the
    arg-max is taken, and the result is a plain Python float.
    """
    scores = torch.exp(yhat.float()).cpu().detach().numpy()
    predicted_labels = scores.argmax(axis=1)
    true_labels = y.cpu().detach().numpy()
    # element-wise comparison: True wherever the prediction hits the label
    hits = (true_labels == predicted_labels)
    return np.count_nonzero(hits) / len(hits)

In [10]:
def evaluate_saved_model(split_label, eval_data, train_data, tok, vocab, model_name, log_file=None):
    """Reload a saved classifier and report its metrics on one data split.

    A fresh learner is built on a data bunch whose training part is
    `train_data` and whose validation part is `eval_data`; the weights saved
    under `model_name` are loaded into it and predictions are taken on
    `eval_data`.

    Parameters
    ----------
    split_label : str
        Name used in the log lines ("Train", "Valid" or "Test").
    eval_data, train_data : DataFrame
        Frames with 'Smiles' and 'Label' columns.
    tok, vocab
        Tokenizer and vocabulary that were used for training.
    model_name : str
        Name the classifier was saved under with `learner.save`.
    log_file : file object or None
        When given, accuracy, loss and the classification report are written
        to it; otherwise (trial run) a one-line summary is printed instead.

    Returns
    -------
    (accuracy, loss, report) : tuple of str
    """
    eval_data_clas = TextClasDataBunch.from_df(path, train_data, eval_data, bs=batch_size, tokenizer=tok,
                                               chunksize=50000, text_cols='Smiles',label_cols='Label',
                                               vocab=vocab, max_vocab=60000, include_bos=False)
    learner = text_classifier_learner(eval_data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
    learner.load(model_name, purge=False)
    pred,lbl,loss = learner.get_preds(with_loss=True,ordered=True)

    accuracy = str(get_accuracy(pred,lbl))
    loss = str(sum(loss)/len(loss))
    prob = torch.exp(pred.float()).cpu().detach().numpy()
    predictions = np.argmax(prob, axis=1)
    y_truth = lbl.cpu().detach().numpy()
    target_names = ["class " + str(entry) for entry in range(len(set(y_truth)))]
    report = classification_report(y_truth, predictions, target_names=target_names,digits=7)

    if log_file is not None:
        log_file.write("\n\n\n" + split_label + " data : Accu-" + str(accuracy) + "\tLoss-" + str(loss) + "\n")
        log_file.write(split_label + " data report \n-" + str(report) + "\n\n\n\n\n")
    else:
        # trial run: nothing is saved, so show the headline numbers instead
        print(split_label + " data : Accu-" + str(accuracy) + "\tLoss-" + str(loss))

    # Drop the evaluation learner before the next one is built so that only
    # one model at a time occupies GPU memory.
    del learner, eval_data_clas
    gc.collect()
    torch.cuda.empty_cache()
    return accuracy, loss, report


# The pretrained weight/vocab files and the tokenizer class are the same for
# every fold, so they are resolved once instead of on every iteration.
pretrained_model_path = Path(pretrained_model_path)
pretrained_fnames = [pretraining_new_wt, pretraining_new_vocab]
fnames = [pretrained_model_path/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, ['pth', 'pkl'])]

if tokenization == "SPE":
    MolTokenizer = su.molpmofit.MolTokenizer_spe_sos_eos
else:
    MolTokenizer = su.molpmofit.MolTokenizer_atomwise_sos_eos

for fold in range(k_fold_value):
    print ("FOLD " + str(fold) + "/" + str(k_fold_value))
    # A per-fold log is only kept for real runs; in trial mode log_file is None
    # and every write below is skipped.
    log_file = None if trial else open(str(run_folder) + "/model_" + str(fold) + ".txt","w")
    try:
        piece_count = fold + 1
        # create train and valid dataframe of the current fold
        train,valid,piece_count = su.CV.get_K_fold_cv_data(train_valid_df,k_fold_value,piece_count,shuffle_output=True)
        train_df = pd.DataFrame(train.items(),columns=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)
        valid_df = pd.DataFrame(valid.items(),columns=["Smiles", "Label"]).sample(frac=1).reset_index(drop=True)

        # Validation and test data are never augmented. They are bound here,
        # before the augmentation branch, because that branch logs them.
        valid_data = valid_df
        test_data = test_df

        # calculate class_weight
        if enable_class_weight:
            class_weight = torch.FloatTensor(su.get_class_weight(train_df)).to(device)
            if log_file is not None:
                log_file.write("Class weight for loss (balancing weights)= " + str(class_weight) + "\n")

        if log_file is not None:
            log_file.write("Class distribution before augmentation\n")
            log_file.write("Train data\n")
            log_file.write(str(train_df.groupby('Label').count()) + "\n")
            log_file.write("Valid data\n")
            log_file.write(str(valid_df.groupby('Label').count()) + "\n")
            log_file.write("Test data\n")
            log_file.write(str(test_df.groupby('Label').count()) + "\n")

        # Data augmentation
        if number_of_augmentation > 0:
            if label_wise_augmentation:
                # label wise augmentation list calculation
                train_augmentation_list = su.get_augmentation_list(train_df,number_of_augmentation)
                number_of_augmentation_train = train_augmentation_list
            else:
                number_of_augmentation_train = number_of_augmentation

            # Augmentation
            train_data = su.smiles_augmentation(train_df,
                                                N_rounds=number_of_augmentation_train,
                                                iteration=iteration,
                                                data_set_type="train_data",
                                                Number_of_workers=Number_of_workers)

            if log_file is not None:
                log_file.write("number of augmentation = " + str(number_of_augmentation) + "\n")
                log_file.write("Class distribution after augmentation\n")
                log_file.write("Train data\n")
                log_file.write(str(train_data.groupby('Label').count()) + "\n")
                log_file.write("Valid data\n")
                log_file.write(str(valid_data.groupby('Label').count()) + "\n")
                log_file.write("Test data\n")
                # the test distribution (the original logged valid_data here)
                log_file.write(str(test_data.groupby('Label').count()) + "\n")
        else:
            train_data = train_df

        # Tokenizer initialization
        tok = Tokenizer(partial(MolTokenizer,token_path=spe_token_path), n_cpus=Number_of_workers, pre_rules=[], post_rules=[])

        # databunch for training data for vocab for initializing model
        qsar_vocab = TextLMDataBunch.from_df(path, train_data, valid_data, bs=batch_size, tokenizer=tok,chunksize=50000, text_cols=0,label_cols=1, max_vocab=60000, include_bos=False)

        # Loading and saving pretrained model with required parameter
        lm_learner = language_model_learner(qsar_vocab, AWD_LSTM, drop_mult=1.0)
        lm_learner = lm_learner.load_pretrained(*fnames)
        lm_learner.freeze()
        lm_learner.save_encoder(f'lm_encoder')

        # The language model is only needed to export its encoder; release it
        # before the classifier is created to lower peak GPU memory.
        del lm_learner
        gc.collect()
        torch.cuda.empty_cache()

        # make dataclass (data for classification) using train and valid df
        data_clas = TextClasDataBunch.from_df(path, train_data, valid_data, bs=batch_size, tokenizer=tok,
                                              chunksize=50000, text_cols='Smiles',label_cols='Label',
                                              vocab=qsar_vocab.vocab, max_vocab=60000, include_bos=False)

        cls_learner = text_classifier_learner(data_clas, AWD_LSTM, pretrained=False, drop_mult=0.2)
        # Loading model for finetuning
        cls_learner.load_encoder(f'lm_encoder',device=device)

        if enable_class_weight:
            cls_learner.loss_func = nn.CrossEntropyLoss(weight=class_weight)
        else:
            cls_learner.loss_func = nn.CrossEntropyLoss()

        # Gradual unfreezing: train the head first, then unfreeze one more
        # layer group per stage with a progressively smaller learning rate.
        cls_learner.freeze()
        cls_learner.fit_one_cycle(4, 3e-3, moms=(0.8,0.7))
        cls_learner.freeze_to(-2)
        cls_learner.fit_one_cycle(4, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
        cls_learner.freeze_to(-3)
        cls_learner.fit_one_cycle(4, slice(5e-4/(2.6**4),5e-4), moms=(0.8,0.7))
        cls_learner.unfreeze()
        cls_learner.fit_one_cycle(6, slice(5e-5/(2.6**4),5e-5), moms=(0.8,0.7))

        # Saving model
        split_type = ""
        split_id = "model_" + str(fold)
        model_name = f'{split_type}_{split_id}_clas'
        cls_learner.save(model_name)

        # The evaluation below reloads the model from disk, so the training
        # learner can be released first.
        del cls_learner, data_clas
        gc.collect()
        torch.cuda.empty_cache()

        # Calculating statistics for train, valid and test data on the final model of the fold
        for split_label, split_data in (("Train", train_data), ("Valid", valid_data), ("Test", test_data)):
            evaluate_saved_model(split_label, split_data, train_data, tok, qsar_vocab.vocab, model_name, log_file)
    finally:
        # Close the fold log even when training aborts (e.g. CUDA out of memory)
        if log_file is not None:
            log_file.close()

FOLD 0/10


epoch,train_loss,valid_loss,accuracy,time
0,1.766532,1.712904,0.552716,00:14
1,1.391983,0.996795,0.71885,00:14
2,1.103948,0.733317,0.785942,00:15
3,0.878609,0.637058,0.833866,00:13


epoch,train_loss,valid_loss,accuracy,time
0,0.776049,0.896091,0.757188,00:17
1,0.821616,0.858727,0.744409,00:16
2,0.654546,0.370206,0.910543,00:17
3,0.463467,0.318041,0.904153,00:18


epoch,train_loss,valid_loss,accuracy,time


RuntimeError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 3.95 GiB total capacity; 2.38 GiB already allocated; 74.69 MiB free; 145.08 MiB cached)