# Import Libraries

In [1]:
import os
import csv 
import math
import joblib
import pickle
import warnings
from tqdm import tqdm

import pandas as pd
import numpy as np
import statistics as stat
import matplotlib.pyplot as plt 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings('ignore')

# Splitting Data

## Concatenate PaDEL descriptor data with the target

In [None]:
# Read the descriptor table and the pIC50 target table from CSV
dfPadel = pd.read_csv("data/padel_desc.csv")
dtRawIC50 = pd.read_csv("data/pIC50_target.csv")

# Only the 'pIC50' column of the target table is used
dtIC50 = dtRawIC50.loc[:, 'pIC50']

# Place the target to the right of the descriptors (so it becomes the last
# column), then discard the 'Name' column
df = pd.concat([dfPadel, dtIC50], axis=1).drop('Name', axis=1)

# Optional export of the merged table (disabled)
# df.to_csv (r'data\all_data.csv',index=False, header=True,sep=',')

## Split Train & Test + Pearson Correlation (PCC)

In [None]:
# Directory that receives the pickled splits. The later cells of this notebook
# load from / write to "./p/", so the splits are written there as well;
# the directory is created up front so the dumps below cannot fail on a
# fresh checkout.
PICKLE_DIR = "./p"
os.makedirs(PICKLE_DIR, exist_ok=True)

# Try ten different random splits and keep those whose top-ranked
# descriptor is WTPT-3
for i in range(10):
    # Divide into training data (80 %) and test data (the remaining rows)
    dfTrain = df.sample(frac=0.8, random_state=i)
    dfTest = df.drop(dfTrain.index)

    # Correlation of every descriptor with pIC50 (the last column), computed
    # on the training rows only; keep the 100 largest values.
    # NOTE(review): the ranking uses the signed correlation, so strongly
    # negative descriptors are never selected - confirm this is intended.
    train_corr = dfTrain.corr().iloc[:-1, [-1]]
    train_corr = train_corr.sort_values("pIC50", ascending=False).iloc[:100, :]

    # Check for WTPT-3 only
    if train_corr.index[0] == "WTPT-3":
        print("i: %04d" % i)
        # Dump training and test data
        dfTrain.to_pickle(os.path.join(PICKLE_DIR, "train_%04d.pkl" % i))
        dfTest.to_pickle(os.path.join(PICKLE_DIR, "test_%04d.pkl" % i))
        # Dump correlation data; the index holds the descriptor names, so it
        # is written as a labelled first column instead of being dropped
        train_corr.to_csv(r'data/dataCorr_%04d.pkl.csv' % i,
                          index=True, index_label="descriptor", header=True)
        # Dump list of the (at most) 100 selected descriptor names
        corr_100 = train_corr.index.tolist()
        with open(os.path.join(PICKLE_DIR, 'corr100_%04d.pkl' % i), 'wb') as f:
            pickle.dump(corr_100, f)

# Feature Selection: Simulated Annealing (SA)

## SA Functions

In [2]:
def acceptanceProbability(bestMSE, newMSE, temp, K):
    """Return exp(K * (newMSE - bestMSE) / temp).

    Used by the annealing loop as the probability of accepting a candidate
    whose MSE is not better than the reference one.

    Parameters
    ----------
    bestMSE : float
        MSE of the reference solution.
    newMSE : float
        MSE of the candidate solution.
    temp : float
        Current temperature (divisor of the exponent).
    K : float
        Scaling constant applied to the MSE difference.
    """
    mseGap = newMSE - bestMSE
    exponent = K * mseGap / temp
    return np.exp(exponent)
    
def calcMSE(combDesc, x_train, x_val, y_train, y_val, model):
    """Fit `model` on a subset of descriptor columns and return the validation MSE.

    Parameters
    ----------
    combDesc : sequence of int
        Positional column indices (passed to `.iloc`) of the descriptors to use.
    x_train, x_val : DataFrame
        Training / validation descriptor tables.
    y_train, y_val : array-like
        Training / validation targets.
    model : estimator with `fit` and `predict`
        Refitted in place on every call.

    Returns
    -------
    The value of `mean_squared_error(y_val, prediction)`.
    """
    # Column subset, selected by position, for both partitions
    train_subset = x_train.iloc[:, combDesc]
    val_subset = x_val.iloc[:, combDesc]

    # The min-max scaling is learned on the training subset only and then
    # applied unchanged to the validation subset
    minmax = MinMaxScaler().fit(train_subset)
    train_scaled = minmax.transform(train_subset)
    val_scaled = minmax.transform(val_subset)

    model.fit(train_scaled, y_train)
    return mean_squared_error(y_val, model.predict(val_scaled))

## SA Prepare Data

In [9]:
# Ids (zero-padded in the file names) of the splits stored under ./p/;
# each id has one corr100_XXXX.pkl and one train_XXXX.pkl file.
split_ids = [3, 5, 6, 7, 8]

# NOTE: joblib.load unpickles arbitrary objects - only load files that were
# produced by this notebook.
corrs = [joblib.load('./p/corr100_%04d.pkl' % sid) for sid in split_ids]
trains = [joblib.load('./p/train_%04d.pkl' % sid) for sid in split_ids]

# Individual names are kept so that any cell still referring to them works
cor1, cor2, cor3, cor4, cor5 = corrs
tr1, tr2, tr3, tr4, tr5 = trains

#change index for data
corr_100 = corrs[0]
dft = trains[0]

# Keep only the selected descriptor columns. The explicit copy makes the
# column assignment below operate on an independent frame rather than on a
# slice of `dft`.
train = dft.loc[:, corr_100].copy()
# Target = last column of the loaded training frame
# (presumably pIC50 - verify against the cell that wrote the pickle)
train["pIC50"] = dft.iloc[:, -1]
descName = train.columns.values

# Hold out 20 % of the training rows as a validation set (fixed seed)
trained, val = train_test_split(train, test_size = 0.2, random_state = 10)

# Descriptors = every column but the last; target = last column (kept 2-D)
x_train = trained.iloc[:,:-1]
x_val = val.iloc[:,:-1]
y_train = trained.iloc[:, [-1]]
y_val = val.iloc[:,[-1]]

model = LinearRegression()
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((59, 100), (15, 100), (59, 1), (15, 1))

## SA algorithm

In [11]:
# Sizes of the descriptor subsets to search for
combos = [5,10,15,20,25]

for k in tqdm(range(len(combos))):

    # Simulated annealing for subsets of `descNum` descriptors
    descNum = combos[k]
    descMSE_descNum = []

    # 20 independent restarts per subset size
    for j in tqdm(range(20)):
        # Initialize Values
        initTemp, temp = 100, 100
        tempEnd = 0.1
        cooling_rate = 0.98
        iteration = 50

        # Trace of every accepted move of this restart
        bestList = []       # [MSE, solution] pairs
        bestMSEList = []    # accepted MSE values
        tempChange = []     # temperature at which each move was accepted

        # Initial solution: `descNum` distinct random column positions
        currSol = np.random.choice(x_train.shape[1],descNum, replace=False)
        currSol = list(currSol)
        currSol.sort()

        # First MSE
        currMSE = calcMSE(currSol, x_train, x_val, y_train, y_val, model)

        # The current state may move to worse solutions, so the best solution
        # seen so far is tracked separately. This also guarantees a result
        # even when no candidate is ever accepted.
        bestSol, bestMSE = currSol, currMSE

        while temp > tempEnd:
            for i in range(iteration):

                # Create new solution & new MSE
                newSol = np.random.choice(x_train.shape[1], descNum, replace=False)
                newMSE = calcMSE(newSol, x_train, x_val, y_train, y_val, model)
                deltaMSE = newMSE - currMSE

                if deltaMSE < 0:
                    # New solution is better based on MSE value
                    accept = True
                elif deltaMSE > 0:
                    # Probability to accept a worse solution. With this K the
                    # probability equals 0.8 ** (initTemp / temp).
                    K = (initTemp * np.log(0.8)) / deltaMSE
                    # np.random.rand() draws one uniform number in [0, 1);
                    # np.random.rand(0, 1) would return an EMPTY array and the
                    # comparison could never succeed.
                    accept = acceptanceProbability(currMSE, newMSE, temp, K) > np.random.rand()
                else:
                    # Identical MSE: nothing to gain, and K would divide by zero
                    accept = False

                if accept:
                    currMSE = newMSE
                    currSol = newSol
                    bestList.append([currMSE,currSol])
                    tempChange.append(temp)
                    bestMSEList.append(currMSE)
                    if currMSE < bestMSE:
                        bestMSE, bestSol = currMSE, currSol

            temp *= cooling_rate

        # One row per restart: best MSE, its solution, and the accepted-move traces
        descMSE_descNum.append([bestMSE,bestSol,bestMSEList,tempChange])

    # Extracting results
    df_SA = pd.DataFrame(descMSE_descNum, columns=["MSE","solution","growth", "temp"])
    df_SA.reset_index(drop=True, inplace= True)

    # Sort values (lowest MSE first)
    df_SA_sort = df_SA.sort_values('MSE')

    # Get Descriptors name of the best restart
    bestDescriptor = [descName[i] for i in df_SA_sort.iloc[0,1]]

    bestSAGrowth = df_SA_sort.iloc[0,2]

    df_SA.to_pickle("./p/raw_0003_%02d.pkl"%k)
    # "%02d" - the original "%02.pkl" is not a valid format specifier
    joblib.dump(bestDescriptor, "./p/best_0003_%02d.pkl"%k)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s][A
  5%|████▏                                                                              | 1/20 [01:16<24:15, 76.61s/it][A

KeyboardInterrupt: 