In [1]:
from pyswarm import pso
import numpy as np
import pandas as pd
import warnings 
%matplotlib inline
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
from sklearn import preprocessing
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor

In [2]:
#导入数据，20列特征,543个三个性质好的数据
data_f = pd.read_excel('ADMET_select.xlsx',index_col=0)
data_label = pd.read_excel('data_label_ER.xlsx',index_col=0)

In [3]:
#根据(543,20)数据训练出一个回归器，再使用粒子群算法优化该回归器
rfr = RandomForestRegressor(n_estimators=1000,max_depth=100)
rfr.fit(data_f,data_label['pIC50'])

RandomForestRegressor(max_depth=100, n_estimators=1000)

In [4]:
pre = rfr.predict(np.array(data_f.iloc[1]).reshape(1,-1))

In [7]:
import random
import numpy as np
import matplotlib.pyplot as plt
 
 
class PSO:
    def __init__(self, parameters):
        """
        particle swarm optimization
        parameter: a list type, like [NGEN, pop_size, var_num_min, var_num_max]
        """
        # 初始化
        self.NGEN = parameters[0]    # 迭代的代数
        self.pop_size = parameters[1]    # 种群大小
        self.var_num = 20     # 变量个数
        self.bound = []                 # 变量的约束范围
        self.bound.append(parameters[2])
        self.bound.append(parameters[3])
 
        self.pop_x = np.zeros((self.pop_size, self.var_num))    # 所有粒子的位置
        self.pop_v = np.zeros((self.pop_size, self.var_num))    # 所有粒子的速度
        self.p_best = np.zeros((self.pop_size, self.var_num))   # 每个粒子最优的位置
        self.g_best = np.zeros((1, self.var_num))   # 全局最优的位置
 
        # 初始化第0代初始全局最优解
        temp = -1
        for i in range(self.pop_size):
            for j in range(self.var_num):
                self.pop_x[i][j] = random.uniform(self.bound[0][j], self.bound[1][j])
                self.pop_v[i][j] = random.uniform(0, 1)
            self.p_best[i] = self.pop_x[i]      # 储存最优的个体
            fit = self.fitness(self.p_best[i])
            if fit > temp:
                self.g_best = self.p_best[i]
                temp = fit
 
    def fitness(self, ind_var):
        """
        个体适应值计算
        """
#         x1 = ind_var[0]
#         x2 = ind_var[1]
#         x3 = ind_var[2]
#         x4 = ind_var[3]
#         y = x1 ** 2 + x2 ** 2 + x3 ** 3 + x4 ** 4
        x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20 = ind_var[0],ind_var[1],ind_var[2],ind_var[3],\
        ind_var[4],ind_var[5],ind_var[6],ind_var[7],ind_var[8],ind_var[9],ind_var[10],ind_var[11],ind_var[12],ind_var[13],\
        ind_var[14],ind_var[15],ind_var[16],ind_var[17],ind_var[18],ind_var[19]
        data_pre = np.array([x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,x20]).reshape(1,-1)
        y = rfr.predict(data_pre)
        return float(y)
 
    def update_operator(self, pop_size):
        """
        更新算子：更新下一时刻的位置和速度
        """
        c1 = 2     # 学习因子，一般为2
        c2 = 2
        w = random.uniform(0,1)    # 自身权重因子
        for i in range(pop_size):
            # 更新速度
            self.pop_v[i] = w * self.pop_v[i] + c1 * random.uniform(0, 1) * (
                        self.p_best[i] - self.pop_x[i]) + c2 * random.uniform(0, 1) * (self.g_best - self.pop_x[i])
            # 更新位置
            self.pop_x[i] = self.pop_x[i] + self.pop_v[i]
            # 越界保护
            for j in range(self.var_num):
                if self.pop_x[i][j] < self.bound[0][j]:
                    self.pop_x[i][j] = self.bound[0][j]
                if self.pop_x[i][j] > self.bound[1][j]:
                    self.pop_x[i][j] = self.bound[1][j]
            # 更新p_best和g_best
            if self.fitness(self.pop_x[i]) > self.fitness(self.p_best[i]):
                self.p_best[i] = self.pop_x[i]
            if self.fitness(self.pop_x[i]) > self.fitness(self.g_best):
                self.g_best = self.pop_x[i]
        return w
 
    def main(self,log_save):
        popobj = []
        self.ng_best = np.zeros((1, self.var_num))[0]
        for gen in range(self.NGEN):
            w = self.update_operator(self.pop_size)
            popobj.append(self.fitness(self.g_best))
            print('############ Generation {} ############'.format(str(gen + 1)))
            if self.fitness(self.g_best) > self.fitness(self.ng_best):
                self.ng_best = self.g_best.copy()
            log_save.iloc[gen] = list(self.ng_best).append(self.fitness(self.ng_best))
            print('最好的位置：{}'.format(self.ng_best))
            print('自身权重因子：{}'.format(w))
            print('最大的函数值：{}'.format(self.fitness(self.ng_best)))
        print("---- End of (successful) Searching ----")

In [8]:
NGEN = 100
popsize = 100
low = list(data_f.min())
up = list(data_f.max())
log_save = pd.DataFrame(index=range(0,NGEN))
parameters = [NGEN, popsize, low, up]
pso = PSO(parameters)
pso.main(log_save)

############ Generation 1 ############
最好的位置：[30.6626696   3.39769581  4.40552433  7.44131608 19.84984137  0.38078336
  7.88637358  0.85277215  0.28357227  0.7626169   9.33110112  3.44954303
  5.         14.80390147  2.15719587  0.33110082 89.72002212  1.91429058
  2.50553813 12.95222271]
自身权重因子：0.5973554180559285
最大的函数值：7.129996299720817
############ Generation 2 ############
最好的位置：[ 38.02124667   4.37486191   1.62735962   9.43683428  15.89817683
   0.49774232   8.82194      0.85277215   0.19762131   0.7626169
   8.7039234    5.           5.          16.74829079   0.82253673
   0.38836416 104.96337086  -7.5157835    2.50553813   8.71144499]
自身权重因子：0.7580449567267429
最大的函数值：7.140976153931994
############ Generation 3 ############
最好的位置：[ 3.86033267e+01  4.38454516e+00  1.51690513e+01  0.00000000e+00
  2.90000000e+01  1.84750430e-02  8.82194000e+00  8.52772151e-01
  9.14876326e-02  7.62616899e-01  0.00000000e+00  0.00000000e+00
  5.00000000e+00  1.46246674e+01  0.00000000e+00  3.8836415

############ Generation 22 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.17418694995318884
最大的函数值：7.372073377876611
############ Generation 23 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.23057672635297388
最大的函数值：7.372073377876611
############ Generation 24 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194

############ Generation 42 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.8752644566967565
最大的函数值：7.372073377876611
############ Generation 43 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.379185080819879
最大的函数值：7.372073377876611
############ Generation 44 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000

############ Generation 62 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.5090676486217984
最大的函数值：7.372073377876611
############ Generation 63 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.20887362656653752
最大的函数值：7.372073377876611
############ Generation 64 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.821940

############ Generation 82 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.16574382446339053
最大的函数值：7.372073377876611
############ Generation 83 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194000e+00  8.52772151e-01
  0.00000000e+00  7.62616899e-01  0.00000000e+00  1.58939696e+00
  5.00000000e+00  1.39663131e+01  0.00000000e+00  3.88364158e-01
  1.46039565e+02 -1.17673817e+01  2.50553813e+00  1.22158375e+01]
自身权重因子：0.32544829617986126
最大的函数值：7.372073377876611
############ Generation 84 ############
最好的位置：[ 3.86033267e+01  4.43000000e+00  1.60616761e+01  2.46234648e-02
  2.89766591e+01  8.53213166e-02  8.82194

In [9]:
log_save.to_excel('log_pso.xlsx')