# 熵权法

In [3]:
import matplotlib.pyplot as plt
import scipy as sp
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd

plt.rcParams["font.sans-serif"] = ["SimHei"]  #设置字体
plt.rcParams["axes.unicode_minus"] = False  #该语句解决图像中的“-”负号的乱码问题

In [28]:
data = pd.read_csv('topsis.csv')
data

Unnamed: 0,var1,var2,var3,var4,var5,var6
0,171.33,151.33,0.28,0.0,106.36,0.05
1,646.66,370.0,1.07,61.0,1686.79,1.64
2,533.33,189.66,0.59,0.0,242.31,0.57
3,28.33,0.0,0.17,0.0,137.85,2.29
4,620.0,234.0,0.88,41.33,428.33,0.13
5,192.33,177.66,0.16,0.0,128.68,1.07
6,111.0,94.0,0.18,0.0,234.27,0.22
7,291.0,654.0,1.21,65.66,2.26,0.0
8,421.33,247.0,0.7,0.0,0.4,0.0
9,193.0,288.66,0.16,0.0,0.0,0.0


第一步：数据标准化。类似`sklearn.preprocess.Standard`的方法。之后计算出每一项所占的比重

In [10]:
for column in data.columns:
    data[column] = data[column].apply(
        lambda x: (x - np.min(data[column])) / (np.max(data[column] - np.min(data[column]))))
    sum = np.sum(data[column])
    data[column] = data[column].apply(lambda x: x / sum)
data

Unnamed: 0,var1,var2,var3,var4,var5,var6
0,0.015741,0.018075,0.011945,0.0,0.011487,0.003497
1,0.068063,0.044192,0.056883,0.032755,0.182181,0.114685
2,0.055588,0.022653,0.029579,0.0,0.026171,0.03986
3,0.0,0.0,0.005688,0.0,0.014888,0.16014
4,0.065129,0.027948,0.046075,0.022193,0.046262,0.009091
5,0.018052,0.021219,0.005119,0.0,0.013898,0.074825
6,0.0091,0.011227,0.006257,0.0,0.025302,0.015385
7,0.028914,0.078112,0.064846,0.035257,0.000244,0.0
8,0.04326,0.029501,0.035836,0.0,4.3e-05,0.0
9,0.018126,0.034477,0.005119,0.0,0.0,0.0


第二步：求各个指标的信息熵。
公式如下：
$$
e_j = -K \sum_{i=1}^m y_{ij} lny_{ij}, \quad e_j>0 \\
K = \frac{1}{ln m}, \quad 其中，m为影响因素的个数
$$

In [18]:
K = 1 / np.log(data.columns.size)
data.columns.size

6

In [16]:
x = np.array(data)
x
# x.shape

array([[1.57408549e-02, 1.80745196e-02, 1.19453925e-02, 0.00000000e+00,
        1.14873629e-02, 3.49650350e-03],
       [6.80632364e-02, 4.41919795e-02, 5.68828214e-02, 3.27550193e-02,
        1.82180979e-01, 1.14685315e-01],
       [5.55883337e-02, 2.26525698e-02, 2.95790671e-02, 0.00000000e+00,
        2.61705802e-02, 3.98601399e-02],
       [0.00000000e+00, 0.00000000e+00, 5.68828214e-03, 0.00000000e+00,
        1.48884259e-02, 1.60139860e-01],
       [6.51286127e-02, 2.79484411e-02, 4.60750853e-02, 2.21928680e-02,
        4.62615848e-02, 9.09090909e-03],
       [1.80524490e-02, 2.12193164e-02, 5.11945392e-03, 0.00000000e+00,
        1.38980243e-02, 7.48251748e-02],
       [9.09997534e-03, 1.12271516e-02, 6.25711035e-03, 0.00000000e+00,
        2.53022237e-02, 1.53846154e-02],
       [2.89136388e-02, 7.81123097e-02, 6.48464164e-02, 3.52572880e-02,
        2.44090262e-04, 0.00000000e+00],
       [4.32598320e-02, 2.95011323e-02, 3.58361775e-02, 0.00000000e+00,
        4.32018162e-05, 

In [21]:
x[1,:]

array([0.06806324, 0.04419198, 0.05688282, 0.03275502, 0.18218098,
       0.11468531])

In [0]:
li = []
for i in range(x.shape[0]):
    e_i = -(K*np.sum(x[i,:]))
    li.append(e_i)

其中$e_i$就是计算得出的30项目标的熵值。
接下来，计算每一项指标的差异系数。
差异系数：
$$
g_i = 1-e_i
$$

In [23]:
li = [1-i for i in li]

最后，计算各项指标的权重。
$$
W_i = \frac{g_i}{ \sum_{1}^{m} g_i
$$

In [24]:
sum_weight = np.sum(li)
weight = [x/sum_weight for x in li]
weight_dic = dict(list(enumerate(weight)))
weight_dic

{0: 0.03204436051114899,
 1: 0.03609870383397827,
 2: 0.033091290604232194,
 3: 0.03315484240593499,
 4: 0.03348788841848215,
 5: 0.0327142283108847,
 6: 0.03210477044535342,
 7: 0.03340158603222835,
 8: 0.03248769185378638,
 9: 0.03201638769108399,
 10: 0.03260216921356548,
 11: 0.033164288359362075,
 12: 0.03347735912388226,
 13: 0.03350260424119412,
 14: 0.04056684808243422,
 15: 0.031814995228932004,
 16: 0.03291474091340054,
 17: 0.03352262534322613,
 18: 0.033112932580859525,
 19: 0.034119381922857885,
 20: 0.033374737608058085,
 21: 0.032916176543524606,
 22: 0.0319328541220345,
 23: 0.032657465324992176,
 24: 0.032750728327984126,
 25: 0.0318064593403426,
 26: 0.03909389901859041,
 27: 0.0315458565520355,
 28: 0.03248827199962576,
 29: 0.03203385604598457}

In [27]:
sorted(weight_dic.items(),key=lambda x:x[1])

[(27, 0.0315458565520355),
 (25, 0.0318064593403426),
 (15, 0.031814995228932004),
 (22, 0.0319328541220345),
 (9, 0.03201638769108399),
 (29, 0.03203385604598457),
 (0, 0.03204436051114899),
 (6, 0.03210477044535342),
 (8, 0.03248769185378638),
 (28, 0.03248827199962576),
 (10, 0.03260216921356548),
 (23, 0.032657465324992176),
 (5, 0.0327142283108847),
 (24, 0.032750728327984126),
 (16, 0.03291474091340054),
 (21, 0.032916176543524606),
 (2, 0.033091290604232194),
 (18, 0.033112932580859525),
 (3, 0.03315484240593499),
 (11, 0.033164288359362075),
 (20, 0.033374737608058085),
 (7, 0.03340158603222835),
 (12, 0.03347735912388226),
 (4, 0.03348788841848215),
 (13, 0.03350260424119412),
 (17, 0.03352262534322613),
 (19, 0.034119381922857885),
 (1, 0.03609870383397827),
 (26, 0.03909389901859041),
 (14, 0.04056684808243422)]

In [5]:
data['var2'][1]

0.04419197951403153

In [7]:
data.index.size

30

In [0]:
import pandas as pd
import numpy as np
import math
from numpy import array

# 1读取数据
df = pd.read_csv('topsis.csv')
# 2数据预处理 ,去除空值的记录
df.dropna()


#定义熵值法函数
def cal_weight(x):
    '''熵值法计算变量的权重'''
    # 标准化
    x = x.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x))))

    # 求k
    rows = x.index.size  # 行
    cols = x.columns.size  # 列
    k = 1.0 / math.log(rows)

    lnf = [[None] * cols for i in range(rows)]

    # 矩阵计算--
    # 信息熵
    # p=array(p)
    x = array(x)
    lnf = [[None] * cols for i in range(rows)]
    lnf = array(lnf)
    for i in range(0, rows):
        for j in range(0, cols):
            if x[i][j] == 0:
                lnfij = 0.0
            else:
                p = x[i][j] / x.sum(axis=0)[j]
                lnfij = math.log(p) * p * (-k)
            lnf[i][j] = lnfij
    lnf = pd.DataFrame(lnf)
    E = lnf

    # 计算冗余度
    d = 1 - E.sum(axis=0)
    # 计算各指标的权重
    w = [[None] * 1 for i in range(cols)]
    for j in range(0, cols):
        wj = d[j] / sum(d)
        w[j] = wj
        # 计算各样本的综合得分,用最原始的数据

    w = pd.DataFrame(w)
    return w


w = cal_weight(df)  # 调用cal_weight
w.index = df.columns
w.columns = ['weight']
print(w)
print('运行完成!')

**使用PCA主成分分析查看结果**

In [29]:
from sklearn.decomposition import KernelPCA,PCA
data

Unnamed: 0,var1,var2,var3,var4,var5,var6
0,171.33,151.33,0.28,0.0,106.36,0.05
1,646.66,370.0,1.07,61.0,1686.79,1.64
2,533.33,189.66,0.59,0.0,242.31,0.57
3,28.33,0.0,0.17,0.0,137.85,2.29
4,620.0,234.0,0.88,41.33,428.33,0.13
5,192.33,177.66,0.16,0.0,128.68,1.07
6,111.0,94.0,0.18,0.0,234.27,0.22
7,291.0,654.0,1.21,65.66,2.26,0.0
8,421.33,247.0,0.7,0.0,0.4,0.0
9,193.0,288.66,0.16,0.0,0.0,0.0


In [36]:
x = np.array(data).T
x.shape

(6, 30)

In [37]:
sc = StandardScaler()
x_std = sc.fit_transform(x)

In [38]:
x_std

array([[ 1.34851825,  0.31143692,  1.92895303,  0.0044562 ,  1.6974762 ,
         1.28083244,  0.44139176,  0.50846021,  1.87451844,  0.96470121,
        -0.28542013,  1.41569074, -0.42316005,  0.00591501,  0.93809845,
         0.97760041,  0.15274958,  1.37311093, -0.28865815,  0.92650054,
         0.77182414,  1.58039574,  1.57506145,  1.21350535,  1.59596715,
         1.33502942,  1.71630549,  1.46269832, -0.20877823,  1.17558519],
       [ 1.07819737, -0.15313248,  0.14811394, -0.56081554,  0.05621805,
         1.10846993,  0.24247001,  2.02160645,  0.81955151,  1.78356587,
        -0.15408428,  1.41240562, -0.26432991,  0.44510435,  1.40888802,
         1.7748882 , -0.26858645,  1.4444496 , -0.34723174,  0.0554214 ,
         0.43065974,  1.14723817,  0.34282364,  1.59748758,  1.21493831,
         1.49047628,  0.46603095,  1.36441521, -0.25570446, -0.09036235],
       [-0.9634011 , -0.77264231, -0.83161442, -0.55742351, -0.93499982,
        -0.97703421, -0.85534402, -0.69951403, -0

In [39]:
pca_model = PCA(n_components=5)
x_pca = pca_model.fit_transform(x_std)

In [40]:
x_pca

array([[ 5.74395233e+00, -1.91449022e+00, -1.78866672e+00,
        -1.91069723e-01, -9.89359886e-06],
       [ 4.13228825e+00, -3.10486483e+00,  1.93859932e+00,
         6.06503280e-02, -3.96961530e-05],
       [-4.21271139e+00, -5.76275075e-01, -3.26323097e-02,
        -6.61870793e-01,  2.34193793e-02],
       [-3.55993803e+00, -1.07723478e+00, -3.81174951e-01,
         1.38096004e+00, -6.11927111e-05],
       [ 2.11188579e+00,  7.22159886e+00,  2.97294961e-01,
         7.79484444e-02,  6.40480399e-05],
       [-4.21547695e+00, -5.48733961e-01, -3.34202970e-02,
        -6.66618292e-01, -2.33726449e-02]])

**注意到主成分分析与熵权法的差异：主成分分析不会给出原数据各项指标的重要程度，仅仅会直接给出处理后的topK项重要特征。这些特征，具备相关性低（可以通过协方差矩阵或者是数据的相关性计算得出）、代表性高的特点。
但是熵权法从信息论的角度出发，可以给出每一项指标的重要程度。**
接下来尝试一下随机森林方法。

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
data = pd.read_csv('topsis.csv')

In [5]:
data_arr = np.array(data)

In [7]:
data_arr

array([[1.71330e+02, 1.51330e+02, 2.80000e-01, 0.00000e+00, 1.06360e+02,
        5.00000e-02],
       [6.46660e+02, 3.70000e+02, 1.07000e+00, 6.10000e+01, 1.68679e+03,
        1.64000e+00],
       [5.33330e+02, 1.89660e+02, 5.90000e-01, 0.00000e+00, 2.42310e+02,
        5.70000e-01],
       [2.83300e+01, 0.00000e+00, 1.70000e-01, 0.00000e+00, 1.37850e+02,
        2.29000e+00],
       [6.20000e+02, 2.34000e+02, 8.80000e-01, 4.13300e+01, 4.28330e+02,
        1.30000e-01],
       [1.92330e+02, 1.77660e+02, 1.60000e-01, 0.00000e+00, 1.28680e+02,
        1.07000e+00],
       [1.11000e+02, 9.40000e+01, 1.80000e-01, 0.00000e+00, 2.34270e+02,
        2.20000e-01],
       [2.91000e+02, 6.54000e+02, 1.21000e+00, 6.56600e+01, 2.26000e+00,
        0.00000e+00],
       [4.21330e+02, 2.47000e+02, 7.00000e-01, 0.00000e+00, 4.00000e-01,
        0.00000e+00],
       [1.93000e+02, 2.88660e+02, 1.60000e-01, 0.00000e+00, 0.00000e+00,
        0.00000e+00],
       [8.23300e+01, 1.18000e+02, 1.10000e-01, 0.0

In [9]:
data_arr = data_arr.T

In [10]:
random_forest = RandomForestClassifier(
    n_estimators=64,
    random_state=0
)