In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
import numpy as np
import math


In [2]:
class Fisher_myself():

    def __init__(self):        
        breast_cancer = load_breast_cancer()
        x = breast_cancer['data']
        y = breast_cancer['target']
        # 随机采样，将10%的数据作为测试样本
        x_train, x_test, self.y_train, self.y_test = train_test_split(x, y, random_state=11, test_size=0.2)
        
        # 标准化处理
        ss = StandardScaler()
        # 分别对训练和测试数据的特征以及目标值进行标准化处理
        self.x_train = ss.fit_transform(x_train)
        self.x_test = ss.fit_transform(x_test)


    def get_mean_vector(self, target):
        """
        此函数用于求均值向量
        :param target:
        :return:
        """
        m_target_list = [0 for i in range(self.x_train.shape[1])]
        count = 0
        for i in range(self.x_train.shape[0]):
            if self.y_train[i] == target:
                count = count + 1
                temp = self.x_train[i].tolist()
                m_target_list = [m_target_list[j] + temp[j] for j in range(self.x_train.shape[1])]
        m_target_list = [x / count for x in m_target_list]
        # 其实可以用类似torch的压缩维度的函数直接求和
        return m_target_list
    
    
    def get_dispersion_matrix(self, target, mean_vector):
        """
        求样本内离散度矩阵
        :param target:
        :param mean_vector:
        :return:
        """
        s_target_matrix = np.zeros((self.x_train.shape[1], self.x_train.shape[1]))
        for i in range(self.x_train.shape[0]):
            if self.y_train[i] == target:
                temp = np.multiply(self.x_train[i] - mean_vector, (self.x_train[i] - mean_vector).transpose())
                s_target_matrix = s_target_matrix + temp
        return s_target_matrix
    
    
    def get_sample_divergence(self, mean_vector1, mean_vector2):
        """
        求样本间离散度
        :param mean_vector1:
        :param mean_vector2:
        :return:
        """
        return np.multiply((mean_vector1 - mean_vector2), (mean_vector1 - mean_vector2).transpose())
    
    
    def get_w_star(self, dispersion_matrix, mean_vector1, mean_vector2):
        """
        求Fisher准则函数的w_star解
        :param dispersion_matrix:
        :param mean_vector1:
        :param mean_vector2:
        :return:
        """
        return np.matmul(np.linalg.inv(dispersion_matrix), (mean_vector1 - mean_vector2))
    
    
    def get_sample_projection(self, w_star, x):
        """
        求一特征向量在w_star上的投影
        :param w_star:
        :param x:
        :return:
        """
        return np.matmul(w_star.transpose(), x)
    
    
    def get_segmentation_threshold(self, w_star, way_flag):
        """
        求分割阈值
        :param w_star:
        :param way_flag:
        :return:
        """
        if way_flag == 0:
            y0_list = []
            y1_list = []
            for i in range(self.x_train.shape[0]):
                if self.y_train[i] == 0:
                    y0_list.append(self.get_sample_projection(w_star, self.x_train[i]))
                else:
                    y1_list.append(self.get_sample_projection(w_star, self.x_train[i]))
            ny0 = len(y0_list)
            ny1 = len(y1_list)
            my0 = sum(y0_list) / ny0
            my1 = sum(y1_list) / ny1
            segmentation_threshold = (ny0 * my0 + ny1 * my1) / (ny0 + ny1)
            return segmentation_threshold
        elif way_flag == 1:
            y0_list = []
            y1_list = []
            for i in range(self.x_train.shape[0]):
                if self.y_train[i] == 0:
                    y0_list.append(self.get_sample_projection(w_star, self.x_train[i]))
                else:
                    y1_list.append(self.get_sample_projection(w_star, self.x_train[i]))
            ny0 = len(y0_list)
            ny1 = len(y1_list)
            my0 = sum(y0_list) / ny0
            my1 = sum(y1_list) / ny1
            py0 = ny0 / (ny0 + ny1)
            py1 = ny1 / (ny0 + ny1)
            segmentation_threshold = (my0 + my1) / 2 + math.log(py0 / py1) / (ny0 - ny1 - 2)
            return segmentation_threshold
        else:
            return 0
    
    
    def test_single_smaple(self, w_star, y0, test_sample, test_target):
        """
        单例测试
        :param y0:
        :param x:
        :return:
        """
        self.y_test = self.get_sample_projection(w_star, test_sample)
        predection = 1
        if self.y_test > y0:
            predection = 0
        print("This x_vector's target is {}, and the predection is {}".format(test_target, predection))
    
    
    def test_single_smaple_check(self, w_star, y0, test_sample, test_target):
        '''
        单例测试（用于统计）
        :param y0:
        :param x:
        :return:
        '''
        y_test = self.get_sample_projection(w_star, test_sample)
        predection = 1
        if y_test > y0:
            predection = 0
        if test_target == predection:
            return True
        else:
            return False
    
    
    def test_check(self, w_star, y0):
        """
        统计测试样本
        :param w_star:
        :param y0:+6
        :return:
        """
        right_count = 0
        for i in range(self.x_test.shape[0]):
            boolean = self.test_single_smaple_check(w_star, y0, test_sample=self.x_test[i], test_target=self.y_test[i])
            if boolean == True:
                right_count = right_count + 1
        return self.x_test.shape[0], right_count, right_count / self.x_test.shape[0]

In [3]:
fisher = Fisher_myself()
m0 = np.array(fisher.get_mean_vector(0)).reshape(-1, 1)
m1 = np.array(fisher.get_mean_vector(1)).reshape(-1, 1)
s0 = fisher.get_dispersion_matrix(0, m0)
s1 = fisher.get_dispersion_matrix(1, m1)
sw = s0 + s1
sb = fisher.get_sample_divergence(m0, m1)
w_star = np.array(fisher.get_w_star(sw, m0, m1)).reshape(-1, 1)
y0 = fisher.get_segmentation_threshold(w_star, 0)
print("The segmentation_threshold is ", y0)
test_sum, right_sum, accuracy = fisher.test_check(w_star, y0)
print("Total specimen number:{}\nNumber of correctly predicted samples:{}\nAccuracy:{}\n".format(test_sum, right_sum,accuracy))

The segmentation_threshold is  [-4.45309235e-17]
Total specimen number:114
Number of correctly predicted samples:98
Accuracy:0.8596491228070176

