In [1]:
import numpy as np
import pandas as pd
from numpy import sqrt, e, log
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
# from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.font_manager
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors


In [2]:
class DecisionTreeClassifierWithWeight:
    """Weighted decision stump (single feature, single threshold) for labels in {-1, +1}.

    After ``fit`` the chosen rule is stored in:
      * ``best_fea_id`` - column index of the selected feature,
      * ``best_thres``  - threshold on that feature,
      * ``best_op``     - 1 means "predict +1 when feature >= threshold",
                          0 means "predict +1 when feature <  threshold",
      * ``best_err``    - weighted error of that rule on the training data.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Restore the 'nothing fitted yet' state."""
        self.best_err = 1  # lowest weighted error found so far
        self.best_fea_id = 0  # index of the best feature
        self.best_thres = 0  # best threshold for that feature
        self.best_op = 1  # direction of the rule: 1 -> '>=', 0 -> '<'

    @staticmethod
    def _stump_predict(feature, thres, op):
        """Apply one threshold rule to a feature column and return labels in {-1, +1}."""
        if op == 1:
            return 2 * (feature >= thres) - 1
        return 2 * (feature < thres) - 1

    def fit(self, X, y, sample_weight=None):
        """Exhaustively search every feature / midpoint threshold / direction.

        Parameters
        ----------
        X : 2-D array-like, one row per sample.
        y : 1-D array-like of labels in {-1, +1}.
        sample_weight : optional 1-D array-like; defaults to uniform 1/len(X).

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if sample_weight is None:
            sample_weight = np.ones(len(X)) / len(X)
        sample_weight = np.asarray(sample_weight, dtype=float)

        # Forget any previous fit: otherwise a stale low ``best_err`` from an
        # earlier call would block every candidate of this call.
        self._reset()

        for fea_id in range(X.shape[1]):
            feature = X[:, fea_id]
            fea_unique = np.unique(feature)  # np.unique already returns sorted values
            # Candidate thresholds are the midpoints between consecutive distinct values.
            for thres in (fea_unique[:-1] + fea_unique[1:]) / 2:
                for op in (0, 1):
                    y_ = self._stump_predict(feature, thres, op)
                    err = np.sum((y_ != y) * sample_weight)
                    # Strict '<' keeps the first rule found among ties.
                    if err < self.best_err:
                        self.best_err = err
                        self.best_op = op
                        self.best_fea_id = fea_id
                        self.best_thres = thres
        return self

    def predict(self, X):
        """Return the stump's {-1, +1} prediction for every row of ``X``."""
        feature = X[:, self.best_fea_id]
        return self._stump_predict(feature, self.best_thres, self.best_op)

    def score(self, X, y, sample_weight=None):
        """Return accuracy on (X, y).

        With ``sample_weight`` the result is the weighted sum of correct
        predictions (not re-normalised); without it, the plain mean accuracy.
        """
        y_pre = self.predict(X)
        if sample_weight is not None:
            return np.sum((y_pre == y) * sample_weight)
        return np.mean(y_pre == y)

In [3]:
class _AdaBoostClassifier:
    """AdaBoost over weighted decision stumps for labels in {-1, +1}.

    Besides the standard boosting loop, the class carries experimental
    neighbourhood diagnostics (``cal_similarity``, ``cal_d``,
    ``cal_deviation``) computed in a 2-D PCA projection of the data.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds.
    verbose : bool
        When True, ``fit`` prints per-round diagnostics. Off by default
        because the diagnostics are expensive and flood the output.
    """

    def __init__(self, n_estimators=50, verbose=False):
        self.n_estimators = n_estimators
        self.verbose = verbose
        self.estimators = []
        self.alphas = []
        self.betas = []
        self.k = 5  # number of neighbours used by the diagnostics
        self.d = 1  # PCA dimension used for ``train_PCA``
        self.train_PCA = np.zeros((self.d,))
        self.test_PCA = np.empty((0,) * self.d)

    def cal_similarity(self, X, y):
        """Per sample, the fraction of its nearest neighbours sharing its label.

        ``X`` is projected to 2-D with PCA and ``self.k`` neighbours are
        queried per point. The first returned neighbour is dropped as "the
        point itself", so ``self.k - 1`` neighbours are actually compared.
        NOTE(review): with duplicated points the first neighbour is not
        guaranteed to be the point itself - confirm this is acceptable.

        Returns a list with one ratio per sample.
        """
        y = np.asarray(y)
        X_pca = PCA(n_components=2).fit_transform(X)
        if self.verbose:
            print("降维完成")
        nbrs = NearestNeighbors(n_neighbors=self.k, algorithm='auto').fit(X_pca)
        _, indices = nbrs.kneighbors(X_pca)
        neighbor_labels = y[indices[:, 1:]]  # drop column 0 (assumed to be the point itself)
        return list(np.mean(neighbor_labels == y[:, None], axis=1))

    def _nearest_in_class(self, s, pred_y, c, PCA_x):
        """Indices of the (up to) ``self.k`` samples predicted as ``c`` closest to sample ``s``.

        Closeness is the Manhattan (cityblock) distance between full rows of
        ``PCA_x``. Returns an empty index array when no sample is predicted as ``c``.
        """
        members = np.where(np.asarray(pred_y) == c)[0]
        if members.size == 0:
            return members
        # Compare the whole point ``s`` against every member of the class.
        dists = cdist(PCA_x[s].reshape(1, -1), PCA_x[members], 'cityblock').ravel()
        return members[np.argsort(dists, kind="stable")[:self.k]]

    def cal_d(self, s, pred_y, c, PCA_x):
        """Mean absolute per-coordinate difference between ``s`` and its nearest class-``c`` points.

        Parameters
        ----------
        s : int, row index of the reference sample in ``PCA_x``.
        pred_y : 1-D array of predicted labels aligned with ``PCA_x``.
        c : label value selecting the class to search.
        PCA_x : 2-D array of projected samples.

        Returns ``nan`` when no sample is predicted as ``c``.
        """
        PCA_x = np.asarray(PCA_x)
        nearest_indices = self._nearest_in_class(s, pred_y, c, PCA_x)
        if nearest_indices.size == 0:
            return np.nan
        return np.mean(np.abs(PCA_x[nearest_indices] - PCA_x[s]))

    def _deviation(self, s, pred_y, c, PCA_x):
        """``cal_deviation`` on an already projected data set (avoids refitting PCA)."""
        neighbors_index = self._nearest_in_class(s, pred_y, c, PCA_x)
        if neighbors_index.size == 0:
            return np.nan
        own_d = self.cal_d(s, pred_y, c, PCA_x)
        neighbors_d = np.mean([self.cal_d(i, pred_y, c, PCA_x) for i in neighbors_index])
        return np.exp(-own_d / neighbors_d)

    def cal_deviation(self, s, pred_y, c, X):
        """exp(-d(s) / mean d(neighbours)) for sample ``s`` relative to class ``c``.

        ``X`` is projected to 2-D with PCA; ``d`` is ``cal_d`` and the
        neighbours are the nearest class-``c`` points of ``s``. Returns ``nan``
        when no sample is predicted as ``c``.
        """
        PCA_x = PCA(n_components=2).fit_transform(X)
        return self._deviation(s, pred_y, c, PCA_x)

    def _print_diagnostics(self, round_idx, X, y_pred, alpha):
        """Print the per-round debug output (only used when ``verbose`` is set)."""
        print("_______________________分界线______________________")
        print(X)
        print(y_pred)
        print(alpha)
        if round_idx > 40:
            self.cal_similarity(X, y_pred)
            # Project once and reuse it for every inspected sample.
            PCA_x = PCA(n_components=2).fit_transform(X)
            for i in range(1, min(50, len(X))):  # never index past the data
                print("______i______")
                print(self._deviation(i, y_pred, 1, PCA_x))
                print(self._deviation(i, y_pred, -1, PCA_x))

    def fit(self, X, y):
        """Train ``n_estimators`` decision stumps with AdaBoost re-weighting.

        Parameters
        ----------
        X : 2-D array-like of features.
        y : 1-D array-like of labels in {-1, +1}.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from a clean ensemble so that fit() can be called repeatedly.
        self.estimators = []
        self.alphas = []
        self.betas = []

        self.train_PCA = PCA(n_components=self.d).fit_transform(X).flatten()

        sample_weight = np.ones(len(X)) / len(X)  # uniform initial weights 1/N
        eps = np.finfo(float).eps
        for round_idx in range(self.n_estimators):
            dtc = DecisionTreeClassifierWithWeight().fit(X, y, sample_weight)  # weak learner
            # Keep the error strictly inside (0, 1): a perfect stump would
            # otherwise give alpha = inf and turn every weight into NaN.
            err = np.clip(dtc.best_err, eps, 1 - eps)
            alpha = 1/2 * np.log((1 - err) / err)  # vote weight of this learner
            y_pred = dtc.predict(X)

            if self.verbose:
                self._print_diagnostics(round_idx, X, y_pred, alpha)

            sample_weight *= np.exp(-alpha * y_pred * y)  # up-weight misclassified samples
            sample_weight /= np.sum(sample_weight)  # renormalise to sum 1
            self.estimators.append(dtc)
            self.alphas.append(alpha)
            # Placeholder for a second weighting coefficient (beta); currently always 0.
            beta = 0
            self.betas.append(beta)
        return self

    def predict(self, X):
        """Return the alpha-weighted majority vote in {-1, +1} for every row of ``X``.

        A weighted sum of exactly 0 maps to -1.

        Raises
        ------
        RuntimeError
            If the model holds no fitted estimators.
        """
        if not self.estimators:
            raise RuntimeError("predict() called before fit(): no estimators available")
        # One column per weak learner; sized by what was actually fitted.
        votes = np.empty((len(X), len(self.estimators)))
        for i, estimator in enumerate(self.estimators):
            votes[:, i] = estimator.predict(X)
        votes = votes * np.array(self.alphas)
        return 2 * (np.sum(votes, axis=1) > 0) - 1

    def score(self, X, y):
        """Return the mean accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return np.mean(y_pred == y)
    
    #—————————自己加的函数——————
    #def cal_similarity(center_point，Neighbors)
    

In [4]:
def convert_zero(x):
    """Return -1 when ``x`` equals 0; otherwise return ``x`` unchanged."""
    return -1 if x == 0 else x

# Load the data slice and recode the binary target from {0, 1} to {-1, +1},
# the label convention expected by the boosting code.
df = pd.read_csv('dataslice.csv')
if 'HeartDiseaseorAttack' not in df.columns:
    # Report the header that was actually read so a wrong file, separator or
    # column name is obvious from the error message.
    raise KeyError(
        "'HeartDiseaseorAttack' column not found in dataslice.csv; "
        f"available columns: {list(df.columns)}"
    )
df['HeartDiseaseorAttack'] = df['HeartDiseaseorAttack'].astype(int).apply(convert_zero)

KeyError: 'HeartDiseaseorAttack'

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Separate the features from the {-1, +1} target.
X = df.drop(columns=['HeartDiseaseorAttack'])
y = df['HeartDiseaseorAttack']

# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpacking the labels the other way round pairs each feature set with the
# wrong labels. A fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
#print(X_train)

In [None]:
# Fit the boosted stumps on the training half and report accuracy on the held-out half.
(
    _AdaBoostClassifier()
    .fit(X_train.values, y_train.values)
    .score(X_test.values, y_test.values)
)


In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
# model_rf = RandomForestClassifier(n_estimators=1000, oob_score=True, n_jobs=-1,
#                                   random_state=50, max_features=None,
#                                   max_leaf_nodes=30)
# #print(X_train)
# model_rf.fit(X_train, y_train)

# # Make predictions
# prediction_test = model_rf.predict(X_test)
# print (metrics.accuracy_score(y_test, prediction_test))

In [None]:
# importances = model_rf.feature_importances_
# weights = pd.Series(importances,
#                  index=X_train.columns.values)
# weights.sort_values()[-50:].plot(kind = 'barh')

In [None]:
# weights = weights.sort_values(ascending=False)
# columns = list(weights.keys())

In [None]:
# len(df.columns)

In [None]:
# error_bounds = []
# errors_diff = []
# exceptions = 0
# indices = []


# for i in tqdm(range(1, len(columns))):
#     cols_to_use = columns[:i+1]
#     df_exp = df[cols_to_use]
#     df_exp['HeartDiseaseorAttack'] = df['HeartDiseaseorAttack']

#     R_test, R_train, error_bound = evaluate_error(df_exp, 0.05)
#     errors_diff.append(abs(R_test - R_train))
#     error_bounds.append(error_bound)
#     indices.append(i)

    

In [None]:
# df_error_bounds_d = pd.DataFrame()
# df_experiment_d = pd.DataFrame()

# #error_bounds_exp = [i/len(error_bounds) for i in error_bounds]
# df_error_bounds_d['d'] = indices
# df_error_bounds_d['error'] = error_bounds
# df_error_bounds_d['error'] = df_error_bounds_d['error'].apply(lambda x: x[0])

# df_experiment_d['d'] = indices
# df_experiment_d['error'] = errors_diff

In [None]:
# error_bounds_sep = [get_error_bound(20, d, len(df)//2, 0.05) for d in tqdm(indices)]

In [None]:
# df_bounds_d = pd.DataFrame()
# df_bounds_d['bound'] = error_bounds_sep
# df_bounds_d.to_csv('bounds_d.csv')

In [None]:
# df_experiment_d['bound'] = error_bounds_sep
# df_experiment_d.to_csv('experiment_d.csv')