In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ (force_remount=True is passed explicitly).
drive.mount('/content/drive/',force_remount=True)
# Base folder that later cells prepend to the train/val/test JSON file names.
data_path = "/content/drive/MyDrive/Colab Notebooks/DL/"  # edit this to match your own Drive folder

Mounted at /content/drive/


In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# 1. 读取数据，处理label2的格式
def load_data(file_path):
    """Load a JSON dataset and return its four parallel columns.

    The file must hold a JSON array of objects, each carrying the keys
    ``text1``, ``label1``, ``text2`` and ``label2``. ``label2`` is a
    ``;``-separated string that is split into a list of labels per record.

    Surrounding whitespace is stripped from every ``label2`` entry and empty
    entries (produced by a trailing or doubled ``;``) are discarded, so they
    never show up as bogus classes when the labels are binarized later.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A tuple ``(texts1, labels1, texts2, labels2)`` of equally long lists;
        ``labels2`` is a list of label lists (empty list when ``label2`` is
        empty or missing a value).

    Raises:
        KeyError: If a record lacks one of the four expected keys.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts1, labels1, texts2, labels2 = [], [], [], []
    for item in data:
        texts1.append(item['text1'])
        labels1.append(item['label1'])
        texts2.append(item['text2'])
        # A falsy label2 (empty string / None) means "no secondary labels".
        raw_label2 = item['label2'] or ''
        stripped = (part.strip() for part in raw_label2.split(';'))
        labels2.append([label for label in stripped if label])
    return texts1, labels1, texts2, labels2

# Locations of the three dataset splits, relative to the Drive folder in data_path
train_file, val_file, test_file = (
    data_path + relative_name
    for relative_name in (
        'train_data/train_data.json',
        'train_data/val_data.json',
        'test_data/test_data.json',  # test split
    )
)

# Load the training, validation and test splits (in that order)
(
    (train_texts1, train_labels1, train_texts2, train_labels2),
    (val_texts1, val_labels1, val_texts2, val_labels2),
    (test_texts1, test_labels1, test_texts2, test_labels2),
) = (load_data(split_file) for split_file in (train_file, val_file, test_file))

# 2. Feature extraction: one independent TF-IDF vectorizer per text field,
#    each capped at 5000 features; their outputs are concatenated later.
tfidf1, tfidf2 = (TfidfVectorizer(max_features=5000) for _ in range(2))



In [None]:
# Training features: the vectorizers learn their vocabularies from the training texts only
X_train1, X_train2 = tfidf1.fit_transform(train_texts1), tfidf2.fit_transform(train_texts2)
X_train_combined = np.hstack([block.toarray() for block in (X_train1, X_train2)])

# Validation features: reuse the fitted vectorizers (transform only)
X_val1, X_val2 = tfidf1.transform(val_texts1), tfidf2.transform(val_texts2)
X_val_combined = np.hstack([block.toarray() for block in (X_val1, X_val2)])

# Test features: reuse the fitted vectorizers (transform only)
X_test1, X_test2 = tfidf1.transform(test_texts1), tfidf2.transform(test_texts2)
X_test_combined = np.hstack([block.toarray() for block in (X_test1, X_test2)])

# 3. Labels: binarize the multi-label targets; the class set is learned from the training labels
mlb = MultiLabelBinarizer()
y_train2 = mlb.fit_transform(train_labels2)
y_val2, y_test2 = (mlb.transform(label_lists) for label_lists in (val_labels2, test_labels2))





In [None]:
# 4. Train the models
# Task 1: single-label classification on the concatenated TF-IDF features
svm1 = LinearSVC(max_iter=1000).fit(X_train_combined, train_labels1)

# Task 2: multi-label classification, one linear SVM per label (one-vs-rest)
svm2 = OneVsRestClassifier(LinearSVC(max_iter=1000)).fit(X_train_combined, y_train2)

# 5. Predict on the test set with both models
y_pred1_test, y_pred2_test = (clf.predict(X_test_combined) for clf in (svm1, svm2))

# 6. 自定义评价指标
def custom_acc(y_true_main, y_pred_main, y_true_other, y_pred_other, mlb):
    """Average a combined single-label / multi-label score over all samples.

    Per sample the score is ``0.5 * main + 0.5 * other`` where

    * ``main`` is 1 when the predicted main label equals the true one, else 0;
    * ``other`` is ``|true ∩ predicted| / |true|`` over the secondary labels,
      or 0 when the sample has no true secondary labels. Extra (false
      positive) predictions do not lower the score.

    Args:
        y_true_main: Sequence of true main labels.
        y_pred_main: Sequence of predicted main labels, same length.
        y_true_other: Sequence of iterables with the true secondary labels.
        y_pred_other: Per-sample 0/1 indicator rows (array or nested list)
            whose columns line up with ``mlb.classes_``.
        mlb: Object exposing ``classes_``, the label for each indicator column.

    Returns:
        The mean score as a float; ``0.0`` when there are no samples.
    """
    N = len(y_true_main)
    if N == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    # Hoisted out of the loop; asarray also lets classes_ be a plain list.
    classes = np.asarray(mlb.classes_)
    total_score = 0.0

    for i in range(N):
        # Single-label task: is the main label correct?
        main_correct = 1 if y_pred_main[i] == y_true_main[i] else 0

        # Multi-label task: overlap between true and predicted label sets.
        true_labels = set(y_true_other[i])
        # asarray makes the element-wise comparison work for list rows too
        # (a bare list == 1 would be a single False and drop every prediction).
        indicator_row = np.asarray(y_pred_other[i])
        pred_labels = set(classes[np.where(indicator_row == 1)[0]])
        intersection = len(true_labels & pred_labels)
        num_true_other = len(true_labels)

        # Samples without true secondary labels contribute 0 for this part.
        other_score = intersection / num_true_other if num_true_other > 0 else 0.0

        total_score += 0.5 * main_correct + 0.5 * other_score

    # Mean score over all samples
    return total_score / N

# 7. Evaluate on the test set with the custom metric
test_acc = custom_acc(
    y_true_main=test_labels1,
    y_pred_main=y_pred1_test,
    y_true_other=test_labels2,
    y_pred_other=y_pred2_test,
    mlb=mlb,
)
print(f"Test Set Custom Accuracy (Acc): {test_acc:.4f}")

# 8. Show the predictions for (at most) the first five test samples
print("\nTest Set Sample Predictions:")
for i in range(len(test_texts1[:5])):
    # Map the 0/1 indicator row back to label names
    pred_labels2 = mlb.classes_[np.nonzero(y_pred2_test[i] == 1)[0]].tolist()
    print(
        f"Sample {i+1}:",
        f"Text1: {test_texts1[i]}",
        f"Predicted Label1: {y_pred1_test[i]}, True Label1: {test_labels1[i]}",
        f"Text2: {test_texts2[i]}",
        f"Predicted Label2: {pred_labels2}",
        f"True Label2: {test_labels2[i]}",
        "-" * 50,
        sep="\n",
    )

Test Set Custom Accuracy (Acc): 0.5900

Test Set Sample Predictions:
Sample 1:
Text1: 疾病：冠状动脉粥样硬化性心脏病，糖尿病，冠心病不稳定型心绞痛，肝炎，高血压，结核，心绞痛，冠状动脉支架，胆囊息肉，囊肿，双肺细支气管炎，脂肪肝；症状：畸形，隆起，咳痰，咳嗽，大汗，对称，胸痛，喘，头痛，凹陷，胸闷，黑朦，憨喘，疼痛，头晕，放射痛，心绞痛，水肿，囊肿，结节，狭窄；检查项目：呼吸音，P，体重，湿性啰音，BP，心率，R1，T3，查体，载脂蛋白E，血细胞分析，血脂，血小板体积，肌钙蛋白I，酶法，化验，脂蛋白，小密低密度脂蛋白，大血小板比率，血清肌钙蛋白I，色谱法，甘油三酯，电解质葡萄糖测定，血小板压积，糖化血红蛋白测定，单核细胞计数，肝功，同型半胱氨酸，病理性杂音，血小板分布宽度，甲功五项，超声检查。于2024-11-11在介入诊疗中心行冠状动脉造影术。常规消毒，铺洞巾，局部麻醉，右侧动脉穿刺，行左右冠状动脉造影，感染标志物系列，动静脉超声，尿常规，颈动脉超声，CRP，L↑，补体C1q，BNP，动脉造影，血沉，糖化血红蛋白，胸部CT，降钙素原，大便分析，HBsAb，沉渣，心脏彩超，尿肾功，凝血常规，HCV-Ab，CMIA，心律，反跳痛，压痛；治疗方法：阿司匹林，血脂康；身体部位：双肺，心前区，各瓣膜听诊区，背部，痰，大小便，肩背部，胸廓，腹部，双下肢，血小板，脂肪肝，室间隔基底段，大便，脾，左前降支，双下肢动脉内中膜，室壁，胆，左冠，左主干，肝，胰，主动脉瓣，双侧颈动脉内中膜，右冠状动脉，腹，双下肢足背动脉，右冠；药物：血脂康，麝香保心丸，阿司匹林，硝酸甘油，他汀；手术：冠状动脉支架植入，冠状动脉造影术，球囊扩张术
Predicted Label1: I20.000, True Label1: I21.401
Text2: 疾病：肝炎，高血压，糖尿病，传染病，结核，胆囊息肉，囊肿，脂肪肝，双肺细支气管炎；症状：囊肿，结节，节段性室壁运动异常；室间隔基底段增厚，狭窄，感染，闭塞，出血，发黑；检查项目：大血小板比率，肌钙蛋白I，肝功，电解质葡萄糖测定，色谱法，血小板分布宽度，血小板压积，血清肌钙蛋白I测定，血脂肾功，酶法，血小板体积，血细胞分析，糖化血红蛋白测定，化验，单核细胞计数，糖化血红蛋

### 使用交叉验证做调参

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV

# 1. 读取数据，处理label2的格式
def load_data(file_path):
    """Load a JSON dataset and return its four parallel columns.

    The file must hold a JSON array of objects, each carrying the keys
    ``text1``, ``label1``, ``text2`` and ``label2``. ``label2`` is a
    ``;``-separated string that is split into a list of labels per record.

    Surrounding whitespace is stripped from every ``label2`` entry and empty
    entries (produced by a trailing or doubled ``;``) are discarded, so they
    never show up as bogus classes when the labels are binarized later.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A tuple ``(texts1, labels1, texts2, labels2)`` of equally long lists;
        ``labels2`` is a list of label lists (empty list when ``label2`` is
        empty or missing a value).

    Raises:
        KeyError: If a record lacks one of the four expected keys.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    texts1, labels1, texts2, labels2 = [], [], [], []
    for item in data:
        texts1.append(item['text1'])
        labels1.append(item['label1'])
        texts2.append(item['text2'])
        # A falsy label2 (empty string / None) means "no secondary labels".
        raw_label2 = item['label2'] or ''
        stripped = (part.strip() for part in raw_label2.split(';'))
        labels2.append([label for label in stripped if label])
    return texts1, labels1, texts2, labels2

# Locations of the three dataset splits, relative to the Drive folder in data_path
train_file, val_file, test_file = (
    data_path + relative_name
    for relative_name in (
        'train_data/train_data.json',
        'train_data/val_data.json',
        'test_data/test_data.json',  # test split
    )
)

# Load the training, validation and test splits (in that order)
(
    (train_texts1, train_labels1, train_texts2, train_labels2),
    (val_texts1, val_labels1, val_texts2, val_labels2),
    (test_texts1, test_labels1, test_texts2, test_labels2),
) = (load_data(split_file) for split_file in (train_file, val_file, test_file))

# Merge training and validation data; cross-validation runs over the union
combined_texts1 = [*train_texts1, *val_texts1]
combined_labels1 = [*train_labels1, *val_labels1]
combined_texts2 = [*train_texts2, *val_texts2]
combined_labels2 = [*train_labels2, *val_labels2]

# 2. Feature extraction: one TF-IDF vectorizer per text field, 5000 features each
tfidf1, tfidf2 = (TfidfVectorizer(max_features=5000) for _ in range(2))

# Features of the merged data (used for cross-validation).
# Note: the vectorizers are fitted here, i.e. on train + validation texts.
X_combined1, X_combined2 = tfidf1.fit_transform(combined_texts1), tfidf2.fit_transform(combined_texts2)
X_combined = np.hstack([block.toarray() for block in (X_combined1, X_combined2)])

# Training-set features (used for the final fit); transform only
X_train1, X_train2 = tfidf1.transform(train_texts1), tfidf2.transform(train_texts2)
X_train_combined = np.hstack([block.toarray() for block in (X_train1, X_train2)])

# Test-set features; transform only
X_test1, X_test2 = tfidf1.transform(test_texts1), tfidf2.transform(test_texts2)
X_test_combined = np.hstack([block.toarray() for block in (X_test1, X_test2)])

# 3. Labels: the binarizer learns its class set from the merged train + validation labels
mlb = MultiLabelBinarizer()
y_combined2 = mlb.fit_transform(combined_labels2)
y_train2, y_test2 = (mlb.transform(label_lists) for label_lists in (train_labels2, test_labels2))

# 4. Hyper-parameter tuning with 5-fold cross-validation
# Task 1: single-label classification, selected by accuracy
param_grid = {'C': [0.1, 1, 10, 100]}  # candidate values for C
svm1 = LinearSVC(max_iter=1000)
grid_search1 = GridSearchCV(
    estimator=svm1,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_combined, combined_labels1)
print(f"Best parameters for Task 1 (Single-label): {grid_search1.best_params_}")

# Task 2: multi-label classification (one-vs-rest wrapper), selected by micro-averaged F1
svm2_base = LinearSVC(max_iter=1000)
svm2 = OneVsRestClassifier(estimator=svm2_base)
param_grid_multi = {'estimator__C': [0.1, 1, 10, 100]}
grid_search2 = GridSearchCV(
    estimator=svm2,
    param_grid=param_grid_multi,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
).fit(X_combined, y_combined2)
print(f"Best parameters for Task 2 (Multi-label): {grid_search2.best_params_}")

# 5. Refit with the selected C values, on the training split only (not train + validation)
# Task 1: best_params_ holds exactly the tuned 'C' entry
svm1_final = LinearSVC(max_iter=1000, **grid_search1.best_params_).fit(
    X_train_combined, train_labels1
)

# Task 2: the tuned value is stored under the nested parameter name
svm2_final = OneVsRestClassifier(
    LinearSVC(max_iter=1000, C=grid_search2.best_params_['estimator__C'])
).fit(X_train_combined, y_train2)

# 6. Predict on the test set with both final models
y_pred1_test, y_pred2_test = (
    clf.predict(X_test_combined) for clf in (svm1_final, svm2_final)
)

# 7. 自定义评价指标
def custom_acc(y_true_main, y_pred_main, y_true_other, y_pred_other, mlb):
    """Average a combined single-label / multi-label score over all samples.

    Per sample the score is ``0.5 * main + 0.5 * other`` where

    * ``main`` is 1 when the predicted main label equals the true one, else 0;
    * ``other`` is ``|true ∩ predicted| / |true|`` over the secondary labels,
      or 0 when the sample has no true secondary labels. Extra (false
      positive) predictions do not lower the score.

    Args:
        y_true_main: Sequence of true main labels.
        y_pred_main: Sequence of predicted main labels, same length.
        y_true_other: Sequence of iterables with the true secondary labels.
        y_pred_other: Per-sample 0/1 indicator rows (array or nested list)
            whose columns line up with ``mlb.classes_``.
        mlb: Object exposing ``classes_``, the label for each indicator column.

    Returns:
        The mean score as a float; ``0.0`` when there are no samples.
    """
    N = len(y_true_main)
    if N == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    # Hoisted out of the loop; asarray also lets classes_ be a plain list.
    classes = np.asarray(mlb.classes_)
    total_score = 0.0
    for i in range(N):
        main_correct = 1 if y_pred_main[i] == y_true_main[i] else 0
        true_labels = set(y_true_other[i])
        # asarray makes the element-wise comparison work for list rows too
        # (a bare list == 1 would be a single False and drop every prediction).
        indicator_row = np.asarray(y_pred_other[i])
        pred_labels = set(classes[np.where(indicator_row == 1)[0]])
        intersection = len(true_labels & pred_labels)
        num_true_other = len(true_labels)
        # Samples without true secondary labels contribute 0 for this part.
        other_score = intersection / num_true_other if num_true_other > 0 else 0.0
        total_score += 0.5 * main_correct + 0.5 * other_score
    return total_score / N

# 8. Evaluate on the test set with the custom metric
test_acc = custom_acc(
    y_true_main=test_labels1,
    y_pred_main=y_pred1_test,
    y_true_other=test_labels2,
    y_pred_other=y_pred2_test,
    mlb=mlb,
)
print(f"Test Set Custom Accuracy (Acc): {test_acc:.4f}")

# 9. Show the predictions for (at most) the first five test samples
print("\nTest Set Sample Predictions:")
for i in range(len(test_texts1[:5])):
    # Map the 0/1 indicator row back to label names
    pred_labels2 = mlb.classes_[np.nonzero(y_pred2_test[i] == 1)[0]].tolist()
    print(
        f"Sample {i+1}:",
        f"Text1: {test_texts1[i]}",
        f"Predicted Label1: {y_pred1_test[i]}, True Label1: {test_labels1[i]}",
        f"Text2: {test_texts2[i]}",
        f"Predicted Label2: {pred_labels2}",
        f"True Label2: {test_labels2[i]}",
        "-" * 50,
        sep="\n",
    )



Best parameters for Task 1 (Single-label): {'C': 0.1}
Best parameters for Task 2 (Multi-label): {'estimator__C': 10}
Test Set Custom Accuracy (Acc): 0.6025

Test Set Sample Predictions:
Sample 1:
Text1: 疾病：冠状动脉粥样硬化性心脏病，糖尿病，冠心病不稳定型心绞痛，肝炎，高血压，结核，心绞痛，冠状动脉支架，胆囊息肉，囊肿，双肺细支气管炎，脂肪肝；症状：畸形，隆起，咳痰，咳嗽，大汗，对称，胸痛，喘，头痛，凹陷，胸闷，黑朦，憨喘，疼痛，头晕，放射痛，心绞痛，水肿，囊肿，结节，狭窄；检查项目：呼吸音，P，体重，湿性啰音，BP，心率，R1，T3，查体，载脂蛋白E，血细胞分析，血脂，血小板体积，肌钙蛋白I，酶法，化验，脂蛋白，小密低密度脂蛋白，大血小板比率，血清肌钙蛋白I，色谱法，甘油三酯，电解质葡萄糖测定，血小板压积，糖化血红蛋白测定，单核细胞计数，肝功，同型半胱氨酸，病理性杂音，血小板分布宽度，甲功五项，超声检查。于2024-11-11在介入诊疗中心行冠状动脉造影术。常规消毒，铺洞巾，局部麻醉，右侧动脉穿刺，行左右冠状动脉造影，感染标志物系列，动静脉超声，尿常规，颈动脉超声，CRP，L↑，补体C1q，BNP，动脉造影，血沉，糖化血红蛋白，胸部CT，降钙素原，大便分析，HBsAb，沉渣，心脏彩超，尿肾功，凝血常规，HCV-Ab，CMIA，心律，反跳痛，压痛；治疗方法：阿司匹林，血脂康；身体部位：双肺，心前区，各瓣膜听诊区，背部，痰，大小便，肩背部，胸廓，腹部，双下肢，血小板，脂肪肝，室间隔基底段，大便，脾，左前降支，双下肢动脉内中膜，室壁，胆，左冠，左主干，肝，胰，主动脉瓣，双侧颈动脉内中膜，右冠状动脉，腹，双下肢足背动脉，右冠；药物：血脂康，麝香保心丸，阿司匹林，硝酸甘油，他汀；手术：冠状动脉支架植入，冠状动脉造影术，球囊扩张术
Predicted Label1: I20.000, True Label1: I21.401
Text2: 疾病：肝炎，高血压，糖尿病，传染病，结核，胆囊息肉，囊肿，脂肪肝，双肺细支气管炎；症状：囊肿，结节，节段性室壁运动异常；室间隔基底段增厚，