In [None]:
# -*- coding: utf-8 -*-
"""IMDB情感分析极速训练版（Colab GPU优化）"""
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TextVectorization
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Enable mixed precision only when a GPU is actually present. float16 compute
# is what gives the speed-up on GPUs; on a CPU-only runtime it brings no
# benefit and tends to be slower, so fall back to plain float32 there.
policy = tf.keras.mixed_precision.Policy(
    'mixed_float16' if tf.config.list_physical_devices('GPU') else 'float32'
)
tf.keras.mixed_precision.set_global_policy(policy)

def load_local_imdb(data_path, dataset_type='train'):
    """Load raw IMDB reviews and binary labels from a local directory tree.

    Reads every ``*.txt`` file under ``<data_path>/<dataset_type>/pos`` and
    ``<data_path>/<dataset_type>/neg`` (in that order).

    :param data_path: root directory that contains the split sub-directories.
    :param dataset_type: name of the split sub-directory (default ``'train'``).
    :return: ``(texts, labels)`` -- a list of review strings and a NumPy array
        holding 1 for ``pos`` and 0 for ``neg``, in matching order.
    :raises FileNotFoundError: if one of the sentiment directories is missing.
    """
    texts, labels = [], []
    for label, sentiment in ((1, 'pos'), (0, 'neg')):
        dir_path = os.path.join(data_path, dataset_type, sentiment)
        if not os.path.isdir(dir_path):
            raise FileNotFoundError(f"IMDB directory not found: {dir_path}")
        # os.listdir() returns entries in arbitrary order; sort them so the
        # sample order (and everything derived from it) is reproducible.
        for filename in sorted(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, filename), 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)

    return texts, np.array(labels)

def build_vectorizer(texts, max_tokens=20000, max_len=300):  # sequence length shortened to 300
    """Create a TextVectorization layer and fit its vocabulary on ``texts``.

    :param texts: corpus passed to ``adapt`` to build the vocabulary.
    :param max_tokens: value forwarded as the layer's ``max_tokens``.
    :param max_len: value forwarded as the layer's ``output_sequence_length``.
    :return: the adapted ``TextVectorization`` layer.
    """
    layer_options = dict(
        max_tokens=max_tokens,
        output_mode='int',
        output_sequence_length=max_len,
        standardize='lower_and_strip_punctuation',
    )
    text_to_ids = TextVectorization(**layer_options)
    text_to_ids.adapt(texts)
    return text_to_ids

def build_fast_model(vectorize_layer):
    """Build and compile a small BiLSTM sentiment classifier that takes raw strings.

    The vectorization layer is the first layer of the model, so callers must
    feed raw text (string tensors), not pre-vectorized integer sequences.

    :param vectorize_layer: an adapted ``TextVectorization`` layer.
    :return: a compiled ``Sequential`` model with a single sigmoid output.
    """
    # Size the embedding table from the layer's actual vocabulary instead of a
    # hard-coded 20000, so a vectorizer built with a different ``max_tokens``
    # can never emit an index outside the table.
    vocab_size = vectorize_layer.vocabulary_size()

    model = Sequential([
        # Scalar string input; the vectorizer turns it into integer token ids.
        tf.keras.Input(shape=(), dtype=tf.string),
        vectorize_layer,

        # Embedding dimension of 64.
        Embedding(input_dim=vocab_size, output_dim=64),

        # Single bidirectional LSTM layer.
        Bidirectional(LSTM(64)),

        # Keep the output layer in float32: under a mixed_float16 policy the
        # final probabilities (and the loss computed from them) should not be
        # produced in half precision.
        Dense(1, activation='sigmoid', dtype='float32')
    ])

    # Exponentially decaying learning rate: multiplied by 0.9 every 1000 steps.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-3,
        decay_steps=1000,
        decay_rate=0.9)

    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(lr_schedule),
        metrics=['accuracy']
    )
    return model
def evaluate_model(model, vectorize_layer, test_texts, test_labels, num_error_examples=0):
    """
    Evaluate a trained model: loss/accuracy, classification report, confusion matrix.

    :param model: trained model whose first layer vectorizes raw text.
    :param vectorize_layer: unused; kept so existing call sites keep working.
        The model vectorizes its own input, so the text must NOT be vectorized
        again here (doing so fails with an input-shape ValueError).
    :param test_texts: list of raw review strings.
    :param test_labels: array-like of 0/1 labels aligned with ``test_texts``.
    :param num_error_examples: how many misclassified reviews to print
        (default 0 prints none).
    """
    test_labels = np.asarray(test_labels)

    # 1. Build datasets of raw strings, since the model expects string input.
    eval_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels)).batch(256)
    predict_dataset = tf.data.Dataset.from_tensor_slices(test_texts).batch(256)

    # 2. Loss and accuracy
    print("\n=== 基础评估 ===")
    test_loss, test_acc = model.evaluate(eval_dataset, verbose=0)
    print(f"测试集损失: {test_loss:.4f}")
    print(f"测试集准确率: {test_acc*100:.2f}%")

    # 3. Per-class precision / recall / F1
    print("\n=== 分类报告 ===")
    y_prob = np.asarray(model.predict(predict_dataset), dtype=np.float64).reshape(-1)
    y_pred = np.round(y_prob).astype(int)
    print(classification_report(
        test_labels, y_pred,
        labels=[0, 1],
        target_names=["negative", "positive"],
        zero_division=0,
    ))

    # 4. Confusion matrix heat map
    print("\n=== 混淆矩阵 ===")
    # labels=[0, 1] keeps the matrix 2x2 (matching the tick labels) even when
    # one class never occurs in the labels or the predictions.
    cm = confusion_matrix(test_labels, y_pred, labels=[0, 1])
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=["Forecast negative", "Forecasted positive"],
               yticklabels=["True negative", "True positive"])
    plt.xlabel("Prediction label")
    plt.ylabel("True Label ")
    plt.title("Confusion matrix")
    plt.show()

    # 5. Optional look at a few misclassified reviews
    if num_error_examples > 0:
        print("\n=== 典型错误样本 ===")
        errors = np.where(y_pred != test_labels)[0]
        for i in errors[:num_error_examples]:
            print(f"\n样本 {i}:")
            print(f"真实标签: {'正面' if test_labels[i] else '负面'}")
            print(f"预测概率: {y_prob[i]:.2f}")
            print("评论文本:")
            print(test_texts[i][:300] + "...")  # first 300 characters only
def train_rapidly(data_path='./aclImdb/aclImdb', batch_size=256, epochs=1):
    """Run the whole pipeline: load data, vectorize, train, evaluate, save.

    :param data_path: root of the extracted IMDB dataset (must contain the
        ``train`` and ``test`` sub-directories).
    :param batch_size: batch size used for the training and test datasets.
    :param epochs: maximum number of training epochs.
    :return: ``(model, history)`` -- the trained model and the object returned
        by ``model.fit``.
    :raises ValueError: if no training reviews are found under ``data_path``.
    """
    # Load raw text and labels
    print("🚀 Loading data...")
    train_texts, train_labels = load_local_imdb(data_path, 'train')
    test_texts, test_labels = load_local_imdb(data_path, 'test')
    if not train_texts:
        raise ValueError(f"No training reviews found under {data_path}")

    # Report the class balance up front: an incompletely extracted archive
    # shows up here as skewed counts (the full train split has 12500 per class).
    n_pos = int(np.sum(train_labels == 1))
    print(f"Training samples: {len(train_texts)} (pos={n_pos}, neg={len(train_texts) - n_pos})")

    # Fit the vocabulary on the training texts only
    print("🔧 Building a text processor...")
    vectorize_layer = build_vectorizer(train_texts)

    # Input pipeline. The samples arrive grouped by class (all 'pos' first,
    # then all 'neg'), so the shuffle buffer has to span the whole dataset;
    # a smaller buffer yields long runs of near single-class batches.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
        .cache()                         # keep the raw samples in memory
        .shuffle(len(train_texts))       # full-dataset shuffle, redone each epoch
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    test_dataset = (
        tf.data.Dataset.from_tensor_slices((test_texts, test_labels))
        .batch(batch_size)
    )

    # Build the model
    print("🛠 Building fast model...")
    model = build_fast_model(vectorize_layer)
    model.summary()

    # NOTE(review): validation uses the test split, so early stopping is tuned
    # on test data -- consider holding out part of the training set instead.
    callbacks = [
        EarlyStopping(patience=2, restore_best_weights=True),
    ]

    # Train
    print("🔥 training...")
    history = model.fit(
        train_dataset,
        validation_data=test_dataset,
        epochs=epochs,
        callbacks=callbacks
    )

    # Final evaluation on the test split
    print("📊 Final evaluation result:")
    model.evaluate(test_dataset)

    # Save the model (file name kept as-is for submission).
    # NOTE(review): HDF5 is a legacy format; confirm that the adapted
    # vocabulary survives a save/load round trip, or switch to '.keras'.
    model.save('imdb_fast_model.h5')
    print("✅ 模型已保存为 imdb_fast_model.h5")

    # Detailed validation report
    print("\n🔍 Start model validation...")
    evaluate_model(model,vectorize_layer, test_texts, test_labels)

    return model, history

if __name__ == "__main__":
    # Show which GPUs TensorFlow can see before training starts.
    gpu_devices = tf.config.list_physical_devices('GPU')
    print("GPU可用:", gpu_devices)
    train_rapidly()

GPU可用: []
2131
12500
🚀 加载数据...
🔧 构建文本处理器...
🛠 构建极速模型...



=== 输入层验证 ===
模型输入形状: (None,)
🔥 开始训练...
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 3s/step - accuracy: 0.8065 - loss: 0.4920 - val_accuracy: 0.5000 - val_loss: 1.2128
训练样本数: 14631
实际词汇量: 20000
样本示例: This is my favorite movie EVER. I have watched it 
向量化示例: [  9   7  62 760  15]
📊 最终评估结果:
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 729ms/step - accuracy: 0.1607 - loss: 1.9895




✅ 模型已保存为 imdb_fast_model.h5

🔍 开始模型验证...

=== 基础评估 ===


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_7_1/Cast:0", shape=(None, 300), dtype=string). Expected shape (None,), but input has incompatible shape (None, 300)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 300), dtype=int64)
  • training=False
  • mask=None

In [None]:
import os

from google.colab import files

# File name the extraction cell expects to find in the working directory.
ARCHIVE_NAME = "aclImdb_v1.tar.gz"

if os.path.exists(ARCHIVE_NAME):
    # Uploading again makes Colab store a renamed duplicate
    # ("aclImdb_v1.tar (1).gz"), which the extraction cell never reads.
    print(f"{ARCHIVE_NAME} already present, skipping upload")
    uploaded = {}
else:
    uploaded = files.upload()  # choose aclImdb_v1.tar.gz in the upload dialog

Saving aclImdb_v1.tar.gz to aclImdb_v1.tar (1).gz


In [None]:
import tarfile

# Path of the uploaded archive
gz_path = "./aclImdb_v1.tar.gz"

# Directory the archive is extracted into
extract_path = "./aclImdb"

# Extract the tar.gz archive. Let this cell run to completion: interrupting it
# leaves a partial dataset on disk.
with tarfile.open(gz_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: refuse members that would land
        # outside extract_path or carry unsafe metadata.
        tar.extractall(path=extract_path, filter="data")
    else:
        # Older Python without extraction filters.
        tar.extractall(path=extract_path)

print(f"文件已解压到: {extract_path}")


KeyboardInterrupt: 