# 保存模型h5

对，步骤4的模型训练不应该直接使用步骤3中生成的特征向量（即BERT特征矩阵）。步骤4应该是基于您手动标记的数据集（即Excel文件）来训练模型。一旦模型训练完成，就可以将其应用于新的RFP文件中提取的文本（这些文本需要经过与训练数据相同的预处理和特征提取流程）。

以下是根据您提供的信息修改后的步骤4模型训练代码的大致结构：

In [87]:
# Sanity check: compare how many BERT feature rows and how many labels exist.
feature_count = bert_features.shape[0]
label_count = len(categorical_labels)
print("Number of BERT features:", feature_count)
print("Number of labels:", label_count)

# If the two counts differ, review how the CSV file is loaded and make sure
# the number of rows in the CSV file matches the number of labels.


Number of BERT features: 6
Number of labels: 134


定义了一个全连接的神经网络模型，可以处理768维的BERT特征。
加入了Dropout层，以帮助防止过拟合。
使用了softmax激活函数的输出层，其大小与分类标签的数量相匹配。

In [85]:
# Report the shape of the feature matrix and of the one-hot label matrix.
for caption, array in (
    ("BERT features shape:", bert_features),
    ("Categorical labels shape:", categorical_labels),
):
    print(caption, array.shape)

BERT features shape: (6, 768)
Categorical labels shape: (134, 6)


In [83]:
# Load the saved BERT feature matrix from disk and report its row count.
features_path = '/Users/xiao1/Desktop/good and bad/bert_features_matrix.npy'
bert_features_matrix = np.load(features_path)
print("Number of BERT features:", bert_features_matrix.shape[0])


Number of BERT features: 117


In [91]:
# Row labels of df that fall outside 0 .. (number of feature rows - 1),
# i.e. labelled rows without a matching row in the feature matrix.
has_feature_row = df.index.isin(range(bert_features_matrix.shape[0]))
missing_indices = set(df.index[~has_feature_row])
print("Missing indices:", missing_indices)


Missing indices: {128, 129, 130, 131, 132, 133, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127}


In [94]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import numpy as np
import pandas as pd
import joblib

# Load the precomputed BERT feature matrix.
bert_features_matrix = np.load('/Users/xiao1/Desktop/good and bad/bert_features_matrix.npy')

# Load the labelled data.
df = pd.read_excel('/Users/xiao1/Desktop/LDMar10.xlsx')

# Handle missing samples: drop every labelled row whose index has no
# counterpart in the feature matrix. The set is derived from the matrix size
# instead of being pasted in by hand, so it stays correct when either file
# changes.
# NOTE(review): this assumes feature row i belongs to the df row labelled i
# -- confirm against the step that produced the .npy file.
n_feature_rows = bert_features_matrix.shape[0]
missing_indices = set(df.index[~df.index.isin(range(n_feature_rows))])
df = df.drop(index=sorted(missing_indices))

# Fail early with a clear message rather than inside train_test_split.
if len(df) != n_feature_rows:
    raise ValueError(
        f"Feature/label mismatch: {n_feature_rows} BERT feature rows "
        f"but {len(df)} labelled rows after alignment"
    )

# Encode the 'Level' column as integers, then as one-hot vectors.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Level'])
categorical_labels = to_categorical(encoded_labels)

# Split data: 80% train / 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(bert_features_matrix, categorical_labels, test_size=0.2, random_state=42)

# Model definition: a fully connected classifier over 768-dimensional inputs
# with one softmax output unit per class.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(768,)))
model.add(Dense(256, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; a further 20% of the training split is held out for validation.
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model on the held-out test split.
evaluation = model.evaluate(X_test, y_test)
print(f'Test Loss: {evaluation[0]}')
print(f'Test Accuracy: {evaluation[1]}')

# Save model and label encoder
model.save('/Users/xiao1/Desktop/my_trained_model.h5')
joblib.dump(label_encoder, '/Users/xiao1/Desktop/label_encoder.pkl')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.263647437095642
Test Accuracy: 0.5416666865348816


  saving_api.save_model(


['/Users/xiao1/Desktop/label_encoder.pkl']

# 上面是正确的版本（带降维度的BERT）

In [78]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the NLTK data packages are downloaded.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Text preprocessing function
def preprocess_text(text):
    """Clean a piece of English text and return it as a space-joined string.

    The text is lower-cased, stripped of punctuation, tokenised, filtered of
    English stop words and single-character tokens, and lemmatised.

    Args:
        text: The raw text. ``None`` and NaN (e.g. an empty spreadsheet cell
            loaded by pandas) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The processed text, or ``''`` when there is nothing to process.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lower case
    text = str(text).lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    if not text.strip():
        # Nothing left to tokenise; same result as running the full pipeline.
        return ''
    # Tokenise
    words = word_tokenize(text)
    # Remove stop words and single-character tokens. The stop-word list is
    # loaded once per call and held in a set, instead of being re-read and
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if len(word) > 1 and word not in stop_words]
    # Lemmatise and rebuild the processed text
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# joblib is used for the final dump but is not among this cell's imports;
# import it here so the cell also runs in a fresh kernel.
import joblib

# Load data
file_path = '/Users/xiao1/Desktop/LDMar10.xlsx'
df = pd.read_excel(file_path)

# Preprocess text data. Missing cells are replaced by empty strings and all
# values are cast to str first, so preprocess_text always receives a string.
df['processed_text'] = (
    df["Sentences with 'Key Words'"].fillna("").astype(str).apply(preprocess_text)
)

# Tokenization and padding: keep at most the 10000 most frequent words and
# pad/truncate every sequence to length 100.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['processed_text'].values)
sequences = tokenizer.texts_to_sequences(df['processed_text'].values)
data = pad_sequences(sequences, maxlen=100)

# Encode labels: 'Level' -> integer ids -> one-hot vectors.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Level'])
categorical_labels = to_categorical(encoded_labels)

# Split data: 80% train / 20% test, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(data, categorical_labels, test_size=0.2, random_state=42)

# Model definition: embedding -> simple RNN -> softmax over the label classes.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=32, input_length=100))
model.add(SimpleRNN(32))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; a further 20% of the training split is held out for validation.
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model on the held-out test split.
evaluation = model.evaluate(X_test, y_test)
print(f'Test Loss: {evaluation[0]}')
print(f'Test Accuracy: {evaluation[1]}')

# Save model, tokenizer, and label encoder
model.save('/Users/xiao1/Desktop/mymodel.h5')
tokenizer_json = tokenizer.to_json()
with open('/Users/xiao1/Desktop/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)
joblib.dump(label_encoder, '/Users/xiao1/Desktop/label_encoder.pkl')


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.0341757535934448
Test Accuracy: 0.8518518805503845


  saving_api.save_model(


['/Users/xiao1/Desktop/label_encoder.pkl']

# 可用版本，但与BERT维度不匹配
维度不匹配：BERT模型生成的特征向量维度是768维，而您的RNN模型期望的输入是一个长度为100的序列。它们的维度不匹配。
数据类型不匹配：BERT模型输出的是密集的实数向量，而RNN模型被训练为接受整数索引序列。


In [26]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
import joblib

# Read the labelled workbook.
file_path = '/Users/xiao1/Desktop/LDMar10.xlsx'
df = pd.read_excel(file_path)

# Targets: map 'Level' to integer ids, then to one-hot rows.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Level'])
categorical_labels = to_categorical(encoded_labels)

# Inputs: force the sentence column to plain strings (blank for missing cells).
text_column = "Sentences with 'Key Words'"
df[text_column] = df[text_column].fillna("").astype(str)
texts = df[text_column].tolist()

# Turn each sentence into a fixed-length sequence of word ids
# (vocabulary capped at 10000 words, sequences padded/truncated to 100).
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=100)

# Hold out 20% of the samples for testing (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    data,
    categorical_labels,
    test_size=0.2,
    random_state=42,
)

# Network: embedding -> simple RNN -> softmax with one unit per distinct label.
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    SimpleRNN(32),
    Dense(len(set(encoded_labels)), activation='softmax'),
])
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit for 10 epochs, validating on 20% of the training split.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

# Report loss and accuracy on the test split.
evaluation = model.evaluate(X_test, y_test)
print(f'Test Loss: {evaluation[0]}')
print(f'Test Accuracy: {evaluation[1]}')

# Persist the three artefacts: model, tokenizer (as JSON) and label encoder.
model.save('/Users/xiao1/Desktop/model.h5')  # changed to a writable path
tokenizer_json = tokenizer.to_json()
with open('/Users/xiao1/Desktop/tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)
joblib.dump(label_encoder, '/Users/xiao1/Desktop/label_encoder.pkl')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.4944121837615967
Test Accuracy: 0.6551724076271057


  saving_api.save_model(


['/Users/xiao1/Desktop/label_encoder.pkl']

# 第一版

In [None]:
import pandas as pd

# Location of the Excel workbook that this cell reads with pd.read_excel.
file_path = '/Users/xiao1/Desktop/LDMar10.xlsx'

import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

df = pd.read_excel(file_path)

# Normalise the sentence column: missing cells become "", everything is str.
sentence_col = "Sentences with 'Key Words'"
df[sentence_col] = df[sentence_col].fillna("").astype(str)
texts = df[sentence_col].tolist()

# Word-level tokenisation limited to the 10000 most frequent words.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to exactly 100 ids.
data = pad_sequences(sequences, maxlen=100)

# Integer-encode the 'Level' labels and expand them to one-hot vectors.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['Level'])
categorical_labels = to_categorical(encoded_labels)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data, categorical_labels, test_size=0.2, random_state=42
)

# Build the RNN layer by layer; the output width equals the number of
# distinct labels (multi-class classification).
model = Sequential()
for layer in (
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    SimpleRNN(32),  # LSTM or GRU might perform better
    Dense(len(set(encoded_labels)), activation='softmax'),
):
    model.add(layer)

# Configure training.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs, keeping 20% of the training data for validation.
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Score the model on the test split and print both numbers.
evaluation = model.evaluate(X_test, y_test)
print(f'Test Loss: {evaluation[0]}')
print(f'Test Accuracy: {evaluation[1]}')

# Bert 错误版第二版

In [None]:
import pandas as pd

# Location of the Excel workbook; the same value is assigned again further
# down in this cell before it is read.
file_path = '/Users/xiao1/Desktop/LDMar10.xlsx'

import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf

# Load the data
file_path = '/Users/xiao1/Desktop/LDMar10.xlsx'
df = pd.read_excel(file_path)

# Make sure every text value is a string; missing values become empty strings
df["Sentences with 'Key Words'"] = df["Sentences with 'Key Words'"].fillna("").astype(str)

# Initialise the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the labels with LabelEncoder
label_encoder = LabelEncoder()
df['Encoded Labels'] = label_encoder.fit_transform(df['Level'])

# Texts and labels
texts = df["Sentences with 'Key Words'"].values
labels = df['Encoded Labels'].values

# Convert the labels to one-hot encoding
labels = tf.keras.utils.to_categorical(labels, num_classes=len(label_encoder.classes_))

# Split the data set: 90% train / 10% validation, fixed seed
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.1, random_state=2021)

# Tokenise the texts with the BERT tokenizer (truncated/padded, at most 64 tokens)
max_length = 64
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length, return_tensors="tf")
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_length, return_tensors="tf")

# Initialise the BERT model with one output per label class
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Optimizer, loss function and evaluation metric
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')

# Compile the model with the objects configured above. Passing the string
# 'adam' here would silently discard the 5e-5 learning rate and fall back to
# the optimizer's default.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])