In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
mount_point = '/content/drive'
drive.mount(mount_point)

# Dataset directory, built from the mount point so that it resolves no matter
# what the current working directory is. The trailing slash is required:
# file names are appended to this value by plain string concatenation.
data_dir = f'{mount_point}/MyDrive/505/final/'

In [None]:
# Load the sample dataset. header=None tells pandas the file has no header
# row, so the columns are labelled with integers starting at 0.
sample_csv_path = data_dir + 'sample.csv'
df = pd.read_csv(sample_csv_path, header=None)

In [None]:
# Print the loaded DataFrame for a quick visual check of its contents.
print(df)

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# --- Text preprocessing ---------------------------------------------------
# Column index 2 holds the text. Tokenizer expects strings, so missing cells
# become empty strings and any other non-string value is converted with str().
train_texts = ['' if pd.isna(text) else str(text) for text in train_data.iloc[:, 2]]
test_texts = ['' if pd.isna(text) else str(text) for text in test_data.iloc[:, 2]]

max_words = 10000  # keep only the 10,000 most frequent words
tokenizer = Tokenizer(num_words=max_words)
# The vocabulary is built from the training split only, so nothing from the
# test split influences the features.
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

# Pad (or truncate, for longer test sequences) to the longest training sequence.
max_sequence_length = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# --- Label preprocessing --------------------------------------------------
# Column index 1 holds the label. The encoder is fitted on the labels of the
# whole DataFrame: LabelEncoder.transform raises on values it has not seen,
# and a random split can leave a rare class only in the test rows. When the
# training split already contains every class the mapping is unchanged.
label_encoder = LabelEncoder()
label_encoder.fit(df.iloc[:, 1])
y_train = label_encoder.transform(train_data.iloc[:, 1])
y_test = label_encoder.transform(test_data.iloc[:, 1])
num_classes = len(label_encoder.classes_)

# --- Build the GRU model --------------------------------------------------
embedding_dim = 100
hidden_units = 128

# Pick the output head and loss from the number of classes. A single sigmoid
# unit with binary cross-entropy is only valid for two classes; with more,
# the integer labels need a softmax over all classes.
if num_classes <= 2:
    output_units = 1
    output_activation = 'sigmoid'
    loss_name = 'binary_crossentropy'
else:
    output_units = num_classes
    output_activation = 'softmax'
    loss_name = 'sparse_categorical_crossentropy'

model = Sequential()
# input_length is intentionally not passed: it is deprecated in Keras 3 and
# the sequence length is taken from the padded input when the model is built.
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim))
model.add(GRU(units=hidden_units))
model.add(Dense(output_units, activation=output_activation))

# Compile the model.
model.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])

# Train the model; 20% of the training rows are set aside for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')