In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
Col

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

path = os.getcwd()
noval1 = os.path.join(path, "a.txt")
noval2 = os.path.join(path, "b.txt")
test_file = os.path.join(path, "test.txt")

class NovelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(), 'attention_mask': encoding['attention_mask'].squeeze(), 'labels': label}

with open(noval1, "r", encoding="utf-8") as file1:
    novel1_text = file1.read().splitlines()
with open(noval2, "r", encoding="utf-8") as file2:
    novel2_text = file2.read().splitlines()

labels_novel1 = [0] * len(novel1_text)
labels_novel2 = [1] * len(novel2_text)

texts = novel1_text + novel2_text
labels = labels_novel1 + labels_novel2

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_dataset = NovelDataset(train_texts, train_labels, tokenizer)
val_dataset = NovelDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# optimizer = AdamW(model.parameters(), lr=1e-5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(10):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print("Epoch:", epoch, "Loss:", loss.item())

model.eval()
test_texts = []
with open(test_file, "r", encoding="utf-8") as file:
    test_texts = file.read().splitlines()

test_dataset = NovelDataset(test_texts, [0] * len(test_texts), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

predictions = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).tolist())

print("Predicted labels:", predictions)

predicted_labels = ["Novel 1" if label == 0 else "Novel 2" for label in predictions]
print("Predicted labels:", predicted_labels)

model.save_pretrained(path+"/model")
tokenizer.save_pretrained(path+"/model")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0 Loss: 0.5211957097053528
Epoch: 0 Loss: 0.7848677635192871
Epoch: 0 Loss: 0.7772784233093262
Epoch: 0 Loss: 0.6541193127632141
Epoch: 0 Loss: 0.8479735851287842
Epoch: 0 Loss: 0.688094437122345
Epoch: 0 Loss: 0.6758595108985901
Epoch: 0 Loss: 0.7079735994338989
Epoch: 0 Loss: 0.7209044098854065
Epoch: 0 Loss: 0.6466361880302429
Epoch: 0 Loss: 0.6923255920410156
Epoch: 0 Loss: 0.6937944293022156
Epoch: 0 Loss: 0.6399375200271606
Epoch: 0 Loss: 0.7130086421966553
Epoch: 0 Loss: 0.6064151525497437
Epoch: 0 Loss: 0.6647480130195618
Epoch: 0 Loss: 0.6273704171180725
Epoch: 0 Loss: 0.6323553323745728
Epoch: 0 Loss: 0.5624178051948547
Epoch: 0 Loss: 0.654714047908783
Epoch: 0 Loss: 0.5862290859222412
Epoch: 0 Loss: 0.5656419396400452
Epoch: 0 Loss: 0.622803807258606
Epoch: 0 Loss: 0.9318240880966187
Epoch: 0 Loss: 0.8760219812393188
Epoch: 0 Loss: 0.690697431564331
Epoch: 0 Loss: 0.5848891139030457
Epoch: 0 Loss: 0.5665934085845947
Epoch: 0 Loss: 0.6708153486251831
Epoch: 0 Loss: 0.6

('/contentmodel/tokenizer_config.json',
 '/contentmodel/special_tokens_map.json',
 '/contentmodel/vocab.txt',
 '/contentmodel/added_tokens.json')

In [7]:
model.save_pretrained(path+"/model")
tokenizer.save_pretrained(path+"/model")

('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/vocab.txt',
 '/content/model/added_tokens.json')

In [17]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import os

# 加载模型和分词器
path = os.getcwd()
model_path = os.path.join(path, "model")
test_file = os.path.join(path, "test.txt")

tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

test_texts = []
with open(test_file, "r", encoding="utf-8") as file:
    test_texts = file.read().splitlines()

test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')

model.eval()
with torch.no_grad():
    outputs = model(test_encodings['input_ids'], attention_mask=test_encodings['attention_mask'])
    logits = outputs.logits
    predictions = logits.argmax(dim=1).tolist()

predicted_labels = ["三国演义" if label == 0 else "水浒传" for label in predictions]
for i in range(len(predicted_labels)):
    print(test_texts[i],"        ",predicted_labels[i])


刘备刘玄德          三国演义
蒋门神听了要挣扎性命连声应道依得依得          水浒传
尽把好酒开了有的是按酒都摆列          水浒传
玄德曰淮南袁术兵粮足备可为英雄          三国演义
龙之为物可比世之英雄          三国演义
瑜迎入各问慰讫          三国演义
汝今晚点一千军围住馆驿一人一个火把待三更时分一齐放火不问是谁          三国演义
答曰正厅上观书者是也          三国演义
关公急来到城边只见城门已开          三国演义
却说李逵连夜回梁山泊到得寨里来见众头领          水浒传
前者杀了小衙内不干李逵之事          三国演义
兄长休惊等戴宗回山便有分晓          水浒传
