<a href="https://colab.research.google.com/github/lyzno1/lightning_example/blob/main/zjy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
# 读取数据集
data = pd.read_excel('usingtest.xlsx')

In [3]:
print(data.head())
# 检查标签是否正确
for i, value in enumerate(data['label']):
    if value != 1.0 and value != 0.0:
        print(f"in row {i+1}")
print("yes")
assert all(data['label'].isin([0.0, 1.0])), "label contains values other than 0.0 and 1.0"

    序号       昵称 性别  省份                                                 内容  \
0    1  潇潇diana  女  北京  一个 妈妈 一天 心路历程 吃饭 篇 牛奶 有无 三聚氰胺 超标 会 不会 喝成 大头 面包...   
1    2   睡不饱的任镳  男  上海   发现 现在 媒体 微博后 关注度 会 大幅度 增加 快速 传播 影响 很大 有个 缺点 不...   
2    4    狙击手蝈蝈  男  广东  铁证如山 日军 性 暴行 受害者 两姐妹 证言 公布 救 其他人 时年 14 岁 彭 仁寿 ...   
3    7  邵井子1314  男  其他  疫苗 事件 转基因 事件 只不过 比较 两个 造假 事件 没收 转发 键 疫苗 事件 国产 ...   
4  181  时尚老太80后  女  其他   转基因 日前 农业部 回应 表示 转基因 谣言 已经 影响 转基因 健康 发展 实际上 科...   

            认证   编写日期  label  
0          未认证   3分钟前      1  
1  东方汇金期货研究员任镳   3分钟前      1  
2          未认证  13分钟前      1  
3       头条文章作者   3分钟前      1  
4          未认证  23分钟前      1  
yes


In [4]:
data = data[['内容', 'label']]
print(data[:2])

                                                  内容  label
0  一个 妈妈 一天 心路历程 吃饭 篇 牛奶 有无 三聚氰胺 超标 会 不会 喝成 大头 面包...      1
1   发现 现在 媒体 微博后 关注度 会 大幅度 增加 快速 传播 影响 很大 有个 缺点 不...      1


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

# 自定义数据集
class WeiboDataset(Dataset):
  """
    data是传入的dataframe字典，tokenizer是分词器，maxlen是最大长度，predict是是否为预测模式
  """

    def __init__(self, data, tokenizer, max_len, predict=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.predict = predict

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.predict:
            # Handle the case for prediction differently
            text = self.data['text'].iloc[idx]  # Access 'text' column using iloc
            input_ids, attention_mask, token_type_ids = self.encode_plus(text)
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'token_type_ids': token_type_ids,
            }
        else:
            comment = self.data.iloc[idx]['内容']
            label = self.data.iloc[idx]['label']
            encoding = self.tokenizer.encode_plus(
                comment,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            return {
                'comment_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long)
            }


In [6]:
%pip install lightning



In [30]:
from lightning.pytorch import LightningDataModule
from sklearn.model_selection import train_test_split

# 数据模块
class WeiboDataModule(LightningDataModule):

    def __init__(self, data, tokenizer, max_len, batch_size, predict=False):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.batch_size = batch_size
        self.predict = predict

    def prepare_data(self):
      if self.predict:
        self.predict_data = data
      else:
        self.train_data, self.test_data = train_test_split(self.data, test_size=0.2, random_state=42)
        self.train_data, self.val_data = train_test_split(self.train_data, test_size=0.1, random_state=42)


    def setup(self, stage=None):
      if stage == 'fit' or stage is None:
          self.train_dataset = WeiboDataset(self.train_data, self.tokenizer, self.max_len)
          self.val_dataset = WeiboDataset(self.val_data, self.tokenizer, self.max_len)

      if stage == 'test' or stage is None:
          self.test_dataset = WeiboDataset(self.test_data, self.tokenizer, self.max_len)

      if stage == 'predict' or stage is None:
          self.predict_dataset = WeiboDataset(self.predict_data, self.tokenizer, self.max_len)

    def train_dataloader(self):
        train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(self.val_dataset, batch_size=self.batch_size)
        return val_loader

    def test_dataloader(self):
        test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size)
        return test_loader

    def predict_dataloader(self):
        predict_loader = DataLoader(self.predict_data, batch_size=self.batch_size)
        return predict_loader

In [107]:
import lightning as L
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

class BERT(L.LightningModule):

    def __init__(self, learning_rate: float = 1e-3, num_labels: int = 2):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels, force_download=False).train()
        self.learning_rate = learning_rate
        self.validation_outputs = []
        self.test_outputs = []

    def forward(self, input_ids, attention_mask, labels=None):
        return self.bert(input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        preds = torch.argmax(outputs.logits, dim=1)
        self.validation_outputs.append({'preds': preds, 'labels': batch['labels'], 'loss': loss})
        return loss

    def on_validation_epoch_end(self):
        preds = torch.cat([x['preds'] for x in self.validation_outputs]).cpu()
        labels = torch.cat([x['labels'] for x in self.validation_outputs]).cpu()
        val_f1 = f1_score(labels, preds, average='binary')
        val_precision = precision_score(labels, preds, average='binary')
        val_recall = recall_score(labels, preds, average='binary')
        avg_loss = torch.stack([x['loss'] for x in self.validation_outputs]).mean()
        self.log('val_f1', val_f1, prog_bar=True)
        self.log('val_precision', val_precision)
        self.log('val_recall', val_recall)
        self.log('val_loss', avg_loss, prog_bar=True)
        self.validation_outputs.clear()

    def test_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        preds = torch.argmax(outputs.logits, dim=1)
        self.test_outputs.append({'preds': preds, 'labels': batch['labels'], 'loss': loss})
        return loss

    def on_test_epoch_end(self):
        preds = torch.cat([x['preds'] for x in self.test_outputs]).cpu()
        labels = torch.cat([x['labels'] for x in self.test_outputs]).cpu()
        test_f1 = f1_score(labels, preds, average='binary')
        test_precision = precision_score(labels, preds, average='binary')
        test_recall = recall_score(labels, preds, average='binary')
        avg_loss = torch.stack([x['loss'] for x in self.test_outputs]).mean()
        self.log('test_f1', test_f1, prog_bar=True)
        self.log('test_precision', test_precision, prog_bar=True)
        self.log('test_recall', test_recall, prog_bar=True)
        self.log('test_loss', avg_loss, prog_bar=True)
        self.test_outputs.clear()

    def predict_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1)
        target = {1: '有用', 0: '没用'}

        # 返回预测结果
        return {'pred': target.get(pred.item(), 'unknown')}

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=30, num_training_steps=self.trainer.estimated_stepping_batches)
        return [optimizer], [scheduler]


In [77]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese", clean_up_tokenization_spaces=True)
print("data length:", len(data))
true = 0
false = 0
for _, labels in data['label'].items():
  if labels == 1.0:
    true += 1
  else:
    false += 1
print(f"正类：{true}, 负类：{false}")

data length: 3099
正类：2448, 负类：651


In [10]:
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'


In [11]:
from lightning.pytorch.callbacks import ModelCheckpoint

# 设置种子
seed = 42
L.seed_everything(seed)

# 实例化模型和数据
model = BERT(learning_rate=1e-3)
datamodule = WeiboDataModule(data, tokenizer, batch_size=32, max_len=128)

model_checkpoint = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)

trainer = L.Trainer(max_epochs=5, callbacks=[model_checkpoint], precision='bf16-mixed')
trainer.fit(model, datamodule=datamodule)


# params = model.bert.state_dict()
# for key in params.keys():
#     if 'beta' in key or 'gamma' in key:
#         print(key)

INFO: Seed set to 42
INFO:lightning.fabric.utilities.seed:Seed set to 42
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: Using bfloat16 Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VI

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=5` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [109]:

# 测试
best = model_checkpoint.best_model_path
print(f"model path:{best}")
best_model = BERT.load_from_checkpoint(best)

trainer.test(best_model, datamodule=datamodule)

model path:/content/lightning_logs/version_2/checkpoints/epoch=2-step=210.ckpt


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_f1': 0.9103053212165833,
  'test_precision': 0.864130437374115,
  'test_recall': 0.961693525314331,
  'test_loss': 0.4468723237514496}]

In [113]:

# 预测 // 用户自行输入
def predict(df=None):
  class PredictDataset(Dataset):
      def __init__(self, encodings, labels):
          self.encodings = encodings
          self.labels = labels

      def __len__(self):
          return len(self.labels)

      def __getitem__(self, index):
          item = {key: val[index] for key, val in self.encodings.items()}
          item['labels'] = torch.tensor(self.labels[index])
          return item

  from tqdm import tqdm

  def preprocess_text(tokenizer, text_data, max_len):
      encodings = tokenizer(text_data, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
      return encodings

  tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', force_download=False)

  trainer = L.Trainer(accelerator='gpu', devices=1)

  if df is None:
    user_input = [input("please input Chinese sentence:")]
  else:
    user_input = df
  encodings = preprocess_text(tokenizer, user_input, max_len=128)

  predict_dataset = PredictDataset(encodings, torch.zeros(len(user_input)))
  loader = DataLoader(predict_dataset, batch_size=1, shuffle=False, num_workers=0)

  predictions = trainer.predict(best_model, loader)
  print(f"输入：{user_input}")
  print(f"输出：{predictions}")

predict()


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


please input Chinese sentence:累了


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

  item['labels'] = torch.tensor(self.labels[index])


输入：['累了']
输出：[{'pred': '有用'}]


In [112]:
zjy_df = (["垃圾东西", "转基因就是危害社会的", "快点来看，非转基因奶粉，先到先得"])
predict(zjy_df)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

  item['labels'] = torch.tensor(self.labels[index])


输入：['垃圾东西', '转基因就是危害社会的', '快点来看，非转基因奶粉，先到先得']
输出：[{'pred': '没用'}, {'pred': '有用'}, {'pred': '有用'}]
