In [20]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import os
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [12]:
# Directory whose entries are read one by one with pd.read_excel in the preprocessing cell
# (machine-specific absolute path).
path = 'G:/lzl/lzl/学校（研究生）/数据/猫眼影评/bert训练/'
# NOTE(review): os.listdir returns every entry of the directory, in arbitrary order —
# presumably the folder contains only the review spreadsheets; confirm.
movie_names = os.listdir(path)

In [13]:
# Preprocessing: load the reviews, bucket the ratings into three sentiment classes,
# down-sample the positive class and attach a one-hot target vector to every row.
NEGATIVE, NEUTRAL, POSITIVE = -100.0, 0.0, 100.0
SENTIMENT_LABELS = [NEGATIVE, NEUTRAL, POSITIVE]  # fixed column order of the one-hot target
MAX_POSITIVE_SAMPLES = 5000

# Read every sheet first and concatenate once: concatenating inside the loop re-copies the
# accumulated frame on each iteration. Sorting the names keeps the row order (and therefore
# the seeded sampling below) independent of the directory listing order.
frames = [
    pd.read_excel(path + movie)[['content', 'score']].rename(columns={'content': 'comment_text'})
    for movie in sorted(movie_names)
]
if frames:
    data = pd.concat(frames, axis=0, ignore_index=True)
else:
    data = pd.DataFrame(columns=['comment_text', 'score'])

# Keep only rows whose rating is a number in [0, 5]. Anything else cannot be labelled and
# would otherwise surface as an extra class, making the target wider than the 3 model outputs.
data['score'] = pd.to_numeric(data['score'], errors='coerce')
data = data[data['score'].between(0, 5)].reset_index(drop=True)

# Build the masks from an untouched copy of the ratings so the three assignments cannot
# interfere with each other, whatever order they run in.
rating = data['score'].copy()
data.loc[rating >= 4, 'score'] = POSITIVE                    # positive review: 4 <= rating <= 5
data.loc[(rating >= 3) & (rating < 4), 'score'] = NEUTRAL    # neutral review:  3 <= rating < 4
data.loc[rating < 3, 'score'] = NEGATIVE                     # negative review: 0 <= rating < 3

# Down-sample the positive class; cap at the number of available rows so that a small
# corpus does not make sample() raise.
pos = data[data['score'] == POSITIVE]
sam_pos = pos.sample(n=min(MAX_POSITIVE_SAMPLES, len(pos)), random_state=200)
not_good = data.drop(pos.index).reset_index(drop=True)
data = pd.concat([not_good, sam_pos], ignore_index=True)

# One-hot encode against the fixed label list: the target always has exactly
# len(SENTIMENT_LABELS) columns in a known order, independent of which classes are present
# and of the installed scikit-learn version.
label_position = data['score'].map({label: i for i, label in enumerate(SENTIMENT_LABELS)})
ohc = pd.DataFrame(np.eye(len(SENTIMENT_LABELS))[label_position.values.astype(int)])
data['list'] = pd.Series(ohc.values.tolist(), index=data.index)
data.head()

Unnamed: 0,comment_text,score,list
0,真的不好看，白搭一张电影票,-100.0,"[1.0, 0.0, 0.0]"
1,不知所云，情节搞笑,-100.0,"[1.0, 0.0, 0.0]"
2,差评，演的毫无力矩,-100.0,"[1.0, 0.0, 0.0]"
3,其实是我觉得还行，就可能剧情有点接不上，其他还可以,0.0,"[0.0, 1.0, 0.0]"
4,感觉不怎么样,-100.0,"[1.0, 0.0, 0.0]"


In [14]:
# Hyper-parameters for tokenization and training.
MAX_LEN = 100  # handed to CustomDataset, which passes it to the tokenizer as max_length
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 1e-05
# Tokenizer loaded from the pretrained 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [15]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one review per item and pairs it with its target vector.

    Args:
        dataframe: frame with a ``comment_text`` column (the review text) and a
            ``list`` column (one target vector per row).
        tokenizer: callable tokenizer returning a mapping with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every sample is truncated / padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe['comment_text']
        # Bracket access: ``list`` is also a builtin name, attribute access reads ambiguously.
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tensors for the review at position ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors) and
            ``targets`` (float tensor).
        """
        # Positional lookup: a sampler hands out positions 0..len-1, which only coincide
        # with index labels when the frame happens to carry a fresh RangeIndex.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse every run of whitespace (including newlines) into a single space.
        comment_text = " ".join(comment_text.split())

        # ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            truncation='longest_first',
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [22]:
# Seeded 80/20 split: sample the training rows, keep the remaining rows for testing,
# then give both frames a fresh 0..n-1 index.
train_size = 0.8
train_rows = data.sample(frac=train_size, random_state=200)
test_dataset = data.drop(index=train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(len(data)))
print("TRAIN Dataset: {}".format(len(train_dataset)))
print("TEST Dataset: {}".format(len(test_dataset)))

# Wrap each frame in a CustomDataset sharing the same tokenizer and sequence length.
training_set, testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN) for frame in (train_dataset, test_dataset)
)

FULL Dataset: 10861
TRAIN Dataset: 8689
TEST Dataset: 2172


In [17]:
# DataLoader settings; both loaders shuffle and use num_workers=0.
# NOTE(review): shuffling the test loader is not needed for evaluation — confirm it is intended.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Customized model: a dropout and a dense layer on top of BERT produce the final output.
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``.
        cache_dir: directory passed to ``from_pretrained`` as ``cache_dir``.
        num_labels: number of output logits.
        dropout: dropout probability applied before the linear head.

    The defaults reproduce the original hard-coded configuration.
    """

    def __init__(self, pretrained_name='bert-base-chinese',
                 cache_dir=r'G:\lzl\lzl\学校（研究生）\模型\Bert\Bert-base-chinese',
                 num_labels=3, dropout=0.3):
        super().__init__()
        self.l1 = BertModel.from_pretrained(pretrained_name, cache_dir=cache_dir)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits (no activation applied) for a batch.

        Args:
            ids: token ids passed to the encoder.
            mask: attention mask passed to the encoder.
            token_type_ids: segment ids passed to the encoder.
        """
        encoder_out = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Take element 1 (the pooled output) by index rather than tuple-unpacking:
        # indexing works both for a plain tuple and for a dict-like model output,
        # whereas unpacking a dict-like output yields its key names.
        pooled = encoder_out[1]
        return self.l3(self.l2(pooled))

# Build the classifier and move it to the selected device; as the last expression of the
# cell, model.to(device) also displays the module structure.
model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [23]:
# Loss function
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and the target vectors.

    The model has no activation layer; the sigmoid is applied inside the loss, which is
    numerically more stable than a separate activation followed by a plain BCE loss.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Optimizer: Adam over all model parameters at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Uses the module-level ``model``, ``training_loader``, ``optimizer``, ``loss_fn`` and
    ``device``. Prints the loss of every 250th batch.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    # Named loop variables: ``_`` is conventionally a throwaway name, and ``data`` would
    # shadow the module-level DataFrame of the same name.
    for step, batch in enumerate(training_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 250 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}, Batch_Trained: {step}')

        optimizer.zero_grad()  # reset the gradients to zero for every batch
        loss.backward()
        optimizer.step()

In [24]:
# Call train() once per epoch, EPOCHS times in total.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.5014252066612244, Batch_Trained: 0


KeyboardInterrupt: 

In [10]:
# Save the whole model object to a machine-specific path.
# NOTE(review): this serializes the module object itself rather than model.state_dict();
# loading presumably requires the BERTClass definition to be available — confirm what the
# loading code expects before changing the format.
torch.save(model, r'D:\checkpoints\中文影评情感分类\中文影评情感分类.pkl')