In [1]:
import os
import torch
import numpy as np
import pandas as pd
from run_classifier import *
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

In [2]:
# Checkpoint name and vocabulary file passed to the `from_pretrained` loaders.
MODEL, VOCAB = "bert-base-chinese", "bert-base-chinese-vocab.txt"

# CSV inputs for the labelled training set and the unlabelled test set.
TRAIN_CSV_PATH, TEST_CSV_PATH = "train.csv", "test.csv"

In [6]:
# Read both CSVs indexed by `id`, keep only the Chinese title pair (plus the
# label for the training rows), and replace missing values with the literal
# placeholder 'UNKNOWN'.
train = (
    pd.read_csv(TRAIN_CSV_PATH, index_col='id')
    .loc[:, ['title1_zh', 'title2_zh', 'label']]
    .fillna('UNKNOWN')
)
test = (
    pd.read_csv(TEST_CSV_PATH, index_col='id')
    .loc[:, ['title1_zh', 'title2_zh']]
    .fillna('UNKNOWN')
)

train.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated
3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated


In [7]:
# Class distribution of the training labels.
Counter(train['label'])

Counter({'unrelated': 219313, 'agreed': 92973, 'disagreed': 8266})

In [42]:
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527

# NOTE: `train` is rebound to the training portion of the split, so re-running
# this cell without reloading the data splits the already-reduced frame again.
train, val = train_test_split(
    train,
    test_size=VALIDATION_RATIO,
    random_state=RANDOM_STATE,
)

# Each example is built positionally from a split tag (logged as `guid`),
# the two titles, and the label.
train_examples = [
    InputExample('train', title_a, title_b, label)
    for title_a, title_b, label in zip(train.title1_zh, train.title2_zh, train.label)
]
val_examples = [
    InputExample('val', title_a, title_b, label)
    for title_a, title_b, label in zip(val.title1_zh, val.title2_zh, val.label)
]
# The test frame carries no label column; 'unrelated' is passed as a placeholder.
test_examples = [
    InputExample('test', title_a, title_b, 'unrelated')
    for title_a, title_b in zip(test.title1_zh, test.title2_zh)
]

len(train_examples)

259646

In [43]:
# Fine-tune on only the first 20% of the training examples.
# NOTE: `train_examples` is replaced by its own prefix, so re-running this cell
# without rebuilding the list shrinks it again.
orginal_total = len(train_examples)
train_examples = train_examples[:int(orginal_total * 0.2)]

# Use the GPU when one is available; `n_gpu` decides whether DataParallel is used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()

gradient_accumulation_steps = 1
train_batch_size = 32
eval_batch_size = 128
# Per-forward-pass batch size once gradient accumulation is taken into account.
train_batch_size = train_batch_size // gradient_accumulation_steps

output_dir = 'output'
bert_model = 'bert-base-chinese'
num_train_epochs = 3

# The training DataLoader is built without `drop_last`, so the final partial
# batch is kept and an epoch has ceil(N / batch_size) batches. The optimizer
# steps once every `gradient_accumulation_steps` batches. Flooring N / batch_size
# (as before) under-counts the schedule length, which made the scheduler reach
# its end before the last updates of training.
batches_per_epoch = -(-len(train_examples) // train_batch_size)  # ceiling division
num_train_optimization_steps = (batches_per_epoch // gradient_accumulation_steps) * num_train_epochs

cache_dir = 'model'
learning_rate = 5e-5
warmup_proportion = 0.1
max_seq_length = 128
label_list = ['unrelated', 'agreed', 'disagreed']

In [44]:
# Build the tokenizer from the vocabulary file named by VOCAB and start the
# classifier from the pretrained checkpoint with a three-class head.
tokenizer = BertTokenizer.from_pretrained(VOCAB)
model = BertForSequenceClassification.from_pretrained(
    MODEL,
    cache_dir=cache_dir,
    num_labels=3,
).to(device)

# Wrap in DataParallel only when more than one GPU is visible.
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

model, tokenizer

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(21128, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0): BertLayer(
           (attention): BertAttention(
             (self): BertSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm((768

In [11]:
# Parameters whose name contains one of these markers get no weight decay;
# every other parameter uses a decay of 0.01.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())


def _is_decay_exempt(param_name):
    """Return True when `param_name` contains any of the `no_decay` markers."""
    return any(marker in param_name for marker in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [param for name, param in param_optimizer if not _is_decay_exempt(name)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in param_optimizer if _is_decay_exempt(name)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)

# Warm up over the first `warmup_proportion` of the optimisation steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_proportion * num_train_optimization_steps,
    num_training_steps=num_train_optimization_steps,
)

In [12]:
# Fine-tuning loop: converts the training examples to tensors, then runs
# `num_train_epochs` passes with optional gradient accumulation.
global_step = 0
nb_tr_steps = 0
tr_loss = 0
train_features = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)

logger.info('***** Running training *****')
logger.info('  Num examples = %d', len(train_examples))
logger.info('  Batch size = %d', train_batch_size)
logger.info('  Num steps = %d', num_train_optimization_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

# Number of batches the loader actually yields per epoch (the final partial
# batch included). Clamping the reporting interval to at least 1 avoids a
# modulo-by-zero when an epoch has fewer than 10 batches.
total_step = len(train_dataloader)
ten_percent_step = max(1, total_step // 10)

model.train()
for _ in trange(int(num_train_epochs), desc='Epoch'):
    # Running loss and counters are reset every epoch, so after the loop they
    # describe the last epoch only.
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        # With `labels` supplied, the first output element is the loss.
        loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)[0]
        if n_gpu > 1:
            # DataParallel returns one loss per replica; average them.
            loss = loss.mean()
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        # Apply an update only once every `gradient_accumulation_steps` batches.
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
        if step % ten_percent_step == 0:
            print("Fininshed: {:.2f}% ({}/{})".format(step/total_step*100, step, total_step))

04/23/2022 23:59:50 - INFO - run_classifier -   *** Example ***
04/23/2022 23:59:50 - INFO - run_classifier -   guid: train
04/23/2022 23:59:50 - INFO - run_classifier -   tokens: [CLS] 营 养 师 ： 补 充 这 4 种 营 养 能 帮 你 降 血 压 ， 你 一 样 都 不 吃 么 ？ [SEP] 刘 涛 担 心 前 夫 离 婚 后 找 不 到 另 一 半 ， 居 然 还 主 动 给 他 介 绍 女 人 [SEP]
04/23/2022 23:59:50 - INFO - run_classifier -   input_ids: 101 5852 1075 2360 8038 6133 1041 6821 125 4905 5852 1075 5543 2376 872 7360 6117 1327 8024 872 671 3416 6963 679 1391 720 8043 102 1155 3875 2857 2552 1184 1923 4895 2042 1400 2823 679 1168 1369 671 1288 8024 2233 4197 6820 712 1220 5314 800 792 5305 1957 782 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/23/2022 23:59:50 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

Fininshed: 0.00% (0/1803)
Fininshed: 9.98% (180/1803)
Fininshed: 19.97% (360/1803)
Fininshed: 29.95% (540/1803)
Fininshed: 39.93% (720/1803)
Fininshed: 49.92% (900/1803)
Fininshed: 59.90% (1080/1803)
Fininshed: 69.88% (1260/1803)
Fininshed: 79.87% (1440/1803)
Fininshed: 89.85% (1620/1803)
Fininshed: 99.83% (1800/1803)


Epoch:  33%|███▎      | 1/3 [13:26:46<26:53:33, 48406.67s/it]

Fininshed: 0.00% (0/1803)
Fininshed: 9.98% (180/1803)
Fininshed: 19.97% (360/1803)
Fininshed: 29.95% (540/1803)
Fininshed: 39.93% (720/1803)
Fininshed: 49.92% (900/1803)
Fininshed: 59.90% (1080/1803)
Fininshed: 69.88% (1260/1803)
Fininshed: 79.87% (1440/1803)
Fininshed: 89.85% (1620/1803)
Fininshed: 99.83% (1800/1803)


Epoch:  67%|██████▋   | 2/3 [27:17:28<13:40:50, 49250.73s/it]

Fininshed: 0.00% (0/1803)
Fininshed: 9.98% (180/1803)
Fininshed: 19.97% (360/1803)
Fininshed: 29.95% (540/1803)
Fininshed: 39.93% (720/1803)
Fininshed: 49.92% (900/1803)
Fininshed: 59.90% (1080/1803)
Fininshed: 69.88% (1260/1803)
Fininshed: 79.87% (1440/1803)
Fininshed: 89.85% (1620/1803)
Fininshed: 99.83% (1800/1803)


Epoch: 100%|██████████| 3/3 [41:18:54<00:00, 49578.17s/it]   


In [47]:
# Create the output directory on first use.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Persist the fine-tuned weights and the matching configuration.
# A DataParallel wrapper exposes the real model as `.module`; otherwise the
# model itself is saved.
model_to_save = getattr(model, 'module', model)

output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)

output_config_file = os.path.join(output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as config_fh:
    config_fh.write(model_to_save.config.to_json_string())

In [48]:
# Load a trained model and config that I have fine-tuned
config = BertConfig.from_pretrained(output_config_file)
config.num_labels = len(label_list)
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load(output_model_file))
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)
config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21

In [49]:
# Evaluate the fine-tuned model on the validation examples and write the
# metrics to `<output_dir>/eval_results.txt`.
eval_examples = val_examples
eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

logger.info('***** Running evaluation *****')
logger.info('  Num examples = %d', len(eval_examples))
logger.info('  Batch size = %d', eval_batch_size)

all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        # One forward pass yields both values: with `labels` supplied, the
        # output holds the loss at index 0 and the logits at index 1.
        outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
    tmp_eval_loss, logits = outputs[0], outputs[1]

    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)

    # `.mean()` collapses the per-replica losses returned under DataParallel.
    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

# Loss is averaged per batch; the accumulated accuracy is divided by the
# number of examples.
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
# `tr_loss` / `nb_tr_steps` come from the training cell; report None instead of
# dividing by zero when no training step was recorded.
loss = tr_loss / nb_tr_steps if nb_tr_steps else None
result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss}

output_eval_file = os.path.join(output_dir, 'eval_results.txt')
with open(output_eval_file, 'w') as writer:
    logger.info('***** Eval results *****')
    for key in sorted(result.keys()):
        logger.info('  %s = %s', key, str(result[key]))
        writer.write('%s = %s\n' % (key, str(result[key])))

04/26/2022 17:00:02 - INFO - run_classifier -   *** Example ***
04/26/2022 17:00:02 - INFO - run_classifier -   guid: val
04/26/2022 17:00:02 - INFO - run_classifier -   tokens: [CLS] 夏 季 的 这 两 大 水 果 一 起 吃 不 仅 没 营 养 ， 多 食 还 有 害 健 康 ， 少 吃 为 妙 ！ [SEP] 你 还 在 吃 草 莓 、 西 瓜 ？ 反 季 果 蔬 真 的 有 害 健 康 吗 ？ [SEP]
04/26/2022 17:00:02 - INFO - run_classifier -   input_ids: 101 1909 2108 4638 6821 697 1920 3717 3362 671 6629 1391 679 788 3766 5852 1075 8024 1914 7608 6820 3300 2154 978 2434 8024 2208 1391 711 1975 8013 102 872 6820 1762 1391 5770 5803 510 6205 4478 8043 1353 2108 3362 5922 4696 4638 3300 2154 978 2434 1408 8043 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/26/2022 17:00:02 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [50]:
# Display the device selected earlier in the notebook.
device

device(type='cpu')

In [51]:
# Display the architecture of the model currently bound to `model`.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [52]:
def predict(model, tokenizer, examples, label_list, eval_batch_size=128):
    """Return the predicted class index for every example, in input order.

    Relies on the notebook-level `device` and `max_seq_length`.

    Args:
        model: Sequence-classification model. It is moved to `device` and
            switched to eval mode.
        tokenizer: Tokenizer forwarded to `convert_examples_to_features`.
        examples: Examples to classify. Their labels are only used by the
            feature conversion and never reach the model.
        label_list: Label names forwarded to `convert_examples_to_features`.
        eval_batch_size: Number of examples per forward pass.

    Returns:
        A list with one entry per example: the argmax over that example's logits.
    """
    model.to(device)
    eval_examples = examples
    eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_length, tokenizer)

    logger.info('***** Running evaluation *****')
    logger.info('  Num examples = %d', len(eval_examples))
    logger.info('  Batch size = %d', eval_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    # Labels are left out of the dataset: prediction needs no loss, and the
    # previous code passed CPU label tensors to a model that may live on GPU.
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    # Sequential sampling keeps predictions aligned with the order of `examples`.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()

    res = []
    for input_ids, input_mask, segment_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            # Without `labels`, the first output element is the logits.
            logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)[0]
        logits = logits.detach().cpu().numpy()
        res.extend(logits.argmax(-1))

    return res

In [53]:
# Predict class indices for the test examples.
res = predict(
    model=model,
    tokenizer=tokenizer,
    examples=test_examples,
    label_list=label_list,
)

04/27/2022 00:10:28 - INFO - run_classifier -   *** Example ***
04/27/2022 00:10:28 - INFO - run_classifier -   guid: test
04/27/2022 00:10:28 - INFO - run_classifier -   tokens: [CLS] 萨 拉 赫 人 气 爆 棚 ! 埃 及 总 统 大 选 未 参 选 获 百 万 选 票 现 任 总 统 压 力 山 大 [SEP] 辟 谣 ！ 里 昂 官 方 否 认 费 基 尔 加 盟 利 物 浦 ， 难 道 是 价 格 没 谈 拢 ？ [SEP]
04/27/2022 00:10:28 - INFO - run_classifier -   input_ids: 101 5855 2861 6622 782 3698 4255 3476 106 1812 1350 2600 5320 1920 6848 3313 1346 6848 5815 4636 674 6848 4873 4385 818 2600 5320 1327 1213 2255 1920 102 6792 6469 8013 7027 3203 2135 3175 1415 6371 6589 1825 2209 1217 4673 1164 4289 3855 8024 7410 6887 3221 817 3419 3766 6448 2879 8043 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
04/27/2022 00:10:28 - INFO - run_classifier -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 

In [54]:
# Map each predicted class index back to its label name (index = position in
# `label_list`).
# NOTE: `res` is overwritten with the names, so this cell cannot be re-run
# without recomputing the predictions first.
cat_map = dict(enumerate(label_list))
res = [cat_map[class_idx] for class_idx in res]

In [55]:
# For Submission
# Attach the predicted label names to the test frame, then export the index
# together with the prediction as the two columns 'Id' and 'Category'.
test['Category'] = res
submission = test[['Category']].reset_index()
submission.columns = ['Id', 'Category']
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
