# Train

In [2]:
# load basic library
import random
import numpy as np
import pickle
from tqdm import tqdm

# load torch library
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

# custum module
from tools import *

# keep reandom seed
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)


# check gpu
device = get_device()


# load data
with open("./data/pkl/X_train_list.pkl", "rb") as f:
    X_train_list = pickle.load(f)
with open("./data/pkl/X_valid_list.pkl", "rb") as f:
    X_valid_list = pickle.load(f)
with open("./data/pkl/y_train_list.pkl", "rb") as f:
    y_train_list = pickle.load(f)
with open("./data/pkl/y_valid_list.pkl", "rb") as f:
    y_valid_list = pickle.load(f)

There are 1 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti



In [3]:
# tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

(
    input_ids_train_dict,
    attention_masks_train_dict,
    labels_train_dict,
) = tokenizing_for_bert(
    X_train_list, y_train_list, tokenizer
)  # max_len = 400

input_ids_cv_dict, attention_masks_cv_dict, labels_cv_dict = tokenizing_for_bert(
    X_valid_list, y_valid_list, tokenizer, train=False
)

print(len(input_ids_train_dict))
print(input_ids_train_dict["tr_1"].shape)
print(attention_masks_train_dict["tr_1"].shape)
print(labels_train_dict["tr_1"].shape)

tokenizing for bert input
tokenizing for bert input
5
torch.Size([18777, 400])
torch.Size([18777, 400])
torch.Size([18777])


In [7]:
assert 0

AssertionError: 

In [3]:
# all_data = torch.cat((input_ids_train_dict["tr_1"], input_ids_cv_dict["va_1"]), 0)
# all_data.shape

torch.Size([23472, 400])

In [4]:
# token_length = []
# for i in all_data:
#     token_length.append(sum(i!=0).item())

In [5]:
# print(np.mean(token_length))
# print(np.max(token_length))
# print(np.min(token_length))
# print(np.std(token_length))

215.33452624403546
400
4
127.15932762513401


In [21]:
# print(np.mean(token_length))
# print(np.max(token_length))
# print(np.min(token_length))
# print(np.std(token_length))

231.8667348329925
512
4
153.99549015975495


In [6]:
# Prepare torch dataset
tr_set = []
va_set = []
for idx in range(len(input_ids_train_dict)):
    tr_set.append(
        TensorDataset(
            input_ids_train_dict["tr_" + str(idx)],
            attention_masks_train_dict["tr_" + str(idx)],
            labels_train_dict["tr_" + str(idx)],
        )
    )
    va_set.append(
        TensorDataset(
            input_ids_cv_dict["va_" + str(idx)],
            attention_masks_cv_dict["va_" + str(idx)],
            labels_cv_dict["va_" + str(idx)],
        )
    )

print(type(tr_set[0]))

<class 'torch.utils.data.dataset.TensorDataset'>


In [7]:
# hypterparameter
epochs = 4
batch_size = 8
print("epochs:", epochs)
print("batch_size:", batch_size)
print()

epochs: 4
batch_size: 8



In [8]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     "bert-base-cased",
#     num_labels=2,
#     output_attentions=False,
#     output_hidden_states=False,
#     hidden_dropout_prob=0.5,
#     attention_probs_dropout_prob=0.5,
# )
# model.to(device)

In [9]:
# # Don't apply weight decay to any parameters whose names include these tokens.
# # (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
# no_decay = ['bias', 'LayerNorm.weight']

# # Separate the `weight` parameters from the `bias` parameters.
# # - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
# # - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
# optimizer_grouped_parameters = [
#     # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.1},

#     # Filter for parameters which *do* include those.
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.0}
# ]

In [10]:
# del model

In [11]:
# training
training_hist = []

for fold in tqdm(range(len(tr_set))):

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
        hidden_dropout_prob=0.4,
        attention_probs_dropout_prob=0.25,
    )
    model.to(device)

    # This code is taken from:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L102

    # Don't apply weight decay to any parameters whose names include these tokens.
    # (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
    no_decay = ["bias", "LayerNorm.weight"]

    # Separate the `weight` parameters from the `bias` parameters.
    # - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
    # - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
    optimizer_grouped_parameters = [
        # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.1,
        },
        # Filter for parameters which *do* include those.
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay_rate": 0.0,
        },
    ]

    # Note - `optimizer_grouped_parameters` only includes the parameter values, not
    # the names.

    N_train = len(tr_set[fold])
    N_test = len(va_set[fold])
    print("\n[Fold]:", fold)
    print("Num of train samples:", N_train)
    print("Num of valid samples:", N_test)
    print()

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=4e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    )

    train_dataloader = DataLoader(tr_set[fold], shuffle=True, batch_size=batch_size)

    validation_dataloader = DataLoader(
        va_set[fold], shuffle=False, batch_size=batch_size
    )

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=total_steps * 0.1, num_training_steps=total_steps
    )

    history = train_model(
        model=model,
        train_loader=train_dataloader,
        valid_loader=validation_dataloader,
        optimizer=optimizer,
        N_train=N_train,
        N_test=N_test,
        device=device,
        scheduler=scheduler,
        epochs=epochs,
    )

    training_hist.append(history)
    print("*" * 25)
    print("*" * 25)
    print("*" * 25)
    break

  0%|          | 0/5 [00:00<?, ?it/s]Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initi


[Fold]: 0
Num of train samples: 18777
Num of valid samples: 4695

training loss: 0.47
training acc: 0.79
-------------------------
train loss: 0.37
train acc: 0.89
-------------------------
valid loss: 0.42
valid acc: 0.88
-------------------------
training loss: 0.38
training acc: 0.88
-------------------------
train loss: 0.27
train acc: 0.93
-------------------------
valid loss: 0.36
valid acc: 0.90
-------------------------
training loss: 0.33
training acc: 0.91
-------------------------
train loss: 0.25
train acc: 0.94
-------------------------
valid loss: 0.40
valid acc: 0.91
-------------------------
training loss: 0.28
training acc: 0.93
-------------------------
train loss: 0.24
train acc: 0.95
-------------------------


  0%|          | 0/5 [1:05:45<?, ?it/s]

valid loss: 0.41
valid acc: 0.91
-------------------------
*************************
*************************
*************************





In [37]:
import csv


def final_metric(history, metric_path, mtype="train"):
    """
    Calculate metric.
    """
    # init
    ACC = []
    LOSS = []
    RECALL = []
    SPECIFICITY = []
    PRECISION = []
    NPV = []
    F1 = []
    MCC = []
    AUC = []
    FPR = []
    TPR = []

    for i in range(len(history)):

        (TP, FP, TN, FN) = history[i][mtype + "_metric"][-1]
        auc = history[i][mtype + "_auc"][-1]
        fpr = history[i][mtype + "_fpr"][-1]
        tpr = history[i][mtype + "_tpr"][-1]
        loss = history[i][mtype + "_loss"][-1]

        acc = (TP + TN) / (TP + FP + TN + FN)

        recall = TP / (TP + FN) if TP != 0 else 0  # 召回率是在所有正樣本當中，能夠預測多少正樣本的比例
        specificity = TN / (TN + FP) if TN != 0 else 0  # 特異度是在所有負樣本當中，能夠預測多少負樣本的比例
        precision = TP / (TP + FP) if TP != 0 else 0  # 準確率為在所有預測為正樣本中，有多少為正樣本
        npv = TN / (TN + FN) if TN != 0 else 0  # npv為在所有預測為負樣本中，有多少為負樣本
        f1 = (
            (2 * recall * precision) / (recall + precision)
            if (recall + precision) != 0
            else 0
        )  # F1-score則是兩者的調和平均數

        mcc = (
            (TP * TN - FP * FN)
            / np.sqrt(((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
            if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0
            else 0
        )

        ACC.append(acc)
        LOSS.append(loss)
        RECALL.append(recall)
        SPECIFICITY.append(specificity)
        PRECISION.append(precision)
        NPV.append(npv)
        F1.append(f1)
        MCC.append(mcc)
        AUC.append(auc)
        FPR.append(fpr)
        TPR.append(tpr)

    print("\n[" + mtype + " average]\n")
    print("ACC: {:.2}".format((np.mean(ACC))))
    print("LOSS: {:.2}".format(np.mean(LOSS)))
    print()
    print("Recall: {:.2}".format(np.mean(RECALL)))
    print("Specificity: {:.2}".format(np.mean(SPECIFICITY)))
    print("Precision: {:.2}".format(np.mean(PRECISION)))
    print("NPV: {:.2}".format(np.mean(NPV)))
    print()
    print("F1: {:.2}".format(np.mean(F1)))
    print("MCC: {:.2}".format(np.mean(MCC)))
    print("AUC: {:.2}".format(np.mean(AUC)))
    print()

    # save result
    save_metrics(
        metric_path,
        mtype,
        ACC,
        LOSS,
        RECALL,
        SPECIFICITY,
        PRECISION,
        NPV,
        F1,
        MCC,
        AUC,
    )


def save_metrics(
    metric_path,
    mtype,
    ACC,
    LOSS,
    RECALL,
    SPECIFICITY,
    PRECISION,
    NPV,
    F1,
    MCC,
    AUC,
):
    """
    save metrics as csv files
    """
    filename = mtype + "_metrics.csv"
    save_path = os.path.join(metric_path, filename)
    with open(save_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter="\t")
        writer.writerow(["\n[" + mtype + " average]\n"])
        writer.writerow(["ACC: {:.2}".format((np.mean(ACC)))])
        writer.writerow(["LOSS: {:.2}".format(np.mean(LOSS))])
        writer.writerow(["Recall: {:.2}".format(np.mean(RECALL))])
        writer.writerow(["Specificity: {:.2}".format(np.mean(SPECIFICITY))])
        writer.writerow(["Precision: {:.2}".format(np.mean(PRECISION))])
        writer.writerow(["NPV: {:.2}".format(np.mean(NPV))])
        writer.writerow(["F1: {:.2}".format(np.mean(F1))])
        writer.writerow(["MCC: {:.2}".format(np.mean(MCC))])
        writer.writerow(["AUC: {:.2}".format(np.mean(AUC))])

In [28]:
import os

In [30]:
model_name = "bert-base-cased"

In [46]:
# save trainin_history
with open(os.path.join(history_path, "/hist.pkl"), "wb") as f:
    pickle.dump(training_hist, f)

In [38]:
final_metric(training_hist, metric_path=metric_path, mtype="train")
final_metric(training_hist, metric_path=metric_path, mtype="valid")


[train average]

ACC: 0.95
LOSS: 0.24

Recall: 0.96
Specificity: 0.94
Precision: 0.94
NPV: 0.96

F1: 0.95
MCC: 0.9
AUC: 0.99


[valid average]

ACC: 0.91
LOSS: 0.41

Recall: 0.93
Specificity: 0.89
Precision: 0.9
NPV: 0.93

F1: 0.92
MCC: 0.83
AUC: 0.97



In [48]:
torch.save(model.state_dict(), os.path.join(model_path, "model.pkl"))

In [36]:
metric_path

'/home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/metrics'

In [49]:
def calc_avg(training_hist):
    """
    Plot learning curve
    """

    a1 = a2 = a3 = a4 = []  # init

    for i in range(len(training_hist)):
        if i == 0:
            a1 = np.array(training_hist[0]["train_loss"].copy())
            a2 = np.array(training_hist[0]["valid_loss"].copy())
            a3 = np.array(training_hist[0]["train_acc"].copy())
            a4 = np.array(training_hist[0]["valid_acc"].copy())
            continue
        a1 = a1 + np.array(training_hist[i]["train_loss"])
        a2 = a2 + np.array(training_hist[i]["valid_loss"])
        a3 = a3 + np.array(training_hist[i]["train_acc"])
        a4 = a4 + np.array(training_hist[i]["valid_acc"])

    a1 /= len(training_hist)
    a2 /= len(training_hist)
    a3 /= len(training_hist)
    a4 /= len(training_hist)

    a1 = a1.tolist()
    a2 = a2.tolist()
    a3 = a3.tolist()
    a4 = a4.tolist()

    return a1, a2, a3, a4

In [64]:
def plot_roc(training_hist, fig_path, mtype="train"):
    """
    plot roc curve and save as png
    """
    for i in range(len(training_hist)):
        auc = training_hist[i][mtype + "_auc"][-1]
        fpr = training_hist[i][mtype + "_fpr"][-1]
        tpr = training_hist[i][mtype + "_tpr"][-1]

        plt.plot(fpr, tpr, label="Fold-" + str(i) + " AUC = %0.2f" % auc)

    plt.title(mtype + " roc curve")
    plt.legend(loc="lower right")
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    filename = mtype + "-roc.png"
    plt.savefig(os.path.join(fig_path, filename))  # , bbox_inches="tight")
    plt.close()

In [67]:
def plot_figure(training_hist, fig_path):

    a1, a2, a3, a4 = calc_avg(training_hist)

    # color
    tr_color = ["#2ff5f2", "#2ff5e8", "#2ff5c0", "#2fbdf5", "#2f99f5"]
    val_color = ["#f5952f", "#f5ac2f", "#f5c02f", "#f5d72f", "#f5ee2f"]

    # train loss
    for idx in range(len(training_hist)):
        plt.plot(
            training_hist[idx]["train_loss"],
            "--",
            alpha=0.6,
            label="train" + str(idx),
            # color=color,
        )
    plt.plot(a1, label="average training")

    # valid loss
    for idx in range(len(training_hist)):
        plt.plot(
            training_hist[idx]["valid_loss"],
            "--",
            alpha=0.6,
            label="valid" + str(idx),
            # color=color,
        )

    plt.plot(a2, label="average valid")
    plt.ylabel("loss")
    plt.xlabel("epochs")
    axes = plt.gca()
    axes.set_ylim([0, 1])
    plt.legend()
    plt.title("training / valid loss vs iterations")
    plt.grid()
    plt.savefig(os.path.join(fig_path, "loss.png"))  # , bbox_inches="tight")
    plt.close()

    # train acc
    for idx in range(len(training_hist)):
        plt.plot(
            training_hist[idx]["train_acc"],
            "--",
            alpha=0.6,
            label="train" + str(idx),
            # color=color,
        )
    plt.plot(a3, label="average training")

    # valid acc
    for idx in range(len(training_hist)):
        plt.plot(
            training_hist[idx]["valid_acc"],
            "--",
            alpha=0.6,
            label="valid" + str(idx),
            # color=color,
        )
    plt.plot(a4, label="average valid")
    plt.ylabel("acc")
    plt.xlabel("epochs")
    axes = plt.gca()
    axes.set_ylim([0.5, 1])
    plt.legend()
    plt.title("training / valid acc vs iterations")
    plt.grid()
    plt.savefig(os.path.join(fig_path, "acc.png"))  # , bbox_inches="tight")
    plt.close()

    # roc
    plot_roc(training_hist, fig_path, mtype="train")
    plot_roc(training_hist, fig_path, mtype="valid")

In [68]:
import matplotlib.pyplot as plt

plot_figure(training_hist, fig_path)

# Evaluate on test

In [72]:
# load data
with open("./data/pkl/X_test.pkl", "rb") as f:
    X_test = pickle.load(f)
with open("./data/pkl/y_test.pkl", "rb") as f:
    y_test = pickle.load(f)

In [104]:
def setting_path(model_name, batch_size, epochs, mode="train"):
    # setting path
    cwd = os.getcwd()
    print("cwd:", cwd)

    folder_name = model_name + "_bs_" + str(batch_size) + "_epo" + str(epochs)

    metric_path = os.path.abspath(
        os.path.join(cwd, "result", folder_name, mode, "metrics")
    )
    model_path = os.path.abspath(
        os.path.join(cwd, "result", folder_name, mode, "model")
    )
    history_path = os.path.abspath(
        os.path.join(cwd, "result", folder_name, mode, "history")
    )
    fig_path = os.path.abspath(
        os.path.join(cwd, "result", folder_name, mode, "figures")
    )

    print("metric_path:", metric_path)
    print("model_path:", model_path)
    print("history_path:", history_path)
    print("fig_path:", fig_path)

    if not os.path.isdir(metric_path):
        os.makedirs(metric_path)
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    if not os.path.isdir(history_path):
        os.makedirs(history_path)
    if not os.path.isdir(fig_path):
        os.makedirs(fig_path)

    return metric_path, model_path, history_path, fig_path

In [73]:
X_test

20892    A student filmmaker enlists a B-grade actress ...
13280    This movie has a "big production" feel that I ...
29002    A vampire's's henchman wants to call her after...
6858     Don't get me wrong, I assumed this movie would...
21664    Swedish action movies have over the past few y...
                               ...                        
12939    "Three Daring Daughters" is a sickly sweet, ro...
20460    I too am a House Party Fan...House Party I is ...
9273     I just came back from a pre-release viewing of...
6213     This is a very intriguing short movie by David...
29034    Yes, that's right, it is. I firmly believe tha...
Name: review, Length: 5869, dtype: object

In [74]:
y_test

20892    0
13280    1
29002    0
6858     0
21664    0
        ..
12939    0
20460    0
9273     1
6213     1
29034    1
Name: sentiment, Length: 5869, dtype: int64

In [105]:
model_name = "bert-base-cased"

In [108]:
metric_path, _, history_path, fig_path = setting_path(
    model_name, batch_size, epochs, mode="test"
)

cwd /home/lichang/projects/ai_cup-movie
metric_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/metrics
model_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/model
history_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/history
fig_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/figures


In [77]:
# tokenize
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

In [81]:
def tokenizing_for_bert_eval(sent_list, sent_label, tokenizer):
    """
    Tokenize abstracts and return data as Bert model input tensor.
    """

    print("tokenizing for bert input")

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids_list = []
    attention_masks_list = []
    # For every sentence...
    for sent in sent_list:
        encoded_dict = tokenizer.encode_plus(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=400,  # 512,  # Pad & truncate all sentences.
            padding="max_length",
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors="pt",  # Return pytorch tensors.
            truncation=True,
        )

        # Add the encoded sentence to the list.
        input_ids_list.append(encoded_dict["input_ids"])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks_list.append(encoded_dict["attention_mask"])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids_list, dim=0)
    attention_masks = torch.cat(attention_masks_list, dim=0)
    labels = torch.tensor(sent_label)

    return input_ids, attention_masks, labels

In [88]:
input_ids_te, attention_masks_te, labels_te = tokenizing_for_bert_eval(
    X_test.values, y_test.values, tokenizer
)

tokenizing for bert input


In [89]:
testdataset = TensorDataset(input_ids_te, attention_masks_te, labels_te)

In [91]:
len(testdataset)

5869

In [90]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.4,
    attention_probs_dropout_prob=0.25,
)
model.to(device)

PATH = "./result/bert-base-cased_bs_8_epo4/model/model.pkl"
model.load_state_dict(torch.load(PATH))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

<All keys matched successfully>

In [96]:
def eval_data(
    model,
    test_loader,
    N_test,
    device,
):
    """
    eval the data.
    """

    useful_stuff = {
        "test_loss": [],
        "test_acc": [],
        "test_auc": [],
        "test_metric": [],
        "test_fpr": [],
        "test_tpr": [],
    }

    # evaluate test metrics
    (
        model,
        correct,
        training_loss,
        (TP, FP, TN, FN),
        y_list,
        yhat_list,
    ) = batch_iter(model, test_loader, None, None, device, training=False)

    useful_stuff = calc_metrics(
        N_test,
        test_loader,
        correct,
        training_loss,
        (TP, FP, TN, FN),
        y_list,
        yhat_list,
        useful_stuff,
        type="test",
    )

    return useful_stuff

In [101]:
training_hist = []

N_test = len(testdataset)

test_loader = DataLoader(testdataset, shuffle=False, batch_size=16)

history = eval_data(
    model=model,
    test_loader=test_loader,
    N_test=N_test,
    device=device,
)

training_hist.append(history)

test loss: 0.43
test acc: 0.91
-------------------------


In [110]:
history_path

'/home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/history'

In [111]:
# save trainin_history
with open(os.path.join(history_path, "hist.pkl"), "wb") as f:
    pickle.dump(training_hist, f)

In [112]:
final_metric(training_hist, metric_path=metric_path, mtype="test")


[test average]

ACC: 0.91
LOSS: 0.43

Recall: 0.93
Specificity: 0.9
Precision: 0.9
NPV: 0.92

F1: 0.92
MCC: 0.83
AUC: 0.97



# Prediction

In [115]:
with open("./data/pkl/test_new.pkl", "rb") as f:
    test = pickle.load(f)

In [116]:
test

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
...,...,...
29336,30370,It is difficult to rate a writer/director's fi...
29337,18654,"After watching this movie once, it quickly bec..."
29338,47985,"Even though i sat and watched the whole thing,..."
29339,9866,Warning Spoilers following. Superb recreation ...


In [119]:
test_data = test["review"].values

In [117]:
def tokenizing_for_bert_pred(sent_list, tokenizer):
    """
    Tokenize abstracts and return data as Bert model input tensor.
    """

    print("tokenizing for bert input")

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids_list = []
    attention_masks_list = []
    # For every sentence...
    for sent in sent_list:
        encoded_dict = tokenizer.encode_plus(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=400,  # 512,  # Pad & truncate all sentences.
            padding="max_length",
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors="pt",  # Return pytorch tensors.
            truncation=True,
        )

        # Add the encoded sentence to the list.
        input_ids_list.append(encoded_dict["input_ids"])

        # And its attention mask (simply differentiates padding from non-padding).
        attention_masks_list.append(encoded_dict["attention_mask"])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids_list, dim=0)
    attention_masks = torch.cat(attention_masks_list, dim=0)

    return input_ids, attention_masks

In [120]:
input_ids, attention_masks = tokenizing_for_bert_pred(test_data, tokenizer)

tokenizing for bert input


In [125]:
testdataset = TensorDataset(input_ids, attention_masks)

In [126]:
len(testdataset)

29341

In [122]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.4,
    attention_probs_dropout_prob=0.25,
)
model.to(device)

PATH = "./result/bert-base-cased_bs_8_epo4/train/model/model.pkl"
model.load_state_dict(torch.load(PATH))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

<All keys matched successfully>

In [None]:
def pred_data(
    model,
    dataloader,
    device,
):
    """
    prediction the data.
    """

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            output = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = output[0]
            logits = output[1]

            yhat_prob = F.softmax(logits, dim=1)[:, 1]

            # Accumulate the validation loss.
            training_loss += loss.item()
            _, yhat = torch.max(logits.data, 1)
            correct += (yhat == b_labels).sum().item()

In [128]:
test_loader = DataLoader(testdataset, shuffle=False, batch_size=16)

In [151]:
pred = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        output = model(b_input_ids, attention_mask=b_input_mask)
        logits = output[0]

        _, yhat = torch.max(logits.data, 1)

        pred.extend(yhat.cpu().detach().numpy())

In [152]:
print(len(pred))

29341


In [153]:
len(test)

29341

In [156]:
import pandas as pd
submission = pd.DataFrame({"ID": test["ID"].values, "sentiment": pred})
submission

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,1
2,17468,0
3,42579,0
4,701,0
...,...,...
29336,30370,0
29337,18654,1
29338,47985,0
29339,9866,0


In [157]:
submission.to_csv('./submission.csv',encoding='utf-8',index=False)