# Main

## Train

In [1]:
# Run configuration: pretrained checkpoint name, number of training epochs and
# mini-batch size. Later cells read these to build the result paths, the model,
# the data loaders and the learning-rate schedule.
model_name, epochs, batch_size = "bert-base-cased", 3, 4

In [3]:
# load basic library
import os
import random
import numpy as np
import pickle
from tqdm import tqdm

# load torch library
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)

# custom module
from tools import *

# fix the random seeds (Python, NumPy, PyTorch) so repeated runs are comparable
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# check gpu
# (the recorded run printed "No GPU available, using the CPU instead.")
device = get_device()

# setting path
# The recorded output shows the four folders under
# result/<model_name>_bs_<batch_size>_epo<epochs>/train/.
# NOTE(review): get_device and setting_path are not imported by name in this
# notebook - presumably they come from `from tools import *`; confirm.
metric_path, model_path, history_path, fig_path = setting_path(
    model_name, batch_size, epochs, mode="train"
)

# load data
# path = "./data/5_fold"
# with open(os.path.join(path, "X_tr_list.pkl"), "rb") as f:
#     X_tr_list = pickle.load(f)
# with open(os.path.join(path, "X_va_list.pkl"), "rb") as f:
#     X_va_list = pickle.load(f)
# with open(os.path.join(path, "y_tr_list.pkl"), "rb") as f:
#     y_tr_list = pickle.load(f)
# with open(os.path.join(path, "y_va_list.pkl"), "rb") as f:
#     y_va_list = pickle.load(f)

# Hold-out split: read the pickled train/validation inputs and labels.
path = "./data/train_valid_split/level_1/"


def _read_split(file_name):
    """Unpickle and return the object stored in ``file_name`` under ``path``."""
    with open(os.path.join(path, file_name), "rb") as handle:
        return pickle.load(handle)


X_tr = _read_split("X_tr.pkl")
X_va = _read_split("X_va.pkl")
y_tr = _read_split("y_tr.pkl")
y_va = _read_split("y_va.pkl")

No GPU available, using the CPU instead.

cwd: D:\user\Documents\N26091194\Projects\ai_cup-movie
metric_path: D:\user\Documents\N26091194\Projects\ai_cup-movie\result\bert-base-cased_bs_4_epo3\train\metrics
model_path: D:\user\Documents\N26091194\Projects\ai_cup-movie\result\bert-base-cased_bs_4_epo3\train\model
history_path: D:\user\Documents\N26091194\Projects\ai_cup-movie\result\bert-base-cased_bs_4_epo3\train\history
fig_path: D:\user\Documents\N26091194\Projects\ai_cup-movie\result\bert-base-cased_bs_4_epo3\train\figures


In [4]:
# Keep only the first 50 training and the first 10 validation examples.
# NOTE(review): this looks like a quick smoke-test subset - the training and
# validation metrics recorded below are computed on these 50/10 rows only.
# Remove this cell to train on the full split.
n_tr_keep, n_va_keep = 50, 10
X_tr, y_tr = X_tr[:n_tr_keep], y_tr[:n_tr_keep]
X_va, y_va = X_va[:n_va_keep], y_va[:n_va_keep]

### tokenizing

In [5]:
# tokenize
# Load the tokenizer that belongs to `model_name`; lower-casing is switched
# off, in line with the cased checkpoint configured at the top.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

# 5-fold, max_len = 400
# (
#     input_ids_tr_dict,
#     attention_masks_tr_dict,
#     labels_tr_dict,
# ) = tokenizing_for_cv(X_tr_list, y_tr_list, tokenizer)

# input_ids_va_dict, attention_masks_va_dict, labels_va_dict = tokenizing_for_cv(
#     X_va_list, y_va_list, tokenizer, train=False
# )

# Hold-out, max_len = 400
# Encode the train and validation splits with the project helper `tokenizing`,
# which returns (input_ids, attention_masks, labels) for each split.
# NOTE(review): `.values` assumes X_* / y_* are pandas objects - confirm.
input_ids_tr, attention_masks_tr, labels_tr = tokenizing(
    X_tr.values, y_tr.values, tokenizer
)
input_ids_va, attention_masks_va, labels_va = tokenizing(
    X_va.values, y_va.values, tokenizer
)

In [6]:
# Sanity check on the encoded training ids; the recorded run shows [50, 400].
input_ids_tr.shape

torch.Size([50, 400])

Check the token-length statistics (mean, max, min, standard deviation) for each `max_len` setting:

**max_len = 512:**

```python
print(np.mean(token_length))
print(np.max(token_length))
print(np.min(token_length))
print(np.std(token_length))
```

```text
231.8667348329925
512
4
153.99549015975495
```

**max_len = 400:**

```python
print(np.mean(token_length))
print(np.max(token_length))
print(np.min(token_length))
print(np.std(token_length))
```

```text
215.33452624403546
400
4
127.15932762513401
```


### dataset

In [7]:
# 5-fold dataset
# tr_set = []
# va_set = []
# for idx in range(len(input_ids_tr_dict)):
#     tr_set.append(
#         TensorDataset(
#             input_ids_tr_dict["tr_" + str(idx)],
#             attention_masks_tr_dict["tr_" + str(idx)],
#             labels_tr_dict["tr_" + str(idx)],
#         )
#     )
#     va_set.append(
#         TensorDataset(
#             input_ids_va_dict["va_" + str(idx)],
#             attention_masks_va_dict["va_" + str(idx)],
#             labels_va_dict["va_" + str(idx)],
#         )
#     )

# Holdout dataset
# Bundle ids, attention masks and labels so that every dataset item is an
# (input_ids, attention_mask, label) triple.
Trainset = TensorDataset(input_ids_tr, attention_masks_tr, labels_tr)
Validset = TensorDataset(input_ids_va, attention_masks_va, labels_va)

### training

In [8]:
# 5-fold training
# training_hist = []

# for fold in tqdm(range(len(tr_set))):

#     model = AutoModelForSequenceClassification.from_pretrained(
#         model_name,
#         num_labels=2,
#         output_attentions=False,
#         output_hidden_states=False,
#         hidden_dropout_prob=0.35,
#         attention_probs_dropout_prob=0.25,
#     )
#     model.to(device)

#     # This code is taken from:
#     # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L102

#     # Don't apply weight decay to any parameters whose names include these tokens.
#     # (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
#     no_decay = ["bias", "LayerNorm.weight"]

#     # Separate the `weight` parameters from the `bias` parameters.
#     # - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
#     # - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
#     optimizer_grouped_parameters = [
#         # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
#         {
#             "params": [
#                 p
#                 for n, p in model.named_parameters()
#                 if not any(nd in n for nd in no_decay)
#             ],
#             "weight_decay_rate": 0.1,
#         },
#         # Filter for parameters which *do* include those.
#         {
#             "params": [
#                 p
#                 for n, p in model.named_parameters()
#                 if any(nd in n for nd in no_decay)
#             ],
#             "weight_decay_rate": 0.0,
#         },
#     ]

#     # Note - `optimizer_grouped_parameters` only includes the parameter values, not
#     # the names.

#     N_train = len(tr_set[fold])
#     N_test = len(va_set[fold])
#     print("\n[Fold]:", fold)
#     print("Num of train samples:", N_train)
#     print("Num of valid samples:", N_test)
#     print()

#     optimizer = AdamW(
#         optimizer_grouped_parameters,
#         lr=4e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
#     )

#     train_loader = DataLoader(tr_set[fold], shuffle=True, batch_size=batch_size)

#     valid_loader = DataLoader(va_set[fold], shuffle=False, batch_size=batch_size)

#     # Total number of training steps is [number of batches] x [number of epochs].
#     # (Note that this is not the same as the number of training samples).
#     total_steps = len(train_loader) * epochs

#     scheduler = get_linear_schedule_with_warmup(
#         optimizer, num_warmup_steps=total_steps * 0.1, num_training_steps=total_steps
#     )

#     history, best_epoch = train_model(
#         model=model,
#         train_loader=train_loader,
#         valid_loader=valid_loader,
#         optimizer=optimizer,
#         N_train=N_train,
#         N_test=N_test,
#         device=device,
#         scheduler=scheduler,
#         path=model_path,
#         epochs=epochs,
#         patience=3,
#     )

#     training_hist.append(history)
#     print("*" * 25)
#     print("*" * 25)
#     print("*" * 25)
#     break

In [9]:
# Two-label sequence classifier initialised from the pretrained `model_name`
# checkpoint, with dropout set to 0.35 (hidden) and 0.25 (attention).
# NOTE(review): the evaluation and prediction cells rebuild the model with
# hidden_dropout_prob=0.4 - confirm which value the saved checkpoint used.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.35,
    attention_probs_dropout_prob=0.25,
)
# Move the model to the device selected earlier.
model.to(device)

# Parameter grouping follows the Hugging Face run_glue example:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L102

# Parameters whose names contain any of these substrings are excluded from
# weight decay (bias terms and LayerNorm weights).
no_decay = ["bias", "LayerNorm.weight"]

# Two optimizer parameter groups:
# - bias / LayerNorm.weight parameters: weight decay 0.0
# - every other parameter:              weight decay 0.01
# The per-group option must be spelled `weight_decay`, which is the key AdamW
# reads. The earlier `weight_decay_rate` key is not an AdamW option, so it was
# ignored and both groups silently used the optimizer's default weight decay
# (0.0 for transformers' AdamW), i.e. no decay was applied at all.
optimizer_grouped_parameters = [
    # Parameters that receive weight decay.
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.01,
    },
    # Parameters exempt from weight decay.
    {
        "params": [
            p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

# Note - `optimizer_grouped_parameters` only includes the parameter values, not
# the names.

# Sample counts handed to train_model.
# NOTE(review): presumably used to normalise the per-epoch loss/accuracy -
# confirm against tools.train_model. `N_test` holds the validation-set size.
N_train = len(Trainset)
N_test = len(Validset)
print("Num of train samples:", N_train)
print("Num of valid samples:", N_test)
print()

# AdamW over the two parameter groups built above.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=4e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
)

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(Trainset, shuffle=True, batch_size=batch_size)

valid_loader = DataLoader(Validset, shuffle=False, batch_size=batch_size)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * epochs

# Linear warm-up over the first 10% of the steps, then linear decay.
# NOTE(review): total_steps * 0.1 is a float and may be fractional - confirm
# the scheduler accepts that, or wrap it in int().
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=total_steps * 0.1, num_training_steps=total_steps
)

# Run the project training loop; it returns the per-epoch history and the best
# epoch, and the recorded log shows it saving the model when validation loss
# improves.
# NOTE(review): with epochs=3 and patience=3 the recorded run ends at
# "EarlyStopping counter: 1 out of 3", so early stopping does not appear able
# to trigger before the last epoch - confirm this is intended.
history, best_epoch = train_model(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    N_train=N_train,
    N_test=N_test,
    device=device,
    scheduler=scheduler,
    path=model_path,
    epochs=epochs,
    patience=3,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Num of train samples: 50
Num of valid samples: 10

training loss: 0.70
training acc: 0.54
-------------------------
train loss: 0.68
train acc: 0.54
-------------------------
valid loss: 0.73
valid acc: 0.20
-------------------------
Validation loss decreased (inf --> 0.731871).  Saving model ...
training loss: 0.70
training acc: 0.56
-------------------------
train loss: 0.67
train acc: 0.56
-------------------------
valid loss: 0.72
valid acc: 0.20
-------------------------
Validation loss decreased (0.731871 --> 0.720729).  Saving model ...
training loss: 0.71
training acc: 0.52
-------------------------
train loss: 0.66
train acc: 0.54
-------------------------
valid loss: 0.73
valid acc: 0.20
-------------------------
EarlyStopping counter: 1 out of 3


In [11]:
# # save trainin_history
# with open(os.path.join(history_path, "hist.pkl"), "wb") as f:
#     pickle.dump(training_hist, f)

In [13]:
# Print the metric summary for the training split, then the validation split.
for split in ("train", "valid"):
    final_metric(history, path=metric_path, mtype=split)


[train average]

ACC: 0.54
LOSS: 0.66

Recall: 0.0
Specificity: 1.0
Precision: 0.0
NPV: 0.54

F1: 0.0
MCC: 0.0
AUC: 0.85


[valid average]

ACC: 0.2
LOSS: 0.73

Recall: 0.0
Specificity: 1.0
Precision: 0.0
NPV: 0.2

F1: 0.0
MCC: 0.0
AUC: 1.0



In [14]:
# Same summaries as above, restricted to the epoch reported as best by training.
for split in ("train", "valid"):
    final_metric(history, path=metric_path, mtype=split, best_epoch=best_epoch)

***Best epoch***

[train average]

ACC: 0.56
LOSS: 0.67

Recall: 0.043
Specificity: 1.0
Precision: 1.0
NPV: 0.55

F1: 0.083
MCC: 0.15
AUC: 0.81

***Best epoch***

[valid average]

ACC: 0.2
LOSS: 0.72

Recall: 0.0
Specificity: 1.0
Precision: 0.0
NPV: 0.2

F1: 0.0
MCC: 0.0
AUC: 1.0



In [18]:
# Plot the training history with the project helper.
# NOTE(review): presumably the figures are written under fig_path - confirm.
plot_figure(history, fig_path)

In [25]:
# Plot the training history again, this time passing the best epoch as well.
plot_figure(history, fig_path, best_epoch)

# Evaluate on test

In [72]:
# load data
# Read the pickled test reviews and their sentiment labels.
x_test_file = "./data/pkl/X_test.pkl"
y_test_file = "./data/pkl/y_test.pkl"
with open(x_test_file, "rb") as f:
    X_test = pickle.load(f)
with open(y_test_file, "rb") as f:
    y_test = pickle.load(f)

In [73]:
# Preview the test inputs (recorded output: a Series named "review", 5869 rows).
X_test

20892    A student filmmaker enlists a B-grade actress ...
13280    This movie has a "big production" feel that I ...
29002    A vampire's's henchman wants to call her after...
6858     Don't get me wrong, I assumed this movie would...
21664    Swedish action movies have over the past few y...
                               ...                        
12939    "Three Daring Daughters" is a sickly sweet, ro...
20460    I too am a House Party Fan...House Party I is ...
9273     I just came back from a pre-release viewing of...
6213     This is a very intriguing short movie by David...
29034    Yes, that's right, it is. I firmly believe tha...
Name: review, Length: 5869, dtype: object

In [74]:
# Preview the test labels (recorded output: int64 Series "sentiment", 5869 rows).
y_test

20892    0
13280    1
29002    0
6858     0
21664    0
        ..
12939    0
20460    0
9273     1
6213     1
29034    1
Name: sentiment, Length: 5869, dtype: int64

In [105]:
# Same checkpoint name as in the training section, re-declared for the
# evaluation cells.
model_name = "bert-base-cased"

In [108]:
# Resolve the output folders for the evaluation run; the model folder returned
# in second position is discarded.
# NOTE(review): the recorded output points at ..._bs_8_epo4/, while the config
# cell at the top sets batch_size = 4 and epochs = 3. This cell was evidently
# run under different kernel state - confirm both values before re-running.
metric_path, _, history_path, fig_path = setting_path(
    model_name, batch_size, epochs, mode="test"
)

cwd /home/lichang/projects/ai_cup-movie
metric_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/metrics
model_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/model
history_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/history
fig_path: /home/lichang/projects/ai_cup-movie/result/bert-base-cased_bs_8_epo4/test/figures


In [77]:
# tokenize
# Tokenizer for `model_name`, with lower-casing disabled.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

In [88]:
# Encode the labelled test set with the project's evaluation tokenizing helper,
# which returns ids, attention masks and labels.
# NOTE(review): the training section used `tokenizing` instead - confirm both
# helpers apply the same maximum length and padding.
input_ids_te, attention_masks_te, labels_te = tokenizing_for_bert_eval(
    X_test.values, y_test.values, tokenizer
)

tokenizing for bert input


In [89]:
# Labelled test dataset: each item is (input_ids, attention_mask, label).
testdataset = TensorDataset(input_ids_te, attention_masks_te, labels_te)

In [91]:
# Number of test examples (5869 in the recorded run).
len(testdataset)

5869

In [90]:
# Rebuild the classifier architecture, then restore the fine-tuned weights.
# NOTE(review): hidden_dropout_prob is 0.4 here but 0.35 in the training cell.
# Dropout has no weights, so loading the state dict is unaffected, but confirm
# the intended value.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.4,
    attention_probs_dropout_prob=0.25,
)
model.to(device)

# Checkpoint written by the training run. The path includes the `train/`
# folder, matching the <run>/train/model layout printed by setting_path and the
# path used by the prediction section.
PATH = "./result/bert-base-cased_bs_8_epo4/train/model/model.pkl"
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load(PATH, map_location=device))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

<All keys matched successfully>

In [101]:
# Evaluation results are collected in a list, the structure pickled by the
# save cell that follows.
training_hist = []

N_test = len(testdataset)

# Fixed order and a batch size of 16, independent of the training `batch_size`.
test_loader = DataLoader(testdataset, shuffle=False, batch_size=16)

# Run the project evaluation helper over the whole test set.
# NOTE: this rebinds `history`, replacing the training history from above.
history = eval_data(
    model=model,
    test_loader=test_loader,
    N_test=N_test,
    device=device,
)

training_hist.append(history)

test loss: 0.43
test acc: 0.91
-------------------------


In [111]:
# save the evaluation history list as hist.pkl under history_path
hist_file = os.path.join(history_path, "hist.pkl")
with open(hist_file, "wb") as f:
    pickle.dump(training_hist, f)

In [112]:
# Print the metric summary for the test set.
# NOTE(review): this call passes `metric_path=` and a list of histories, while
# the training section calls final_metric(history, path=...). Unless the helper
# accepts both keywords, one of the two calls will raise TypeError - confirm
# against tools.final_metric.
final_metric(training_hist, metric_path=metric_path, mtype="test")


[test average]

ACC: 0.91
LOSS: 0.43

Recall: 0.93
Specificity: 0.9
Precision: 0.9
NPV: 0.92

F1: 0.92
MCC: 0.83
AUC: 0.97



# Prediction

In [115]:
# Load the data to predict on; the recorded preview shows `ID` and `review`
# columns and no label column.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("./data/pkl/test_new.pkl", "rb") as f:
    test = pickle.load(f)

In [116]:
# Preview the prediction set.
test

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...
...,...,...
29336,30370,It is difficult to rate a writer/director's fi...
29337,18654,"After watching this movie once, it quickly bec..."
29338,47985,"Even though i sat and watched the whole thing,..."
29339,9866,Warning Spoilers following. Superb recreation ...


In [119]:
# Raw review texts as an array, in the frame's row order.
test_data = test["review"].values

In [120]:
# Encode the unlabelled reviews with the project's prediction tokenizing helper.
# This reuses the `tokenizer` created in the evaluation section.
input_ids, attention_masks = tokenizing_for_bert_pred(test_data, tokenizer)

tokenizing for bert input


In [125]:
# Prediction dataset: each item is (input_ids, attention_mask) with no label.
# NOTE: this rebinds `testdataset`, replacing the labelled evaluation dataset.
testdataset = TensorDataset(input_ids, attention_masks)

In [126]:
# Number of examples to predict (29341 in the recorded run).
len(testdataset)

29341

In [122]:
# Rebuild the classifier architecture, then restore the fine-tuned weights for
# prediction.
# NOTE(review): hidden_dropout_prob is 0.4 here but 0.35 in the training cell.
# Dropout has no weights, so loading the state dict is unaffected, but confirm
# the intended value.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
    hidden_dropout_prob=0.4,
    attention_probs_dropout_prob=0.25,
)
model.to(device)

# Checkpoint written by the training run.
PATH = "./result/bert-base-cased_bs_8_epo4/train/model/model.pkl"
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads on a CPU-only machine.
model.load_state_dict(torch.load(PATH, map_location=device))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

<All keys matched successfully>

In [128]:
# Unshuffled loader, so predictions come out in the same order as the rows of `test`.
test_loader = DataLoader(testdataset, shuffle=False, batch_size=16)

In [151]:
# Inference pass: collect the predicted class index of every example, in
# loader order.
pred = []
model.eval()
with torch.no_grad():  # gradients are not needed for prediction
    for batch in test_loader:
        ids, mask = batch[0].to(device), batch[1].to(device)

        # The first element of the model output holds the classification logits.
        logits = model(ids, attention_mask=mask)[0]

        # Index of the largest logit per row is the predicted class.
        yhat = torch.max(logits, dim=1).indices
        pred.extend(yhat.cpu().numpy())

In [152]:
# Sanity check: number of predictions produced.
print(len(pred))

29341


In [153]:
# Should equal the number of predictions printed above.
len(test)

29341

In [156]:
import pandas as pd

# Pair every test ID with its predicted label. `pred` was produced from an
# unshuffled loader, so both sequences follow the row order of `test`.
test_ids = test["ID"].values
submission = pd.DataFrame(dict(ID=test_ids, sentiment=pred))
submission

Unnamed: 0,ID,sentiment
0,22622,1
1,10162,1
2,17468,0
3,42579,0
4,701,0
...,...,...
29336,30370,0
29337,18654,1
29338,47985,0
29339,9866,0


In [157]:
# Write the submission file into the working directory, without the index column.
submission.to_csv("./submission.csv", encoding="utf-8", index=False)