# Classification of Japanese news articles using BERT
classifiers news articles written in Japanese fine-tuned BERT with the Japanese dataset.

**Warinig**: The following code executes only on Google Colaboratory.

In [None]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install fugashi
!pip install ipadic

## Load the row data

In [5]:
# load the "ldcc-20140209.tar.gz"
import glob
import os
import tarfile

# Download the dataset
path = "./text/"
if not os.path.exists(path):
    os.makedirs(path)
    url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
    file_name = url.split("/")[-1]
    file_path = os.path.join(path, file_name)
    os.system(f"wget {url} -O {file_path}")
    with tarfile.open(file_path, "r:gz") as tar:
        tar.extractall(path)

# Extract the dataset
dir_files = os.listdir(path=path)
dirs = [f for f in dir_files if os.path.isdir(path + f)]

text_label_data = []
dir_count = 0
file_count = 0

# Extract the text and label from the dataset
for i in range(len(dirs)):
    dir = dirs[i]
    dir_count += 1
    files = glob.glob(path + dir + "/*.txt")
    for file in files:
        if os.path.basename(file) == "LICENSE.txt":
            continue

        file_count += 1
        with open(file, "r") as f:
            text = f.readlines()[3:]
            text = "".join(text)
            text = text.translate(str.maketrans({
                "\u3000": "",
                "\n": "",
                "\t": "",
                "\r": ""
            }))
            text_label_data.append([text, i])
        print("dir: {}, file: {}".format(dir_count, file_count), end="\r")

9
dir: 9, file: 7367

## Save the row data

In [7]:
import csv
from sklearn.model_selection import train_test_split

# Split the dataset into train and test
news_train, news_test = train_test_split(text_label_data, test_size=0.2, random_state=42, shuffle=True)
news_path = "./news/"
if not os.path.exists(news_path):
    os.mkdir(news_path)

# Save the train and test dataset
with open(news_path + "train.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(news_train)
with open(news_path + "test.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(news_test)

## Load the pre-trained model and tokenizer

In [10]:
import torch
from transformers import BertJapaneseTokenizer, BertForSequenceClassification

# using GPU if it can use
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = BertForSequenceClassification.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking", num_labels=9)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Load the dataset

In [None]:
from datasets import load_dataset

# define the function of batch tokenization
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

# load the train and test dataset
train_data = load_dataset("csv", data_files=news_path + "train.csv", split="train", column_names=["text", "label"])
test_data = load_dataset("csv", data_files=news_path + "test.csv", split="train", column_names=["text", "label"])

# tokenize the train and test dataset
train_data = train_data.map(tokenize, batched=True, batch_size=len(train_data))
test_data = test_data.map(tokenize, batched=True, batch_size=len(test_data))

# format the train and test dataset
train_data.set_format("torch", columns=["input_ids", "label"])
test_data.set_format("torch", columns=["input_ids", "label"])

## Evaluate function

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    predictions = eval_pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(labels, predictions)}

## Trainer

In [None]:
from transformers import Trainer, TrainingArguments

# define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

## Training

In [None]:
trainer.train()

## Evaluation

In [None]:
trainer.evaluate()

## Save the model

In [None]:
trainer.save_model("./model/news_classification_bert_jp")

## Show the result on TensorBoard

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

## Inference test

In [None]:
# set the test news file
category = "movie-enter"
sample_path = "./text/"
files = glob.glob(sample_path + category + "/*.txt")
file = files[12]

# load the text and label from the test news file
dir_files = os.listdir(path=sample_path)
dirs = [f for f in dir_files if os.path.isdir(os.path.join(sample_path, f))] 

with open(file, "r") as f:
    sample_text = f.readlines()[3:]
    sample_text = "".join(sample_text)
    sample_text = sample_text.translate(str.maketrans({"\n":"", "\t":"", "\r":"", "\u3000":""})) 

print(sample_text)

# predict the category of the test news file
max_length = 512
words = tokenizer.tokenize(sample_text)
word_ids = model.convert_tokens_to_ids(words)
word_tensor = torch.tensor([word_ids[:max_length]])

x = word_tensor.to(device)
y = model(x)
pred = y[0].argmax(-1)
print("result:", dirs[pred])