# Chapter 3: Model

In [None]:
!pip install -U accelerate
!pip install -U transformers
!pip install datasets

## Connect to Google Drive

In [None]:
import os
from google.colab import drive

# Mount Google Drive and switch to the project folder.
# The target is given as an absolute path (under the mount point used above)
# so the cell can be re-run: a relative chdir only resolves while the current
# working directory is still the one the notebook started in.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/contents')

Mounted at /content/drive


## Import & set seed

In [None]:
import random

import numpy as np
import torch
import transformers

# One seed value shared by every random number generator seeded below.
seed = 21

# Seed the global generators of PyTorch, the standard library and NumPy.
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Request deterministic PyTorch algorithms, then let transformers apply its
# own full-determinism setup with the same seed.
torch.use_deterministic_algorithms(True)
transformers.enable_full_determinism(seed)

In [None]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import  f1_score, hamming_loss
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer

## Set hyperparameters

In [None]:
# Pretrained checkpoint used for both the tokenizer and the model.
checkpoint = "bert-base-uncased"

# Training CSV; it is expected to hold a "text" column plus one column per label.
data_path = "train_for_student_clean.csv"

# Label column names. Their order fixes the order of the target vector built
# during preprocessing and of the columns in the prediction file.
labels = [
    "CE", "ENV", "BME", "PE", "METAL", "ME",
    "EE", "CPE", "OPTIC", "NANO", "CHE", "MATENG",
    "AGRI", "EDU", "IE", "SAFETY", "MATH", "MATSCI",
]

# Tokenizer padding / truncation length.
MAX_LEN = 246

# Per-device batch sizes.
TRAIN_BATCH_SIZE = 2
VALID_BATCH_SIZE = 2
TEST_BATCH_SIZE = 2

# Optimisation settings.
EPOCHS = 21
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 1e-3

# Sigmoid probability at or above which a label counts as predicted.
THRESHOLD = 0.3

## Load model

In [None]:
# The tokenizer and the model weights come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Classification head with one output per entry in `labels`, configured for
# multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    problem_type="multi_label_classification",
    num_labels=len(labels),
)

## Prepare the data

In [None]:
# Read the CSV; the rows are then taken from the "train" split of the result.
raw_dataset = load_dataset("csv", data_files=data_path)

# Hold out 10% of the rows for evaluation. The seed is passed explicitly so
# that re-running this cell always produces the same train/test partition
# instead of depending on the current state of the global NumPy generator.
data = raw_dataset["train"].train_test_split(test_size=0.1, seed=seed)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of rows and attach their multi-label target vectors.

    Args:
        examples: a batch mapping column names to lists of values; it must
            provide "text" and one column for every name in `labels`.

    Returns:
        The tokenizer output for the batch with an extra "labels" entry: one
        list of floats per row, ordered like `labels`.
    """
    batch = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )

    # Collect the label columns once, then read them row by row.
    row_count = len(examples["text"])
    label_columns = [examples[name] for name in labels]
    batch["labels"] = [
        [float(column[row]) for column in label_columns]
        for row in range(row_count)
    ]
    return batch

# Apply the preprocessing to the split dataset; batched=True hands
# preprocess_function whole batches of rows rather than single examples.
tokenized_data = data.map(preprocess_function, batched=True)


## Define the evaluation metrics: macro F1 score and Hamming loss

In [None]:
def compute_metrics(eval_pred):
    """Compute macro F1 and Hamming loss from model outputs and references.

    Args:
        eval_pred: a pair ``(predictions, ref)``. The predictions are passed
            through a sigmoid, so they are presumably raw logits of shape
            (n_samples, n_labels) - confirm against the Trainer output.

    Returns:
        A dict with the keys "f1" and "hamming_loss".
    """
    logits, ref = eval_pred
    probs = torch.sigmoid(torch.Tensor(logits))

    # Binarise: 1.0 where the probability reaches THRESHOLD, otherwise 0.0.
    y_pred = (probs >= THRESHOLD).numpy().astype(float)

    return {
        "f1": f1_score(ref, y_pred, average='macro', zero_division=0),
        "hamming_loss": hamming_loss(ref, y_pred),
    }


## Training

In [None]:
# Training configuration built from the hyperparameter cell.
# weight_decay and seed are forwarded explicitly: without them the declared
# WEIGHT_DECAY is never applied and the arguments fall back to their own
# default seed instead of the notebook's `seed`.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",          # evaluate once per epoch
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    save_total_limit=1,                   # cap on the number of kept checkpoints
    fp16=True,                            # mixed-precision training
    seed=seed,
)

In [None]:
# Bundle the model, its training configuration, both dataset splits and the
# metric function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],  # the held-out "test" split doubles as the eval set
)

In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Hamming Loss,Runtime,Samples Per Second,Steps Per Second
1,No log,0.314494,0.224889,0.150966,1.1529,39.9,19.95
2,No log,0.290898,0.2978,0.14372,1.189,38.688,19.344
3,0.368000,0.264773,0.466495,0.11715,1.253,36.713,18.357
4,0.368000,0.247246,0.479391,0.108696,1.1776,39.064,19.532
5,0.266800,0.227199,0.570766,0.101449,1.2863,35.762,17.881
6,0.266800,0.239932,0.577966,0.121981,1.1731,39.211,19.605
7,0.266800,0.231628,0.592698,0.099034,1.1588,39.695,19.847
8,0.189300,0.239384,0.637211,0.103865,1.3817,33.292,16.646
9,0.189300,0.236545,0.608009,0.100242,1.1702,39.31,19.655
10,0.129300,0.241901,0.557948,0.10628,1.7888,25.715,12.858


TrainOutput(global_step=4263, training_loss=0.1410284964042232, metrics={'train_runtime': 1054.9406, 'train_samples_per_second': 8.062, 'train_steps_per_second': 4.041, 'total_flos': 1075327978966920.0, 'train_loss': 0.1410284964042232, 'epoch': 21.0})

# Chapter 4: Results

In [None]:
# Evaluate the trained model; the returned metrics dict (loss, F1, Hamming
# loss and timing figures) is shown as the cell output.
trainer.evaluate()

{'eval_loss': 0.23503780364990234,
 'eval_f1': 0.6008195667678425,
 'eval_hamming_loss': 0.08695652173913043,
 'eval_runtime': 1.3919,
 'eval_samples_per_second': 33.048,
 'eval_steps_per_second': 16.524,
 'epoch': 21.0}

## Save model

In [None]:
# Save the fine-tuned model under "bert-multilabel-engineer".
trainer.save_model("bert-multilabel-engineer")

In [None]:
# Save the tokenizer separately under 'bert-tokenizer'.
tokenizer.save_pretrained('bert-tokenizer')

## Predictions

In [None]:
# Example abstract used to sanity-check the fine-tuned model.
text = "Comparative Electrical Energy Yield Performance of Micro-Inverter PV Systems Using a Machine Learning Approach Based on a Mixed-Effect Model of Real Datasets © 2013 IEEE.Long-term energy evaluation of PV systems that use micro-inverter configuration (micro-inverter PV systems) is currently unclear due to the lacking of sufficient longitudinal measurement data and appropriate analysis method. The poor knowledge about impact and aging of micro-inverter PV system affects the comprehension and accuracy of PV design and simulation tools. In this paper, we propose a machine learning approach based on the mixed-effect model to compare and evaluate the electrical energy yield of micro-inverter PV systems. The analyzed results using a 5-year period data of PV stations located at Concord, Massachusetts, USA showed that there is no significant difference in yearly electrical energy yield of micro-inverter PV systems under shading and non-shading condition. This finding has confirmed the advantage of micro-inverter PV system over the other PV types. In addition, our work is the first study that identified the average degradation rate of micro-inverter PV of 3% per year (95% confidence intervals: 2%-4.3%). Our findings in this study have extended substantially the comprehensive understanding of micro-inverter PV system."

# Truncate to MAX_LEN, the same limit used when tokenizing the training data,
# so that over-long inputs are cut instead of being fed to the model whole.
encoding = tokenizer(text, return_tensors='pt', truncation=True, max_length=MAX_LEN)
encoding = encoding.to(trainer.model.device)

# Inference only: switch to eval mode explicitly (do not rely on an earlier
# cell having done so) and skip gradient tracking.
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**encoding)

In [None]:
# Turn the logits of the single example into probabilities. detach() keeps the
# conversion to NumPy independent of whether gradients were tracked upstream.
probs = torch.sigmoid(outputs.logits[0].detach().cpu())

# Binarise with the shared THRESHOLD constant rather than a repeated literal,
# so this check uses the same cut-off as the evaluation metrics.
preds = (probs >= THRESHOLD).numpy().astype(float)
preds

array([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0.])

In [None]:
# Load the test CSV; the prediction code below reads its 'id' and 'text' columns.
test_data = pd.read_csv('./test_for_student_clean.csv')


def predict_labels(text):
    """Predict the binary label vector for one piece of text.

    The text is tokenized with the same padding / truncation settings as the
    training data, run through the trained model without gradient tracking,
    and each label's sigmoid probability is compared against THRESHOLD.

    Args:
        text: the input text to classify.

    Returns:
        A 1-D integer NumPy array with one 0/1 entry per model output, in the
        order of `labels`.
    """
    encoding = tokenizer(
        text,
        return_tensors='pt',
        padding="max_length",
        max_length=MAX_LEN,
        truncation=True,
    )
    encoding = encoding.to(trainer.model.device)

    # Inference only: make eval mode explicit and avoid building a graph.
    trainer.model.eval()
    with torch.no_grad():
        logits = trainer.model(**encoding).logits[0]

    probs = torch.sigmoid(logits.cpu())
    # Use the shared THRESHOLD constant instead of a hard-coded literal so the
    # predictions and the evaluation metrics always use the same cut-off.
    return (probs >= THRESHOLD).numpy().astype(int)


# Build one row per test example: its id followed by the predicted 0/1 flags.
# Iterating over the two columns directly avoids creating a Series per row,
# and the comprehension keeps its loop variables local, so the notebook-level
# `text` defined in an earlier cell is no longer overwritten.
prediction_data = [
    [sample_id] + predict_labels(sample_text).tolist()
    for sample_id, sample_text in zip(test_data['id'], test_data['text'])
]

# Write the predictions with an 'id' column followed by one column per label.
prediction_df = pd.DataFrame(prediction_data, columns=['id'] + labels)
prediction_path = 'prediction.csv'
prediction_df.to_csv(prediction_path, index=False)