# Set up

In [None]:
# Install libraries
!pip install transformers[torch] datasets evaluate

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━

In [None]:
# Data processing
import pandas as pd
import numpy as np

# Modeling
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [None]:
import torch
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any later cell visible in this notebook --
# confirm whether it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data

In [None]:
# Read both JSON files and transpose them so that every paper ends up as one row.
train_df, test_df = (
    pd.read_json(json_path).transpose()
    for json_path in ("train_for_student.json", "test_for_student.json")
)

In [None]:
# Inspect the freshly loaded training frame (pandas truncates the rendered rows).
train_df

Unnamed: 0,Title,Abstract,Classes
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,"[CHE, MATENG]"
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,[CPE]
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,[EE]
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,"[PE, ME, CHE]"
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...","[CE, MATSCI]"
...,...,...,...
450,A portable USB-controlled potentiostat for pap...,© 2018 IEEEThis paper presents a portable and ...,"[CPE, CHE]"
451,Literature reviews on applying artificial inte...,Copyright © 2019 for this paper by its authors...,"[CPE, EDU]"
452,A multi-parameterized water quality prediction...,© 2019 The authors and IOS Press. All rights r...,"[ENV, EE, CHE]"
453,Semantic Segmentation on Medium-Resolution Sat...,© 2018 IEEE.Semantic Segmentation is a fundame...,"[EE, CPE, OPTIC, EDU]"


In [None]:
# Inspect the freshly loaded test frame; it carries no class column.
test_df

Unnamed: 0,Title,Abstract
001eval,Comparative Electrical Energy Yield Performanc...,© 2013 IEEE.Long-term energy evaluation of PV ...
002eval,Effects of graphene nanoplatelets on bio-based...,© The Author(s) 2021.Novel near-infrared (NIR)...
003eval,Anti-inflammatory action of two novel peptides...,© The Royal Society of Chemistry 2020.Peanut w...
004eval,Efficient all-and-one support vector machines ...,© 2018 IEEE.We introduce a new strategy to est...
005eval,Driver identification using histogram and neur...,© 2017 IEEE.Sensor technology has continuously...
...,...,...
147eval,Utilization of Sewage Sludge from Beverage Ind...,© Published under licence by IOP Publishing Lt...
148eval,Development of a Gateway for OpenADR-ECHONET L...,"© 2018 IEEE.In this paper, we develop an ECHON..."
149eval,Effect of solution treatment and precipitation...,© 2017 Elsevier Ltd. All rights reserved.The a...
150eval,An effect-analysis method for species-dependen...,"© The Authors, published by EDP Sciences, 2019..."


# Preprocessing data

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each paper's list of class codes into one fixed-width 0/1 indicator row.
mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(train_df["Classes"])

# Keep the indicator matrix as its own frame, one column per class code.
labels_df = pd.DataFrame(one_hot, columns=mlb.classes_)

# Attach every indicator row to the training frame as a float32 vector, then
# remove the raw class lists from that frame.
train_df["labels"] = [indicator_row for indicator_row in one_hot.astype(np.float32)]
train_df.drop(columns=["Classes"], inplace=True)

In [None]:
# Inspect the 0/1 indicator matrix (one column per class code).
labels_df

Unnamed: 0,AGRI,BME,CE,CHE,CPE,EDU,EE,ENV,IE,MATENG,MATH,MATSCI,ME,METAL,NANO,OPTIC,PE,SAFETY
0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
450,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
451,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
452,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0


In [None]:
# Inspect the training frame now that it holds a `labels` vector per row.
train_df

Unnamed: 0,Title,Abstract,labels
1,Activated carbon derived from bacterial cellul...,© 2019 Elsevier B.V.Activated carbon derived f...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,The algorithm of static hand gesture recogniti...,© Springer International Publishing AG 2018.Te...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Alternative Redundant Residue Number System Co...,© 2018 IEEE.Residue number system (RNS) is a n...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,Comparative study of wax inhibitor performance...,© Published under licence by IOP Publishing Lt...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,Undrained lower bound solutions for end bearin...,"© 2019 John Wiley & Sons, Ltd.The undrained be...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
450,A portable USB-controlled potentiostat for pap...,© 2018 IEEEThis paper presents a portable and ...,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
451,Literature reviews on applying artificial inte...,Copyright © 2019 for this paper by its authors...,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
452,A multi-parameterized water quality prediction...,© 2019 The authors and IOS Press. All rights r...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."
453,Semantic Segmentation on Medium-Resolution Sat...,© 2018 IEEE.Semantic Segmentation is a fundame...,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, ..."


# Concatenate columns

In [None]:
# Merge title and abstract into a single text column on both frames, then
# discard the two source columns.
for frame in (train_df, test_df):
    frame["TitleAbstract"] = frame["Title"] + " [SEP]" + frame["Abstract"]
    frame.drop(columns=["Title", "Abstract"], inplace=True)


In [None]:
# Inspect the training frame after title and abstract were merged into `TitleAbstract`.
train_df

Unnamed: 0,labels,TitleAbstract
1,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Activated carbon derived from bacterial cellul...
2,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",The algorithm of static hand gesture recogniti...
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",Alternative Redundant Residue Number System Co...
4,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Comparative study of wax inhibitor performance...
5,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",Undrained lower bound solutions for end bearin...
...,...,...
450,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",A portable USB-controlled potentiostat for pap...
451,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",Literature reviews on applying artificial inte...
452,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...",A multi-parameterized water quality prediction...
453,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, ...",Semantic Segmentation on Medium-Resolution Sat...


In [None]:
# Inspect the test frame after title and abstract were merged into `TitleAbstract`.
test_df

Unnamed: 0,TitleAbstract
001eval,Comparative Electrical Energy Yield Performanc...
002eval,Effects of graphene nanoplatelets on bio-based...
003eval,Anti-inflammatory action of two novel peptides...
004eval,Efficient all-and-one support vector machines ...
005eval,Driver identification using histogram and neur...
...,...
147eval,Utilization of Sewage Sludge from Beverage Ind...
148eval,Development of a Gateway for OpenADR-ECHONET L...
149eval,Effect of solution treatment and precipitation...
150eval,An effect-analysis method for species-dependen...


In [None]:
from sklearn.model_selection import train_test_split

np.random.seed(0)

# Inputs: the merged text as an (n, 1) column. Targets: the 0/1 label matrix.
x = train_df["TitleAbstract"].to_numpy().reshape(-1, 1)
y = labels_df.to_numpy()

# Hold out 10% of the rows for validation; the split itself is seeded with 42.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
def _to_text_label_frame(texts, label_rows):
    """Pair each text with its label vector in a two-column DataFrame.

    ``texts`` is flattened to a plain list of strings and ``label_rows`` is
    converted to lists of Python floats.
    """
    return pd.DataFrame(
        {
            "TitleAbstract": texts.flatten().tolist(),
            "labels": label_rows.astype(float).tolist(),
        }
    )


train_data = _to_text_label_frame(train_texts, train_labels)
val_data = _to_text_label_frame(val_texts, val_labels)

# Preview the first four training rows.
train_data.head(4)

Unnamed: 0,TitleAbstract,labels
0,The nanoporous carbon derived from melamine ba...,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Bio-based production of carbon nanotubes via c...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Investigation of leukocyte viability and damag...,"[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Very Short-Term Solar Power Forecasting Using ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."


In [None]:
# Preview the first four validation rows.
val_data.head(4)

Unnamed: 0,TitleAbstract,labels
0,An ontology-based knowledge acquisition for PD...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ..."
1,"Preferential nucleation, guiding, and blocking...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,An Accuracy and Repeatability of a Robot made ...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Monitoring of Surface Roughness in Aluminium T...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Tokenize data

In [None]:
# Tokenizer of the DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_dataset(data):
    """Tokenize the ``TitleAbstract`` text found in ``data``.

    Sequences are truncated to, and padded up to, 256 tokens. The return
    value is whatever the tokenizer call produces.
    """
    text = data["TitleAbstract"]
    encoding_options = {
        "max_length": 256,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(text, **encoding_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import f1_score

def _to_tokenized_dataset(frame):
    """Convert a pandas DataFrame into a tokenized Hugging Face ``Dataset``.

    The input frame is not modified or rebound, so this cell can be re-run
    without first rebuilding the frames it reads from.

    Args:
        frame: DataFrame with a ``TitleAbstract`` text column.

    Returns:
        The ``Dataset`` built from ``frame`` with the tokenizer output columns
        added by ``tokenize_dataset``.
    """
    # batched=True hands the tokenizer a list of texts per call instead of one row at a time.
    return Dataset.from_pandas(frame).map(tokenize_dataset, batched=True)


train_dataset = _to_tokenized_dataset(train_data)
test_dataset = _to_tokenized_dataset(test_df)
val_dataset = _to_tokenized_dataset(val_data)


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [None]:
# Print the column names and row count of each tokenized split (train, test, validation).
for tokenized_split in (train_dataset, test_dataset, val_dataset):
    print(tokenized_split)

Dataset({
    features: ['TitleAbstract', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 408
})
Dataset({
    features: ['TitleAbstract', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 151
})
Dataset({
    features: ['TitleAbstract', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 46
})


In [None]:
# Print the first tokenized training example as a plain dict for a sanity check.
print(train_dataset[0])

{'TitleAbstract': 'The nanoporous carbon derived from melamine based polybenzoxazine and NaCl templating [SEP]© 2018 Trans Tech Publications, SwitzerlandNanoporous carbon was successfully prepared by using polybenzoxazine synthesized from bisphenol-A, melamine and formaldehyde as a precursor. The varied HCl amounts have been added into the pre-polymer solution as a catalyst for the ring-opening polymerization. The reaction was traced by FTIR and DSC. In addition, the degradation behavior was studied by TGA and the textural properties were characterized by SEM and surface area analysis (AS1-MP). The nanoporous carbon obtained showed the highest char yield up to 48%. The interconnected structure from the SEM images of the nanoporous carbon exhibited significantly high surface area of 632 m2/g, high total pore volume up to 1.78 cm2/g, small average pore diameter and narrow pore size distribution detected by AS1-MP. After the activation process, the surface area has been drastically improv

In [None]:
# Write the tokenized training split to CSV in the working directory.
# The integer shown as the cell output is the call's return value (presumably the bytes written).
train_dataset.to_csv("train_dataset.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1408492

In [None]:
# Write the tokenized test split to CSV in the working directory.
# The integer shown as the cell output is the call's return value (presumably the bytes written).
test_dataset.to_csv("test_dataset.csv",index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

509418

# Training model

In [None]:
# Define model.
# The targets are multi-hot float vectors, so the classification head is configured
# explicitly for multi-label training instead of leaving the problem type to be inferred
# from the label dtype. The class codes are stored in the config so that a saved model
# reports readable label names.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
    id2label={idx: str(code) for idx, code in enumerate(mlb.classes_)},
    label2id={str(code): idx for idx, code in enumerate(mlb.classes_)},
)

# Training arguments.
# Logging, evaluation and checkpointing all run once per epoch. The step-count options
# (logging_steps / save_steps / eval_steps) only take effect with the "steps" strategy,
# so they are intentionally not set here.
training_args = TrainingArguments(
    output_dir="./distilbert_model/",
    logging_dir="./distilbert_model/logs",
    logging_strategy="epoch",
    num_train_epochs=20,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    seed=42,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    # After training, reload the best checkpoint (chosen by validation loss unless
    # metric_for_best_model is set).
    load_best_model_at_end=True,
)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define compute_metrics function
# The four metrics are bundled so that a single compute() call reports all of them.
metric_names = ["accuracy", "f1", "precision", "recall"]
clf_metrics = evaluate.combine(metric_names)

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, computed without overflow.

    Only ``exp`` of a non-positive number is ever evaluated, so large-magnitude
    logits saturate to 0 or 1 instead of triggering an overflow warning.

    Args:
        x: scalar or array-like of logits.

    Returns:
        Probabilities with the same shape as ``x`` (floating dtype preserved).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x); x < 0: e^x / (1 + e^x) -- the same function, stable form.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a NumPy scalar; arrays pass through.
    return result[()]

def compute_metrics(eval_pred, threshold=0.3):
    """Score multi-label predictions for the ``Trainer``.

    Logits are squashed with ``sigmoid`` and binarised at ``threshold``. Predictions
    and references are then flattened, so every (sample, class) cell counts as one
    binary decision when the combined metrics are computed.

    Args:
        eval_pred: pair ``(logits, labels)``; both are assumed to be NumPy arrays
            of the same shape -- TODO confirm for other model types.
        threshold: probability above which a class is predicted as present.
            Defaults to 0.3.

    Returns:
        The result of ``clf_metrics.compute`` on the flattened arrays.
    """
    logits, labels = eval_pred
    probabilities = sigmoid(logits)
    predicted = (probabilities > threshold).astype(int).reshape(-1)
    references = labels.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=predicted, references=references)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
# Train the model
# NOTE(review): a patience of 50 evaluations is larger than the 20 per-epoch evaluations
# configured in the training arguments, so this callback looks unable to stop the run
# early -- confirm whether that is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=50)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# The returned training summary is displayed as the cell output.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4103,0.385489,0.827295,0.44358,0.527778,0.38255
2,0.3295,0.34193,0.846618,0.557491,0.57971,0.536913
3,0.2822,0.320418,0.849034,0.592834,0.575949,0.610738
4,0.238,0.312118,0.864734,0.611111,0.633094,0.590604
5,0.1992,0.291846,0.864734,0.608392,0.635036,0.583893
6,0.1629,0.287119,0.864734,0.629139,0.620915,0.637584
7,0.1304,0.28852,0.878019,0.645614,0.676471,0.61745
8,0.1049,0.292331,0.865942,0.631229,0.625,0.637584
9,0.084,0.305102,0.868357,0.649518,0.623457,0.677852
10,0.0666,0.292672,0.880435,0.673267,0.662338,0.684564


TrainOutput(global_step=4080, training_loss=0.11494026844407998, metrics={'train_runtime': 193.8999, 'train_samples_per_second': 42.084, 'train_steps_per_second': 21.042, 'total_flos': 540621201899520.0, 'train_loss': 0.11494026844407998, 'epoch': 20.0})

In [None]:
# Evaluate the reloaded best checkpoint on the held-out validation split.
# The test split has no `labels` column, so evaluating on it can only report runtime
# statistics; labelled metrics have to come from the validation data.
results = trainer.evaluate(eval_dataset=val_dataset)
print(results)

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


{'eval_runtime': 0.6179, 'eval_samples_per_second': 244.395, 'eval_steps_per_second': 123.007, 'epoch': 20.0}


In [None]:
# Run inference on the test split. As the output shows, `label_ids` is None
# and `predictions` holds one row of raw scores per example.
test_predict = trainer.predict(test_dataset)
test_predict

PredictionOutput(predictions=array([[-5.093313  , -3.689384  , -3.8327258 , ..., -3.0429208 ,
        -1.8840697 , -3.8539627 ],
       [-3.7433758 , -2.4936752 , -2.9264076 , ..., -0.08064208,
        -1.7939137 , -4.3435583 ],
       [-3.3332465 ,  0.14106672, -4.249549  , ..., -2.227829  ,
        -3.5604606 , -5.3161635 ],
       ...,
       [-3.7989805 , -3.5665467 , -5.141874  , ..., -5.152051  ,
        -0.6856864 , -4.652163  ],
       [-3.1506093 ,  0.4605606 , -4.5914135 , ..., -3.2450364 ,
        -2.7500987 , -4.7085204 ],
       [-3.5273545 , -3.2131693 , -2.694537  , ..., -3.5648143 ,
        -1.0909015 , -3.304154  ]], dtype=float32), label_ids=None, metrics={'test_runtime': 0.7392, 'test_samples_per_second': 204.283, 'test_steps_per_second': 102.818})

In [None]:
# Turn the raw scores into per-class probabilities with the notebook's NumPy `sigmoid`
# helper, so no TensorFlow round-trip is needed for a single element-wise function.
test_prob = sigmoid(test_predict.predictions)
# A class is predicted when its probability exceeds 0.25.
# NOTE(review): compute_metrics binarises at 0.3, so the validation metrics are measured
# at a different threshold than the one used for these predictions -- confirm which is intended.
test_predictions = (test_prob > 0.25).astype(int)

In [None]:
# Label the 0/1 prediction matrix: rows by test id, columns by class code.
df = pd.DataFrame(
    data=test_predictions,
    columns=mlb.classes_,
    index=test_df.index,
)
df

Unnamed: 0,AGRI,BME,CE,CHE,CPE,EDU,EE,ENV,IE,MATENG,MATH,MATSCI,ME,METAL,NANO,OPTIC,PE,SAFETY
001eval,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
002eval,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,1,0,0
003eval,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
004eval,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
005eval,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147eval,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0
148eval,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
149eval,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0
150eval,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
# Column layout of the output file: the id first, then the class codes in a fixed order.
submission_columns = [
    'id',
    'CE', 'ENV', 'BME', 'PE', 'METAL', 'ME',
    'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG',
    'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI',
]

# Move the ids out of the index into an "id" column, then apply the layout above.
df = df.reset_index().rename(columns={'index': 'id'}).loc[:, submission_columns]
df

Unnamed: 0,id,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
0,001eval,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
1,002eval,0,0,0,0,1,0,1,0,1,1,1,0,0,0,0,0,0,1
2,003eval,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,004eval,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
4,005eval,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,147eval,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1
147,148eval,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
148,149eval,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0
149,150eval,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
# Write the prediction table to CSV without the positional index.
# NOTE(review): the file name presumably encodes the run settings (batch size, patience,
# validation fraction, learning rate, epochs) -- keep it in sync when those change.
df.to_csv("result_batch2_2_p50_t0.1_l5e5_e20.csv", index=False)

In [None]:
# Persist the tokenizer files and the trainer's current model to local directories.
tokenizer.save_pretrained('./tokenizer/')
trainer.save_model('./trainer/')

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): this runs after all artifacts were written to the local working directory
# and no visible cell copies them to Drive -- confirm whether a copy step is missing.
from google.colab import drive
drive.mount('/content/drive')