In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Install libraries
# !pip install transformers datasets evaluate
# !conda install protobuf
# !pip install accelerate -U

In [5]:
# Data processing
import pandas as pd
import numpy as np
from datasets import Dataset
# Modeling
# import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Model performance evaluation
import evaluate

In [6]:
# Read in data
md_data = pd.read_csv("/content/drive/MyDrive/NLP/resume.csv",names=['labels','about'],skiprows=[0])
# Take a look at the data
df = md_data.head(len(md_data))
df.drop(index=0)
len(df)

2484

In [None]:
# import os
# os.listdir("C:/Users/Mehedi Tamim/NLP_Project/archive__/data/data")

In [7]:
# print(df['label'].value_counts())
df['labels'] = df['labels'].replace(['ACCOUNTANT',
 'ADVOCATE',
 'AGRICULTURE',
 'APPAREL',
 'ARTS',
 'AUTOMOBILE',
 'AVIATION',
 'BANKING',
 'BPO',
 'BUSINESS-DEVELOPMENT',
 'CHEF',
 'CONSTRUCTION',
 'CONSULTANT',
 'DESIGNER',
 'DIGITAL-MEDIA',
 'ENGINEERING',
 'FINANCE',
 'FITNESS',
 'HEALTHCARE',
 'HR',
 'INFORMATION-TECHNOLOGY',
 'PUBLIC-RELATIONS',
 'SALES',
 'TEACHER'],['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
)
print(df['labels'].value_counts())
df.to_csv("resume_updated.csv", index=False)

20    120
9     120
0     118
1     118
16    118
15    118
10    118
17    117
6     117
22    116
18    115
12    115
7     115
11    112
21    111
19    110
13    107
4     103
23    102
3      97
14     96
2      63
5      36
8      22
Name: labels, dtype: int64


In [8]:
md_data = pd.read_csv("/content/drive/MyDrive/NLP/resume_updated.csv",names=['labels','about'],skiprows=[0])

In [9]:
# Training dataset
train_data = md_data.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = md_data.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 1987 records.
The testing dataset has 497 records.


In [10]:
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

In [11]:
# Length of the Dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
# hg_train_data['label']
# md_data.iloc[[521]]

The length of hg_train_data is 1987.



In [12]:
# Tokenizer from a pretrained model
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer
tokenizer
# Mapping between special tokens and their IDs.
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unkown token is {tokenizer.unk_token_id}.')
print(f'The seperator token is {tokenizer.sep_token} and the ID for the seperator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

The unknown token is [UNK] and the ID for the unkown token is 100.
The seperator token is [SEP] and the ID for the seperator token is 102.
The pad token is [PAD] and the ID for the pad token is 0.
The sentence level classification token is [CLS] and the ID for the classification token is 101.
The mask token is [MASK] and the ID for the mask token is 103.


In [13]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["about"],
                     truncation=True,
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/1987 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

In [14]:
# Take a look at the data
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['labels', 'about', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1987
})
Dataset({
    features: ['labels', 'about', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 497
})


In [15]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=24)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# !pip install accelerate -U
# import torch
# torch.__version__

In [21]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./resume/",
    logging_dir='./resume/logs',
    logging_strategy='epoch',
    logging_steps=10,
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-7,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

In [22]:
# Number of evaluation modules
print(f'There are {len(evaluate.list_evaluation_modules())} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluate.list_evaluation_modules()

There are 130 evaluation models in Hugging Face.



['precision',
 'code_eval',
 'roc_auc',
 'cuad',
 'xnli',
 'rouge',
 'pearsonr',
 'mse',
 'super_glue',
 'comet',
 'cer',
 'sacrebleu',
 'mahalanobis',
 'wer',
 'competition_math',
 'f1',
 'recall',
 'coval',
 'mauve',
 'xtreme_s',
 'bleurt',
 'ter',
 'accuracy',
 'exact_match',
 'indic_glue',
 'spearmanr',
 'mae',
 'squad',
 'chrf',
 'glue',
 'perplexity',
 'mean_iou',
 'squad_v2',
 'meteor',
 'bleu',
 'wiki_split',
 'sari',
 'frugalscore',
 'google_bleu',
 'bertscore',
 'matthews_correlation',
 'seqeval',
 'trec_eval',
 'rl_reliability',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'kaggle/ai4code',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'poseval',
 'brier_score',
 'abidlabs/mean_iou',
 'abidlabs/mean_iou2',
 'giulio98/codebleu',
 'mase',
 'mape',
 'smape',
 'dvitel/codebleu',
 'NCSOFT/harim_plus',
 'JP-SystemsX/nDCG',
 'Drunper/metrica_tesi',
 'jpxkqx/peak_signal_to_noise_ratio',
 'jpxkqx/signal_to_reconstruction_error',
 'hpi-dhc/FairEva

In [23]:
# Function to compute the metric
import torch
torch.cuda.is_available()

def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.2386,3.249415,0.034205
2,3.2349,3.241155,0.026157
3,3.2277,3.233802,0.036217
4,3.2234,3.227683,0.038229
5,3.2194,3.22238,0.040241
6,3.2128,3.217081,0.038229
7,3.2123,3.212342,0.038229
8,3.2112,3.20774,0.038229
9,3.2059,3.203556,0.034205
10,3.1985,3.199407,0.032193


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=3750, training_loss=3.1911099527994793, metrics={'train_runtime': 6419.8085, 'train_samples_per_second': 9.285, 'train_steps_per_second': 0.584, 'total_flos': 1.568714806665216e+16, 'train_loss': 3.1911099527994793, 'epoch': 30.0})

In [25]:
# Trainer evaluate
trainer.evaluate(dataset_test)

{'eval_loss': 3.1684963703155518,
 'eval_accuracy': 0.03822937625754527,
 'eval_runtime': 18.0043,
 'eval_samples_per_second': 27.604,
 'eval_steps_per_second': 1.777,
 'epoch': 30.0}

In [27]:
# Save tokenizer
tokenizer.save_pretrained('/content/drive/MyDrive/NLP/resume_transformer/')

# Save model
trainer.save_model('/content/drive/MyDrive/NLP/resume_transformer/')