In [1]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    BertTokenizerFast,
    BertForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
)
from huggingface_hub import HfFolder, notebook_login
import os
import sklearn
from sklearn.preprocessing import OneHotEncoder

os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('The scikit-learn version is {}.'.format(sklearn.__version__))

2024-04-21 12:23:29.857304: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-21 12:23:29.863464: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-21 12:23:29.927905: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


The scikit-learn version is 1.4.2.


In [2]:
from datasets import Dataset
import pandas as pd
import numpy as np

full_dataset = pd.read_csv("round3.csv")

In [3]:
# model_id = "roberta-base"
# model_id = "bert-base-uncased"
model_id = "bert-large-uncased"

In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets
train_dataset, test_dataset = train_test_split(full_dataset, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)

In [5]:
dataset = Dataset.from_pandas(full_dataset)

In [6]:
# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_id)
# tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

unique_classes = full_dataset['predicted'].unique()
def tokenize_function(examples):
    return tokenizer(examples['concatenated_text'], padding="max_length", truncation=True)

# Rename 'predicted' column to 'labels'
train_dataset = train_dataset.rename_column('predicted', 'labels')
test_dataset = test_dataset.rename_column('predicted', 'labels')

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

poss_classes = train_dataset['labels']

# One-hot encode the 'labels' column
encoder = OneHotEncoder(sparse_output=False)
train_labels = encoder.fit_transform(np.array(train_dataset['labels']).reshape(-1, 1))
test_labels = encoder.transform(np.array(test_dataset['labels']).reshape(-1, 1))

# Replace 'labels' column in the datasets with one-hot encoded labels
train_dataset = train_dataset.remove_columns(['labels'])
train_dataset = train_dataset.add_column('labels', train_labels.tolist())
test_dataset = test_dataset.remove_columns(['labels'])
test_dataset = test_dataset.add_column('labels', test_labels.tolist())

# Set the format of the datasets
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/53696 [00:00<?, ? examples/s]

Map:   0%|          | 0/13425 [00:00<?, ? examples/s]

In [7]:
full_dataset

Unnamed: 0,concatenated_text,predicted
0,White Horse;Tile Manufacturing | European Aest...,Building Material and Garden Equipment and Sup...
1,Wealth Solution Partners;Super and SMSF Servic...,"Funds, Trusts, and Other Financial Vehicles"
2,PMG;Fire and Water Cleanup Services | Mold Rem...,Waste Management and Remediation Services
3,TMP Capital PLLC;Licensed in AL & FL | 203K Lo...,Credit Intermediation and Related Activities
4,Genertek Power;Industrial and Commercial Energ...,Utilities
...,...,...
67116,Machinery sales construction Inc.;Retail Space...,Transportation Equipment Manufacturing
67117,Oregon Prep Basketball;Oregon Prep Basketball ...,"Amusement, Gambling, and Recreation Industries"
67118,QUT Law Society Inc.;Largest Faculty Society i...,"Religious, Grantmaking, Civic, Professional, a..."
67119,Vineyard Institute;Educational Institution | B...,"Religious, Grantmaking, Civic, Professional, a..."


In [8]:
repository_id = "output"

!nvidia-smi

# Model
model = BertForSequenceClassification.from_pretrained(model_id, num_labels=len(unique_classes))
# model = RobertaForSequenceClassification.from_pretrained(model_id, num_labels=len(unique_classes))


model = model.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sun Apr 21 12:24:13 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:1A:00.0 Off |                  Off |
|  0%   39C    P8    11W / 450W |      3MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:3D:00.0 Off |                  Off |
|  0%   32C    P8    23W / 450W |      3MiB / 24564MiB |      0%      Default |
|       

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import load_metric
import numpy as np
from transformers import Trainer, TrainingArguments

torch.cuda.empty_cache()

# data collator is used to collate the data into batches that can be fed to the model during training and evaluation
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

# define metrics and metrics function
f1_metric = load_metric("f1")
accuracy_metric = load_metric( "accuracy")

from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to predicted class indices
    predictions = np.argmax(logits, axis=-1)
    
    # Ensure labels are in the correct format, assuming labels should be integers representing class indices
    if labels.ndim > 1:
        labels = np.argmax(labels, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    return {
        'accuracy': accuracy
    }

training_args = TrainingArguments(
    output_dir='output_dir/round3',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Make sure this function is appropriate for your classification
)

trainer.train()

  f1_metric = load_metric("f1")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 1539, in forward
    outputs = self.bert(
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 988, in forward
    encoder_outputs = self.encoder(
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 582, in forward
    layer_outputs = layer_module(
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 472, in forward
    self_attention_outputs = self.attention(
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 402, in forward
    self_outputs = self.self(
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py", line 334, in forward
    attention_probs = self.dropout(attention_probs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/modules/dropout.py", line 59, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
  File "/root/.conda/envs/veridion/lib/python3.10/site-packages/torch/nn/functional.py", line 1268, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 67.69 MiB is free. Process 4155295 has 23.58 GiB memory in use. Of the allocated memory 22.12 GiB is allocated by PyTorch, and 45.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
# save the model
model.save_pretrained("model_round3")

# load the model
model = BertForSequenceClassification.from_pretrained("model_round3")
# model = RobertaForSequenceClassification.from_pretrained("model_round5")

torch.cuda.empty_cache()

# make prediction
text = "TSPN TV;Media and Entertainment | Community Engagement | Media Platform | Full-screen Stars | Live Video Streams | Advertising Opportunities;TSPN TV is a television broadcasting company based in Jackson, California, United States.;TSPN TV News is a media company that provides news and information to the residents of Amador County, California. The company offers a variety of programs, including newscasts, news interviews, and live video streams, as well as information on local events and government meetings. TSPN's programs cover a range of topics, including transportation, agriculture, and public safety. The Director of the Interfaith Food Bank, Tom Thompson, takes TSPn on tours to learn about the organization's work and how to get involved. Additionally, TSPP TV News reports on government initiatives, such as the State of Jefferson and the State's response to the Covid-19 pandemic."
result = "Broadcasting and Content Providers"

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Softmax to get probabilities
probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

# Assuming you have already fitted your OneHotEncoder on your training labels
# Load your encoder here, or re-fit as a demonstration (not recommended for actual inference)
encoder = OneHotEncoder(sparse=False)
# labels = np.array([[label] for label in range(95)])  # simulate your classes as an example
encoder.fit(np.array(poss_classes).reshape(-1, 1))

# Decode the predictions
predicted_index = np.argmax(probabilities, axis=1)
predicted_one_hot = np.zeros(probabilities.shape)
predicted_one_hot[np.arange(len(probabilities)), predicted_index] = 1
predicted_label = encoder.inverse_transform(predicted_one_hot)

print("Predicted label:", predicted_label[0][0])