In [2]:
import requests
import json
import pandas as pd

# URL to the raw JSON file
url = 'https://raw.githubusercontent.com/turkish-nlp-suite/Vitamins-Supplements-NER-dataset/main/supplements_spans_ents.json'

# Fetch the raw JSON data
response = requests.get(url)
data = response.json()  # Parse the JSON data

# Initialize an empty list to store processed rows
rows = []

# Iterate over each item in the JSON data
for item in data:
    sent_id = item['sent_id']
    text = item['text']
    entities = item['entities']
    
    # Iterate over each entity in the entities list
    for entity in entities:
        row = {
            'sent_id': sent_id,
            'text': text,
            'entity_start': entity['start'],
            'entity_end': entity['end'],
            'entity_label': entity['label'],
            'entity_value': entity['val']
        }
        rows.append(row)

# Create a DataFrame from the list of rows
df = pd.DataFrame(rows)

# Display the DataFrame
print(df)



       sent_id                                               text  \
0        82838  Henüz 4 gündür kullanıyorum ama iyi geldi gibi...   
1        82838  Henüz 4 gündür kullanıyorum ama iyi geldi gibi...   
2        82838  Henüz 4 gündür kullanıyorum ama iyi geldi gibi...   
3        85120  Diğer demir ilaçlarından farklı olarak mideyi ...   
4        85120  Diğer demir ilaçlarından farklı olarak mideyi ...   
...        ...                                                ...   
10095   207704  Bana göre multivitamin almak için yorumlara ba...   
10096   207704  Bana göre multivitamin almak için yorumlara ba...   
10097   207704  Bana göre multivitamin almak için yorumlara ba...   
10098   207704  Bana göre multivitamin almak için yorumlara ba...   
10099   207704  Bana göre multivitamin almak için yorumlara ba...   

       entity_start  entity_end entity_label  \
0               119         160         ETKİ   
1               198         208          DOZ   
2               309        

In [3]:
import torch

def is_gpu_available():
    """
    Check if GPU is available for computation using PyTorch.
    Returns True if GPU is available, False otherwise.
    """
    return torch.cuda.is_available()

# Example usage
if is_gpu_available():
    print("GPU is available.")
else:
    print("No GPU available, using CPU instead.")


import warnings 
warnings.filterwarnings('ignore')

GPU is available.


In [4]:
import datasets 
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 

In [5]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [6]:
dataset

Dataset({
    features: ['sent_id', 'text', 'entity_start', 'entity_end', 'entity_label', 'entity_value'],
    num_rows: 10100
})

In [7]:
train_testvalid = dataset.train_test_split(test_size=0.1, seed = 42)

In [8]:
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)


In [10]:
from datasets import DatasetDict
train_test_valid_dataset = DatasetDict({
'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['sent_id', 'text', 'entity_start', 'entity_end', 'entity_label', 'entity_value'],
        num_rows: 9090
    })
    test: Dataset({
        features: ['sent_id', 'text', 'entity_start', 'entity_end', 'entity_label', 'entity_value'],
        num_rows: 505
    })
    valid: Dataset({
        features: ['sent_id', 'text', 'entity_start', 'entity_end', 'entity_label', 'entity_value'],
        num_rows: 505
    })
})

In [11]:
train_test_valid_dataset['train'].unique('entity_label')


Flattening the indices:   0%|          | 0/9090 [00:00<?, ? examples/s]

['ÜRÜN_DİĞER',
 'ETKİ',
 'TAT_KOKU',
 'HASTALIK',
 'BİYOMOLEKÜL',
 'MARKA_DİĞER',
 'DOZ',
 'SAĞLIK_ŞİKAYETLERİ',
 'MARKA',
 'KULLANICI',
 'YAN_ETKİ',
 'İÇERİK',
 'TAVSİYE_EDEN',
 'KULLANICI_DEMOGRAFİSİ']

In [52]:
label_list = train_test_valid_dataset['train'].unique('entity_label')

Flattening the indices:   0%|          | 0/9090 [00:00<?, ? examples/s]

In [14]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-turkish-uncased") 

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [23]:
def tokenize_and_align_labels(example):
    # Define label-to-id mapping based on the provided labels
    label_to_id = {
        'ÜRÜN_DİĞER': 0,
        'ETKİ': 1,
        'TAT_KOKU': 2,
        'HASTALIK': 3,
        'BİYOMOLEKÜL': 4,
        'MARKA_DİĞER': 5,
        'DOZ': 6,
        'SAĞLIK_ŞİKAYETLERİ': 7,
        'MARKA': 8,
        'KULLANICI': 9,
        'YAN_ETKİ': 10,
        'İÇERİK': 11,
        'TAVSİYE_EDEN': 12,
        'KULLANICI_DEMOGRAFİSİ': 13
    }
    
    tokenized_inputs = tokenizer(example['text'], padding='max_length', truncation=True, return_tensors="pt")
    labels = []

    for i, text in enumerate(example['text']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their corresponding word indices
        label_ids = [-100] * len(word_ids)  # Initialize all labels as -100 (ignored index)

        for j, word_id in enumerate(word_ids):
            if word_id is None:  # Skip special tokens
                continue
            token_start = tokenized_inputs.char_to_token(i, example['entity_start'][i])
            token_end = tokenized_inputs.char_to_token(i, example['entity_end'][i] - 1)  # -1 to include the last character
            if token_start is None or token_end is None:
                continue
            for k in range(token_start, token_end + 1):
                label_ids[k] = label_to_id[example['entity_label'][i]]  # Convert label to integer using the mapping dictionary

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [24]:
tokenized_datasets_train = train_test_valid_dataset['train'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/9090 [00:00<?, ? examples/s]

In [25]:
tokenized_datasets_test = train_test_valid_dataset['test'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

In [26]:
tokenized_datasets_val = train_test_valid_dataset['valid'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

In [27]:
final_dataset = DatasetDict({
'train': tokenized_datasets_train,
'test': tokenized_datasets_test,
'valid': tokenized_datasets_val})

In [35]:
import pandas as pd

# Assuming tokenized_datasets_train[0] contains tokenized data
example = final_dataset['train'][0]

# Extract tokens and convert them to strings
tokens = tokenizer.convert_ids_to_tokens(example["input_ids"])

# Create a DataFrame with tokens
df = pd.DataFrame({'Token': tokens})

# Display the DataFrame
print(df)


     Token
0    [CLS]
1        3
2        .
3      sis
4    ##eyi
..     ...
507  [PAD]
508  [PAD]
509  [PAD]
510  [PAD]
511  [PAD]

[512 rows x 1 columns]


In [54]:
# Load model directly
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", num_labels=13)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import TrainingArguments, Trainer 
args = TrainingArguments( 
"test-ner",
evaluation_strategy = "epoch", 
learning_rate=2e-5, 
per_device_train_batch_size=16, 
per_device_eval_batch_size=16, 
num_train_epochs=3, 
weight_decay=0.01, 
) 

2024-05-17 17:35:27.041024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 17:35:27.041152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 17:35:27.163219: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [41]:
from transformers import DataCollatorForTokenClassification
import torch
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [44]:
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=5a836eaa03f3cafbe7fd2beeb2e218cddedb856013ea968fbb242618df432bde
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [45]:
import datasets 
metric = datasets.load_metric("seqeval") 

In [56]:
label_list

['ÜRÜN_DİĞER',
 'ETKİ',
 'TAT_KOKU',
 'HASTALIK',
 'BİYOMOLEKÜL',
 'MARKA_DİĞER',
 'DOZ',
 'SAĞLIK_ŞİKAYETLERİ',
 'MARKA',
 'KULLANICI',
 'YAN_ETKİ',
 'İÇERİK',
 'TAVSİYE_EDEN',
 'KULLANICI_DEMOGRAFİSİ']

In [57]:
example = final_dataset['train'][0]

In [None]:
example

In [60]:

labels_indices = example["labels"]

# Filter out padding tokens (-100)
valid_labels_indices = [idx for idx in labels_indices if idx != -100]

# Map indices to labels using label_list
labels = [label_list[idx] for idx in valid_labels_indices]

# Now you can use labels for further processing


In [64]:
metric.compute(predictions=[labels], references=[labels]) 

{'RÜN_DİĞER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [65]:
def compute_metrics(eval_preds): 
    """
    Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
    The function computes precision, recall, F1 score and accuracy.

    Parameters:
    eval_preds (tuple): A tuple containing the predicted logits and the true labels.

    Returns:
    A dictionary containing the precision, recall, F1 score and accuracy.
    """
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2) 
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax
    
    # We remove all the values where the label is -100
    predictions = [ 
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    
    true_labels = [ 
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

In [68]:
trainer = Trainer( 
    model, 
    args, 
   train_dataset=final_dataset["train"], 
   eval_dataset=final_dataset["valid"], 
   data_collator=data_collator, 
   tokenizer=tokenizer, 
   compute_metrics=compute_metrics 
) 
""""'train': tokenized_datasets_train, 'valid': tokenized_datasets_val"""

'"\'train\': tokenized_datasets_train, \'valid\': tokenized_datasets_val'

In [69]:
trainer.train() 

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/usr/local/src/pytorch/aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [13,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
