In [None]:
from transformers import AutoTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
import json

# Load data
# JSON is UTF-8 by specification; pass the encoding explicitly so non-ASCII
# text in the sentences decodes the same way on every platform instead of
# depending on the locale's default encoding.
with open('sentiment_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
# It is used as a module-level global by the preprocessing and inference code.
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased')

def preprocess_data(data):
    """Flatten the annotated sentences into per-entity training examples.

    Each item in ``data`` is read through its ``'sentence'`` and ``'entities'``
    keys; each entry of ``'entities'`` is read through its ``'entity'`` and
    ``'sentiments'`` keys. One example is produced per (sentence, entity)
    pair, so a sentence with several entities is repeated.

    Returns:
        A 3-tuple ``(encodings, sentiment_labels, mlb)``:
        * ``encodings`` - result of calling the module-level ``tokenizer`` on
          the sentence/entity pairs (truncated and padded, max length 128).
        * ``sentiment_labels`` - multi-hot label matrix produced by the
          binarizer, columns ordered positive, negative, neutral.
        * ``mlb`` - the fitted ``MultiLabelBinarizer`` (needed later to map
          predictions back to label names).
    """
    # One (sentence, entity, sentiments) row per annotated entity.
    rows = []
    for item in data:
        text = item['sentence']
        for mention in item['entities']:
            rows.append((text, mention['entity'], mention['sentiments']))

    sentences = [row[0] for row in rows]
    entities = [row[1] for row in rows]
    sentiments = [row[2] for row in rows]

    # Fixed class list keeps the label column order stable across runs.
    mlb = MultiLabelBinarizer(classes=['positive', 'negative', 'neutral'])
    sentiment_labels = mlb.fit_transform(sentiments)

    # Sentence and entity are encoded together as a text pair.
    encodings = tokenizer(
        sentences,
        entities,
        truncation=True,
        padding=True,
        max_length=128,
    )

    return encodings, sentiment_labels, mlb

# Build the tokenized inputs, the multi-hot label matrix and the fitted
# binarizer once; later cells reuse all three names.
encodings, sentiment_labels, mlb = preprocess_data(data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class MultiLabelSentimentDataset(Dataset):
    """Dataset pairing tokenizer encodings with multi-hot sentiment labels.

    ``encodings`` must expose ``.items()`` yielding ``(field_name, column)``
    pairs where ``column[idx]`` is the value for example ``idx``;
    ``sentiment_labels`` is indexable by example and defines the length.
    """

    def __init__(self, encodings, sentiment_labels):
        self.encodings = encodings
        self.sentiment_labels = sentiment_labels

    def __len__(self):
        # The label container determines how many examples there are.
        return len(self.sentiment_labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])

        # Labels are cast to float (presumably because a BCE-style multi-label
        # loss expects float targets - confirm against the model configuration).
        label_row = torch.tensor(self.sentiment_labels[idx])
        sample['labels'] = label_row.float()
        return sample

from torch.utils.data import random_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed

# Tunables for this training run.
SEED = 42             # seeds the split, the classifier-head init and the Trainer
EVAL_FRACTION = 0.1   # share of examples held out for validation

# Seed before the model is created: the classification head is newly
# initialised (not loaded from the checkpoint), so its weights are random.
set_seed(SEED)

# Wrap the tokenized examples and their multi-hot labels for the Trainer.
dataset = MultiLabelSentimentDataset(encodings, sentiment_labels)

# Hold out a validation split. Evaluating on the training data itself makes
# the reported "Validation Loss" a second training loss and hides overfitting.
n_total = len(dataset)
n_eval = max(1, int(n_total * EVAL_FRACTION)) if n_total > 1 else 0
if n_eval:
    train_dataset, eval_dataset = random_split(
        dataset,
        [n_total - n_eval, n_eval],
        generator=torch.Generator().manual_seed(SEED),
    )
else:
    # Degenerate case (0 or 1 example): nothing to hold out.
    train_dataset = eval_dataset = dataset

# State the multi-label objective explicitly rather than relying on it being
# inferred from the dtype of the labels, and size the head from the binarizer
# so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-uncased',
    num_labels=len(mlb.classes_),
    problem_type='multi_label_classification',
)

# NOTE: recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if this raises a TypeError.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=10,
    save_strategy='steps',
    save_steps=50,
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
10,0.6303,0.546254
20,0.5215,0.432783
30,0.3549,0.306474
40,0.3601,0.273559
50,0.2588,0.221073
60,0.2646,0.186728
70,0.2019,0.1402
80,0.1511,0.102533
90,0.0994,0.067102
100,0.0955,0.060859


TrainOutput(global_step=280, training_loss=0.11847187309925045, metrics={'train_runtime': 5655.3208, 'train_samples_per_second': 0.746, 'train_steps_per_second': 0.05, 'total_flos': 175659039937080.0, 'train_loss': 0.11847187309925045, 'epoch': 20.0})

In [None]:
def evaluate(model, tokenizer, sentence, entities, mlb):
    """Predict the sentiment labels of each entity within one sentence.

    Args:
        model: Sequence-classification model; called with the tokenizer's
            output as keyword arguments and expected to return an object
            with a ``logits`` attribute.
        tokenizer: Callable that encodes a ``(sentence, entity)`` text pair
            and returns a mapping of field name to tensor.
        sentence: The sentence containing the entities.
        entities: Iterable of entity strings to score against ``sentence``.
        mlb: Fitted binarizer whose ``inverse_transform`` maps a 0/1 matrix
            back to tuples of label names.

    Returns:
        A list with one ``{"entity": ..., "sentiments": ...}`` dict per
        entity, in input order. ``sentiments`` holds every label whose
        sigmoid probability exceeds 0.5 and may be empty.
    """
    model.eval()

    # After training the model may live on a GPU; inputs must be on the same
    # device or the forward pass raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    results = []

    for entity in entities:
        inputs = tokenizer(sentence, entity, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = model(**inputs)

        # Independent per-label probabilities (multi-label), thresholded at 0.5.
        predictions = torch.sigmoid(outputs.logits).cpu().numpy()
        predicted_labels = (predictions > 0.5).astype(int)
        sentiments = mlb.inverse_transform(predicted_labels)[0]
        results.append({"entity": entity, "sentiments": sentiments})

    return results

# Quick smoke test of the fine-tuned model on a single sentence/entity pair.
# NOTE(review): the checkpoint name suggests a Turkish model while this sample
# sentence is English - confirm this is the intended sanity check.
sentence = "Apple has good quality but it is expensive."
entities = ["Apple"]
predictions = evaluate(
    model=model,
    tokenizer=tokenizer,
    sentence=sentence,
    entities=entities,
    mlb=mlb,
)

print(predictions)


[{'entity': 'Apple', 'sentiments': ('positive',)}]


In [None]:

# Write the fine-tuned model and its tokenizer files into the same directory.
save_dir = "./custom-sen-model-final"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./custom-sen-model-final/tokenizer_config.json',
 './custom-sen-model-final/special_tokens_map.json',
 './custom-sen-model-final/vocab.txt',
 './custom-sen-model-final/added_tokens.json',
 './custom-sen-model-final/tokenizer.json')

In [None]:
import shutil
from google.colab import files

# Directory holding the saved model, and the name of the archive that the
# later `mv` cell expects to find in the working directory.
model_dir = './custom-sen-model-final'
zip_file = 'custom-sen-model-final.zip'

# make_archive appends ".zip" to base_name, so using model_dir as the base
# produces a file whose name matches zip_file.
shutil.make_archive(base_name=model_dir, format='zip', root_dir=model_dir)

'/content/custom-sen-model-final.zip'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the archive can be moved there.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Move the archive named by the Python variable `zip_file` into the mounted Drive.
!mv {zip_file} /content/drive/MyDrive/

In [None]:
import pickle

# Persist the fitted MultiLabelBinarizer so predictions can be mapped back to
# label names in a later session. The file is written to the working directory.
with open('mlb.pkl', mode='wb') as mlb_file:
    pickle.dump(mlb, mlb_file)


In [None]:
import pickle

# Restore the MultiLabelBinarizer saved by the previous cell.
# CAUTION: unpickling can execute arbitrary code - only load an mlb.pkl that
# this notebook (or another trusted source) produced.
with open('mlb.pkl', mode='rb') as mlb_file:
    mlb = pickle.load(mlb_file)
