<a target="_blank" href="https://colab.research.google.com/github/okareo-ai/okareo-python-sdk/blob/main/examples/test_runs.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

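## Welcome to Okareo!

This notebook fine-tunes a DistilBERT sequence classifier on the WebBizz customer-question dataset (classes: pricing, returns, complaints), saves the model locally, and pushes it to the Hugging Face Hub.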



In [None]:
%pip install okareo transformers torch matplotlib

In [1]:
# Import pandas for data processing
import pandas as pd

# Load the WebBizz classification questions (one JSON object per line) into a DataFrame
data = pd.read_json(path_or_buf="https://raw.githubusercontent.com/okareo-ai/okareo-python-sdk/origin/feature/classification-example-model/examples/webbizz_classification_questions.jsonl", lines=True)

# Map each "result" category name to its integer class id
data["label"] = data["result"].map({"complaints": 2, "returns": 1, "pricing": 0})

# Series.map leaves NaN for any category missing from the mapping, which would
# silently turn the label column into floats and make training fail much later
# with an unrelated-looking error. Fail fast here, then store labels as integers.
unmapped = data.loc[data["label"].isna(), "result"].unique().tolist()
if unmapped:
    raise ValueError(f"Unmapped 'result' categories: {unmapped}")
data["label"] = data["label"].astype("int64")

In [2]:
# Load the DistilBERT model and tokenizer
from transformers import AutoTokenizer, DistilBertForSequenceClassification

# The tokenizer and the classifier are both initialised from the same pretrained checkpoint
checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Three output classes, matching the three label ids assigned to the data.
# NOTE(review): problem_type is "multi_label_classification" although each example
# carries a single class; training feeds one-hot float targets to match. Confirm
# whether "single_label_classification" was intended.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    problem_type="multi_label_classification",
    num_labels=3,
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load libraries for PyTorch
import torch
from torch.utils.data import DataLoader

# Split the data into training (80%) and validation (remaining 20%) sets.
# A fixed random_state makes the split reproducible across kernel restarts,
# so validation accuracy is comparable between runs.
train_data = data.sample(frac=0.8, random_state=42)
# Every row that was not sampled for training becomes validation data
val_data = data.drop(train_data.index)

# Create a custom dataset class for the text data
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        data: DataFrame with an "input" column (text) and a "label" column.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt", ...)``
            whose result exposes ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length each example is padded/truncated to. Defaults to 512,
            the value that was previously hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        # Look the row up once instead of once per column
        row = self.data.iloc[idx]

        # Tokenize the text, padding/truncating to a fixed length so examples can be batched
        encoding = self.tokenizer(
            row["input"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # Remove only the leading dimension of the returned tensors; a bare
        # squeeze() would also collapse any other dimension of size 1.
        return encoding.input_ids.squeeze(0), encoding.attention_mask.squeeze(0), row["label"]

# Wrap each DataFrame split in a TextDataset so it yields tokenized examples
train_data = TextDataset(data=train_data, tokenizer=tokenizer)
val_data = TextDataset(data=val_data, tokenizer=tokenizer)

# Batch the examples. Training batches are shuffled and a trailing partial
# batch is dropped; validation batches keep the dataset order.
BATCH_SIZE = 8
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE)

In [6]:
# AdamW optimizer over all of the model's parameters
LEARNING_RATE = 5e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# One epoch of training
def train(loader, model, optimizer, num_classes=3):
    """Run one training epoch and return the per-batch losses.

    Args:
        loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches,
            where ``labels`` holds integer class ids.
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``
            whose output exposes a ``loss`` tensor.
        optimizer: Optimizer whose ``zero_grad``/``step`` update the model.
        num_classes: Width of the one-hot label vectors. Defaults to 3, the
            value that was previously hard-coded; it should match the number
            of labels the model was created with.

    Returns:
        list[float]: The loss of every batch, in iteration order.
    """
    # Set the model to training mode
    model.train()
    print("Training...")
    losses = []
    for input_ids, attention_mask, labels in loader:
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()
        # One-hot encode the integer labels as float targets
        oh_labels = torch.nn.functional.one_hot(labels, num_classes=num_classes).to(torch.float32)
        # Pass the input IDs, attention mask, and one-hot labels to the model and get the loss
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=oh_labels).loss
        # Backpropagate the loss
        loss.backward()
        # Update the model parameters
        optimizer.step()
        # Track losses as plain Python floats
        losses.append(loss.item())
    return losses

# One epoch of validation
def validate(loader, model, epoch):
    """Evaluate the model on ``loader``, print the accuracy, and return it.

    Args:
        loader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches,
            where ``labels`` holds integer class ids.
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes a ``logits`` tensor with classes along dimension 1.
        epoch: Epoch number, used only in the printed message.

    Returns:
        float: Fraction of examples whose argmax prediction equals the label,
        or NaN when the loader yields no examples.
    """
    # Set the model to evaluation mode
    model.eval()
    print("Validating...")
    correct = 0
    total = 0
    # Disable gradient calculations (not needed for validation)
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            # Pass the input IDs and attention mask to the model and get the logits
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit
            predictions = torch.argmax(logits, dim=1)
            # Track accuracy
            correct += (predictions == labels).sum().item()
            total += len(labels)
    # An empty loader (e.g. a very small validation split) would otherwise
    # raise ZeroDivisionError; report NaN instead of a misleading number.
    accuracy = correct / total if total else float("nan")
    print(f"Epoch {epoch}, Validation Accuracy: {accuracy}")
    return accuracy

In [None]:
# Import matplotlib for plotting
import matplotlib.pyplot as plt

# Number of full passes over the training data
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    # One training pass; collect the per-batch losses
    losses = train(loader=train_loader, model=model, optimizer=optimizer)
    # Show this epoch's loss curve
    plt.title(f"Epoch {epoch} Loss")
    plt.plot(losses)
    plt.show()
    # Report accuracy on the validation set
    validate(loader=val_loader, model=model, epoch=epoch)

In [16]:
# Persist the trained model under a named local path
MODEL_DIR = "webbizz_classification_model"
model.save_pretrained(MODEL_DIR)

## Pushing to Hugging Face

In [7]:
# Reload the classifier from the path it was saved under
MODEL_DIR = "webbizz_classification_model"
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)

In [8]:
# Upload the model to the Hugging Face Hub under this repository name
HUB_REPO_NAME = "webbizz_classification_model"
model.push_to_hub(HUB_REPO_NAME)

model.safetensors: 100%|██████████| 268M/268M [03:25<00:00, 1.30MB/s]   


CommitInfo(commit_url='https://huggingface.co/sbroecker/webbizz_classification_model/commit/067ac98727c00482984f4717dd0d8de5ea3989ee', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='067ac98727c00482984f4717dd0d8de5ea3989ee', pr_url=None, pr_revision=None, pr_num=None)