In [None]:
def binary_search(array, value):
    """Return the index of ``value`` in the sorted sequence ``array``.

    Iterative binary search over the inclusive index range ``[low, high]``.
    After every probe that does not hit, the updated ``low`` and ``high``
    bounds are printed as a trace. When the value is absent,
    ``"Not in the array"`` is printed.

    Args:
        array: Indexable sequence sorted in ascending order.
        value: Item to look for.

    Returns:
        The index of a matching element, or ``None`` if there is none.
    """
    # ``high`` is an inclusive bound, so it must start at the last valid
    # index. Starting at ``len(array)`` lets ``mid`` run one past the end
    # (IndexError for values above the maximum and for empty input).
    low, high = 0, len(array) - 1
    while low <= high:
        mid = (low + high) // 2
        probe = array[mid]
        if probe == value:
            return mid
        if value < probe:
            high = mid - 1
        else:
            low = mid + 1
        print(low, high)
    print("Not in the array")
    return None


# Demo: look up 89 in a small sorted list
binary_search([1, 3, 6, 7, 12, 25, 89, 100], 89)


4 7
6 7


6

In [None]:
import os

import evaluate
import torch
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Load the train/validation CSV splits, class-encode the ``category_id``
# column and rename it to ``labels``
datasets = load_dataset("csv", data_files={"train": "train.csv", "valid": "valid.csv"})
datasets = datasets.class_encode_column("category_id")
datasets = datasets.rename_column("category_id", "labels")


# Number of distinct classes found in the training split
num_labels = len(datasets["train"].features["labels"].names)
# Load pre-trained model and tokenizer
model_name = "./matcher"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
# ``from_config`` only instantiates the architecture with freshly initialised
# weights; ``from_pretrained`` is what actually loads the checkpoint. Since
# the base model is frozen below, it has to carry the pre-trained weights.
# ``ignore_mismatched_sizes`` lets a fresh head be created in case the
# checkpoint ships a classification head with a different number of labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Freeze the base model weights so that only the classification head trains.
# ``base_model`` resolves to the encoder regardless of the architecture name
# (e.g. ``model.bert`` for BERT checkpoints).
for param in model.base_model.parameters():
    param.requires_grad = False


# Tokenize the dataset (without padding)
def tokenize_function(examples):
    """Tokenize the ``title`` entries of ``examples`` with truncation enabled.

    No padding is requested here; the result of the tokenizer call is
    returned unchanged.
    """
    titles = examples["title"]
    encoded = tokenizer(titles, truncation=True)
    return encoded


tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=os.cpu_count())

# Prepare the dataset for training: the full training split, shuffled
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
# Evaluate on at most 10,000 shuffled validation rows. The size is clamped so
# that a validation split smaller than that does not make ``select`` fail
# with an out-of-range index.
shuffled_valid = tokenized_datasets["valid"].shuffle(seed=42)
eval_dataset = shuffled_valid.select(range(min(10_000, len(shuffled_valid))))

# Define a data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# One place for the batch size so the ``eval_steps`` computation below cannot
# drift away from the value actually passed to the trainer.
batch_size = 32
# Batches per pass over the training data (assumes a single device and no
# gradient accumulation)
approx_steps_per_epoch = len(train_dataset) // batch_size

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    # Evaluate roughly every 5% of an epoch, but never less often than every
    # step: a tiny training set would otherwise produce ``eval_steps=0``.
    eval_steps=max(1, int(0.05 * approx_steps_per_epoch)),
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="tensorboard",
    # NOTE(review): the transformers docs describe this argument as not used
    # directly by ``Trainer``; to actually resume, pass
    # ``resume_from_checkpoint`` to ``trainer.train(...)``.
    resume_from_checkpoint="./results",
)

# Define the metric for evaluation
metric = evaluate.load("precision")


def compute_metrics(eval_pred):
    """Compute precision for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by ``Trainer``;
            both are expected to be NumPy arrays, with the class scores on
            the last axis of ``logits``.

    Returns:
        The dictionary produced by the ``precision`` metric.
    """
    import numpy as np

    logits, labels = eval_pred
    # Trainer passes NumPy arrays here, which ``torch.argmax`` rejects, so
    # take the arg-max with NumPy instead.
    logits = np.asarray(logits)
    predictions = logits.argmax(axis=-1)
    # The metric's default ``average="binary"`` raises for more than two
    # classes; keep it for the two-class case and macro-average otherwise.
    average = "binary" if logits.shape[-1] == 2 else "macro"
    return metric.compute(predictions=predictions, references=labels, average=average)


# Wire the model, datasets, collator and metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,  # dynamic padding collator defined earlier
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Uncomment to run training
# trainer.train()

# Uncomment to run evaluation
# trainer.evaluate()

In [None]:
# ``datasets`` is the dict of splits returned by ``load_dataset``; features are
# defined per split, so inspect them on the training split.
datasets["train"].features

In [None]:
# Peek at the labels of the first ten shuffled training rows
first_rows = train_dataset[:10]
first_rows["labels"]

In [None]:
# Re-create the tokenizer from ``model_name`` (discarding any changes made to
# the previous object) and list its attributes for exploration
tokenizer = AutoTokenizer.from_pretrained(model_name)
dir(tokenizer)

In [None]:
# Display the tokenizer object as the cell result
tokenizer

In [None]:
def upper(text: str) -> str:
    """Return the upper-cased string form of ``text``.

    The argument is passed through ``str()`` first, so non-string values are
    stringified before being upper-cased.
    """
    as_text = str(text)
    return as_text.upper()


# Assign the ``upper`` function to the tokenizer's ``normalizer`` attribute.
# NOTE(review): whether tokenization actually consults this attribute is not
# visible here — presumably the effective normalizer lives on the backend
# tokenizer and must be a ``tokenizers`` normalizer object; TODO confirm.
tokenizer.normalizer = upper

In [None]:
from tokenizers.tools import EncodingVisualizer

# EncodingVisualizer is documented to take a ``tokenizers.Tokenizer``. The
# object returned by ``AutoTokenizer`` is a transformers wrapper, so hand over
# the underlying tokenizer instead (assumes a fast tokenizer, which is the
# kind that exposes ``backend_tokenizer``).
temp = EncodingVisualizer(tokenizer.backend_tokenizer)

In [None]:
# Encode a sample title and map the resulting ids back to token strings
text = "Intel Işıklı"
encoded = tokenizer(text)
outputs = encoded["input_ids"]

tokenizer.convert_ids_to_tokens(outputs)

In [None]:
# Display the EncodingVisualizer object as the cell result
temp