In [1]:
import numpy as np
import torch
import transformers

# Module-level loss/activation objects shared by the scoring helpers below.
# reduction="none" keeps one loss value per position instead of averaging.
ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
# Softmax over the last dimension of its input.
softmax_fn = torch.nn.Softmax(dim=-1)


def perplexity(encoding: transformers.BatchEncoding,
               logits: torch.Tensor,
               median: bool = False,
               temperature: float = 1.0):
    """
    Aggregate next-token cross-entropy of ``logits`` against the input ids.

    Position ``t`` of the logits is scored against token ``t + 1`` of
    ``encoding.input_ids``. No ``exp`` is applied: the value returned is the
    mean (or median) cross-entropy per sequence.

    Args:
        encoding: Tokenizer output providing ``input_ids`` and ``attention_mask``
        logits: Model logits; the second-to-last axis is the sequence axis
        median: If True, take the NaN-aware median over unmasked positions
            instead of the mask-weighted mean
        temperature: Divisor applied to the logits before the loss

    Returns:
        NumPy array with one aggregated value per sequence
    """
    next_token_logits = logits[..., :-1, :].contiguous() / temperature
    targets = encoding.input_ids[..., 1:].contiguous()
    target_mask = encoding.attention_mask[..., 1:].contiguous()

    # CrossEntropyLoss expects the class axis at dim 1, hence the transpose.
    token_ce = ce_loss_fn(next_token_logits.transpose(1, 2), targets)

    if not median:
        mean_ce = (token_ce * target_mask).sum(1) / target_mask.sum(1)
        return mean_ce.to("cpu").float().numpy()

    # Masked positions become NaN so nanmedian skips them.
    masked_ce = token_ce.masked_fill(~target_mask.bool(), float("nan"))
    return np.nanmedian(masked_ce.cpu().float().numpy(), 1)


def entropy(p_logits: torch.Tensor,
            q_logits: torch.Tensor,
            encoding: transformers.BatchEncoding,
            pad_token_id: int,
            median: bool = False,
            sample_p: bool = False,
            temperature: float = 1.0):
    """
    Aggregate cross-entropy of the ``q`` logits against the ``p`` distribution.

    Args:
        p_logits: Logits whose softmax is used as the target distribution
        q_logits: Logits being scored against that target
        encoding: Tokenizer output; only ``input_ids`` is read, to find padding
        pad_token_id: Positions whose input id equals this value are ignored
        median: If True, take the NaN-aware median instead of the masked mean
        sample_p: If True, draw one token id per position from ``p`` and use
            it as a hard target instead of the full distribution
        temperature: Divisor applied to both sets of logits

    Returns:
        NumPy array with one aggregated value per sequence
    """
    n_vocab = p_logits.shape[-1]
    seq_len = q_logits.shape[-2]
    p_scaled = p_logits / temperature
    q_scaled = q_logits / temperature

    # Flatten to (positions, vocab) so every position is one loss "sample".
    targets = softmax_fn(p_scaled).view(-1, n_vocab)
    if sample_p:
        targets = torch.multinomial(targets.view(-1, n_vocab), replacement=True, num_samples=1).view(-1)

    flat_q = q_scaled.view(-1, n_vocab)
    token_ce = ce_loss_fn(input=flat_q, target=targets).view(-1, seq_len)
    keep = (encoding.input_ids != pad_token_id).type(torch.uint8)

    if median:
        masked_ce = token_ce.masked_fill(~keep.bool(), float("nan"))
        return np.nanmedian(masked_ce.cpu().float().numpy(), 1)

    mean_ce = (token_ce * keep).sum(1) / keep.sum(1)
    return mean_ce.to("cpu").float().numpy()


In [3]:
from typing import Union

import os
import numpy as np
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer



# This notebook only runs inference, so autograd is switched off globally.
torch.set_grad_enabled(False)

huggingface_config = {
    # Only required for private models from Huggingface (e.g. LLaMA models).
    # Read from the environment instead of being written into the notebook;
    # an unset or empty variable becomes None, i.e. anonymous access.
    "TOKEN": os.environ.get("HF_TOKEN") or None
}

# selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843  # optimized for f1-score
BINOCULARS_FPR_THRESHOLD = 0.8536432310785527  # optimized for low-fpr [chosen at 0.01%]

# First model goes on the first GPU if there is one; the second model gets its
# own GPU when at least two are visible, otherwise it shares DEVICE_1.
DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1


class Binoculars(object):
    """Scores text with an observer/performer pair of causal language models.

    ``compute_score`` returns ``perplexity(performer) / entropy(observer,
    performer)`` per text; ``predict`` labels a text as AI-generated when that
    score is below the threshold selected by ``mode``.
    """

    def __init__(self,
                 # Alternative checkpoints kept for reference:
                 #  observer_name_or_path: str = "tiiuae/Falcon3-3B-Base",
                 #  performer_name_or_path: str = "tiiuae/Falcon3-3B-Base-1.58bit",

                 observer_name_or_path: str = "tiiuae/falcon-7b",
                 performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
                 use_bfloat16: bool = True,
                 max_token_observed: int = 512,
                 mode: str = "low-fpr",
                 ) -> None:
        """
        Load both models and the observer's tokenizer.

        Args:
            observer_name_or_path: Checkpoint loaded onto ``DEVICE_1``
            performer_name_or_path: Checkpoint loaded onto ``DEVICE_2``
            use_bfloat16: Load weights as bfloat16 (True) or float32 (False)
            max_token_observed: Truncation length used by ``tokenize``
            mode: ``"low-fpr"`` or ``"accuracy"``; selects the threshold

        Raises:
            ValueError: If ``mode`` is not one of the two supported values
        """
        # Validate the mode first so a typo fails before any model is loaded.
        self.change_mode(mode)

        model_dtype = torch.bfloat16 if use_bfloat16 else torch.float32
        hf_token = huggingface_config["TOKEN"]

        self.observer_model = AutoModelForCausalLM.from_pretrained(observer_name_or_path,
                                                                   device_map={"": DEVICE_1},
                                                                   trust_remote_code=True,
                                                                   torch_dtype=model_dtype,
                                                                   token=hf_token
                                                                   )
        self.performer_model = AutoModelForCausalLM.from_pretrained(performer_name_or_path,
                                                                    device_map={"": DEVICE_2},
                                                                    trust_remote_code=True,
                                                                    torch_dtype=model_dtype,
                                                                    token=hf_token
                                                                    )
        self.observer_model.eval()
        self.performer_model.eval()

        # Pass the same token as for the models so a private/gated observer
        # checkpoint can also serve its tokenizer files.
        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path, token=hf_token)
        if not self.tokenizer.pad_token:
            # No dedicated pad token: reuse EOS so batched padding works.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.max_token_observed = max_token_observed

    def change_mode(self, mode: str) -> None:
        """Select the decision threshold: ``"low-fpr"`` or ``"accuracy"``.

        Raises:
            ValueError: For any other ``mode`` value
        """
        if mode == "low-fpr":
            self.threshold = BINOCULARS_FPR_THRESHOLD
        elif mode == "accuracy":
            self.threshold = BINOCULARS_ACCURACY_THRESHOLD
        else:
            raise ValueError(f"Invalid mode: {mode}")

    def tokenize(self, batch: list[str]) -> "transformers.BatchEncoding":
        """Tokenize ``batch``, truncating to ``max_token_observed`` tokens.

        Padding to the longest item is only requested when ``len(batch) > 1``.
        The result is moved to the observer model's device.
        """
        batch_size = len(batch)
        encodings = self.tokenizer(
            batch,
            return_tensors="pt",
            padding="longest" if batch_size > 1 else False,
            truncation=True,
            max_length=self.max_token_observed,
            return_token_type_ids=False).to(self.observer_model.device)
        return encodings

    @torch.inference_mode()
    def get_logits(self, encodings: "transformers.BatchEncoding") -> tuple[torch.Tensor, torch.Tensor]:
        """Run both models on ``encodings``.

        Returns:
            ``(observer_logits, performer_logits)``; each stays on the device
            of the model that produced it.

        Note:
            ``BatchEncoding.to`` moves the tensors of the object it is called
            on, so after this call ``encodings`` lives on ``DEVICE_2``.
        """
        observer_logits = self.observer_model(**encodings.to(DEVICE_1)).logits
        performer_logits = self.performer_model(**encodings.to(DEVICE_2)).logits
        if DEVICE_1 != "cpu":
            torch.cuda.synchronize()
        return observer_logits, performer_logits

    def compute_score(self, input_text: Union[list[str], str]) -> Union[float, list[float]]:
        """Return the Binoculars score for one text or a list of texts.

        A single string yields a single float; a list yields a list of floats.
        """
        batch = [input_text] if isinstance(input_text, str) else input_text
        encodings = self.tokenize(batch)
        observer_logits, performer_logits = self.get_logits(encodings)
        # Place the encodings explicitly next to the performer logits instead
        # of relying on where get_logits happened to leave them.
        ppl = perplexity(encodings.to(DEVICE_2), performer_logits)
        x_ppl = entropy(observer_logits.to(DEVICE_1), performer_logits.to(DEVICE_1),
                        encodings.to(DEVICE_1), self.tokenizer.pad_token_id)
        binoculars_scores = ppl / x_ppl
        binoculars_scores = binoculars_scores.tolist()
        return binoculars_scores[0] if isinstance(input_text, str) else binoculars_scores

    def predict(self, input_text: Union[list[str], str]) -> Union[list[str], str]:
        """Label each text by comparing its score with ``self.threshold``.

        Scores strictly below the threshold map to "Most likely AI-generated",
        all others to "Most likely human-generated".
        """
        binoculars_scores = np.array(self.compute_score(input_text))
        pred = np.where(binoculars_scores < self.threshold,
                        "Most likely AI-generated",
                        "Most likely human-generated"
                        ).tolist()
        return pred

# Build the detector with its default arguments; this loads both models.
bino = Binoculars()


config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [6]:
# Short smoke-test input.
sample_string = '''Dr. Capy Cosmos'''

# NOTE(review): the earlier inline expectations (0.75661373 / 'Most likely AI-Generated')
# do not match what this cell recorded for this string (about 1.05 /
# 'Most likely human-generated'); presumably they belonged to a different sample text.
print(bino.compute_score(sample_string))  # Binoculars score (float)
print(bino.predict(sample_string))  # label relative to the active threshold

1.0523256063461304
Most likely human-generated


In [9]:
tokenized = bino.tokenize(sample_string)
# Run the two models once and reuse the result; calling get_logits per
# expression would repeat both forward passes just to read a shape.
observer_logits, performer_logits = bino.get_logits(tokenized)
observer_logits.shape, performer_logits.shape

(torch.Size([1, 6, 65024]), torch.Size([1, 6, 65024]))

In [10]:
def per_token_perplexity(encoding: "transformers.BatchEncoding",
                         logits: torch.Tensor,
                         temperature: float = 1.0):
    """
    Calculate per-token perplexity for each token in the sequence.

    Position ``t`` of ``logits`` is scored against token ``t + 1`` of
    ``encoding.input_ids``, so the output is one element shorter than the input.

    Args:
        encoding: BatchEncoding from the tokenizer (``input_ids`` and
            ``attention_mask`` are read); it may live on a different device
            than ``logits``
        logits: Model output logits of shape [batch_size, sequence_length, vocab_size]
        temperature: Softmax temperature parameter

    Returns:
        per_token_ppl: Per-token perplexity tensor of shape
            [batch_size, sequence_length-1], on the device of ``logits``;
            positions masked out by the attention mask are NaN
    """
    # The encodings and the logits can end up on different devices (two-GPU
    # setup), so bring labels and mask to wherever the logits are.
    device = logits.device
    shifted_logits = logits[..., :-1, :].contiguous() / temperature
    shifted_labels = encoding.input_ids[..., 1:].contiguous().to(device)
    keep = encoding.attention_mask[..., 1:].contiguous().to(device).bool()

    # cross_entropy expects the class axis at dim 1, hence the transpose.
    # Shape: [batch_size, sequence_length-1]
    ce_per_token = torch.nn.functional.cross_entropy(
        shifted_logits.transpose(1, 2), shifted_labels, reduction="none")

    # Convert to perplexity (exp of cross-entropy), then blank out padding.
    per_token_ppl = torch.exp(ce_per_token)
    return per_token_ppl.masked_fill(~keep, float("nan"))


def per_token_entropy(p_logits: torch.Tensor,
                      q_logits: torch.Tensor,
                      encoding: "transformers.BatchEncoding",
                      pad_token_id: int,
                      sample_p: bool = False,
                      temperature: float = 1.0):
    """
    Calculate per-token cross-entropy between distributions p and q.

    Args:
        p_logits: Logits from model p (their softmax is the target)
        q_logits: Logits from model q (scored against the target)
        encoding: BatchEncoding from the tokenizer; only ``input_ids`` is read
        pad_token_id: Token ID for padding; every position whose input id
            equals it is masked, including real occurrences of that token
        sample_p: Whether to sample from p distribution
        temperature: Softmax temperature parameter

    Returns:
        per_token_ce: Per-token cross-entropy tensor of shape
            [batch_size, sequence_length], on the device of ``q_logits``;
            padded positions are NaN
    """
    # The two models may sit on different GPUs; compute everything where the
    # q logits are.
    device = q_logits.device
    vocab_size = p_logits.shape[-1]
    total_tokens_available = q_logits.shape[-2]

    p_scores = p_logits.to(device) / temperature
    # reshape (not view) so non-contiguous inputs are accepted as well.
    q_scores = (q_logits / temperature).reshape(-1, vocab_size)

    p_proba = torch.softmax(p_scores, dim=-1).reshape(-1, vocab_size)

    if sample_p:
        # Replace the soft target with one sampled token id per position.
        p_proba = torch.multinomial(p_proba, replacement=True, num_samples=1).view(-1)

    # Calculate cross-entropy for each token
    ce = torch.nn.functional.cross_entropy(
        q_scores, p_proba, reduction="none").view(-1, total_tokens_available)

    # Mask padded tokens with NaN
    is_padding = (encoding.input_ids == pad_token_id).to(device)
    return ce.masked_fill(is_padding, float("nan"))

# One forward pass through both models, then all four per-token views of it.
logits = bino.get_logits(tokenized)
observer_logits, performer_logits = logits
print(observer_logits.shape, performer_logits.shape)

pad_id = bino.tokenizer.pad_token_id
print(
    per_token_perplexity(tokenized, observer_logits),
    per_token_perplexity(tokenized, performer_logits),
    per_token_entropy(observer_logits, performer_logits, tokenized, pad_id),
    per_token_entropy(performer_logits, observer_logits, tokenized, pad_id),
)

torch.Size([1, 6, 65024]) torch.Size([1, 6, 65024])
tensor([[1.5547e+00, 1.7152e+04, 1.4960e+03, 9.7920e+03, 3.2812e+00]],
       device='cuda:0', dtype=torch.bfloat16) tensor([[2.0469e+00, 1.9456e+04, 4.0400e+02, 1.7152e+04, 6.3750e+00]],
       device='cuda:0', dtype=torch.bfloat16) tensor([[3.1719, 7.3438, 5.1875, 7.5000, 3.9688, 5.0312]], device='cuda:0',
       dtype=torch.bfloat16) tensor([[5.0625, 7.2188, 5.2812, 7.3125, 4.6250, 5.8125]], device='cuda:0',
       dtype=torch.bfloat16)


In [11]:
# List the files in the project's data folder (the path suggests a mounted
# Google Drive; confirm the drive is mounted before running).
import os
FOLDER = '/content/drive/MyDrive/course_project'
print(os.listdir(f'{FOLDER}/data'))

['combined_data_filtered_train1.csv_', 'result_processed.json', 'result2_processed.json', 'result2_fixed.json', 'result_processed', 'input_df.csv', 'result_processed2']


In [12]:
import pandas as pd

# Keep only the text/label columns and rows with fewer than 500
# whitespace-separated words.
count_words = lambda text: len(text.split())

df = pd.read_csv(f'{FOLDER}/data/combined_data_filtered_train1.csv_')[['text', 'label']]
word_counts = df['text'].apply(count_words)
is_short = word_counts < 500
df = df[is_short]

print(len(df))
# Word-count summary of the rows that survived the filter.
print(word_counts[is_short].describe())
df.head()

32694
count    32694.000000
mean       217.631400
std        122.738186
min          1.000000
25%        121.000000
50%        189.000000
75%        308.000000
max        499.000000
Name: text, dtype: float64


Unnamed: 0,text,label
0,", and Environmental Sciences, School of Public...",1
1,R & Bioconductor Manual\n\nmyDF <- as.data.fra...,0
2,Feminist ethics is founded on the views that w...,0
4,The LHC primarily produces a light charged Hig...,1
5,As a high school student struggling to find a ...,1


In [13]:
# Write the filtered frame to CSV without the index column.
df.to_csv(f'{FOLDER}/data/input_df.csv', index=False)

In [14]:
def process2scores(text):
    """
    Compute the four per-token score lists for ``text``.

    Returns ``[l1, l2, l3, l4]`` as plain Python lists taken from the first
    row of each tensor:
        l1: per-token perplexity from the first (observer) logits
        l2: per-token perplexity from the second (performer) logits
        l3: per-token cross-entropy with the observer as p, performer as q
        l4: per-token cross-entropy with the performer as p, observer as q

    The four list lengths are printed as a side effect.
    """
    tokenized = bino.tokenize(text)
    observer_logits, performer_logits = bino.get_logits(tokenized)
    pad_id = bino.tokenizer.pad_token_id

    raw_scores = (
        per_token_perplexity(tokenized, observer_logits),
        per_token_perplexity(tokenized, performer_logits),
        per_token_entropy(observer_logits, performer_logits, tokenized, pad_id),
        per_token_entropy(performer_logits, observer_logits, tokenized, pad_id),
    )

    # float32 -> CPU -> numpy -> nested list, then keep the first row only.
    scores = [tensor.float().cpu().numpy().tolist()[0] for tensor in raw_scores]

    print(*(len(values) for values in scores))
    return scores

import json

# Smoke-test on the first remaining row. The frame was filtered, so its index
# labels have gaps; .iloc is positional and cannot raise KeyError on label 0.
json.dumps(process2scores(df['text'].iloc[0]))

511 511 512 512


'[[13.375, 43776.0, 16.375, 4.875, 127.0, 1.015625, 49.75, 1.46875, 1.828125, 4.65625, 1.0390625, 70.0, 1.9296875, 1.125, 1.015625, 1.7421875, 1.1015625, 1.3359375, 1.765625, 1.0625, 230.0, 1.3828125, 1.03125, 1.6328125, 8.25, 1.2421875, 1.0859375, 2.734375, 2.203125, 688.0, 20.375, 1000.0, 2.671875, 6.3125, 1.0, 4.03125, 1.3203125, 36.25, 14.25, 72.5, 17.5, 334.0, 1.25, 1.3359375, 26.625, 1.0078125, 1.0, 1.2890625, 2.484375, 4.9375, 18.625, 1.0546875, 1.09375, 1.0859375, 1.09375, 223.0, 20.75, 7.875, 15168.0, 1.171875, 2.421875, 1.203125, 108.5, 17.75, 223.0, 1.78125, 1.2109375, 1646592.0, 1.15625, 1.0, 1.0, 1.078125, 26.25, 1.25, 21.375, 1.46875, 664.0, 1.0234375, 23.5, 1.0546875, 1.03125, 1.5546875, 2.390625, 1.0078125, 1.0, 1.203125, 1.046875, 9.0625, 1.0078125, 52.0, 1.015625, 1.203125, 8.9375, 1.0078125, 9.0625, 1.0, 1.0625, 223.0, 1.03125, 4.875, 1.0, 1.0, 1.03125, 1.2265625, 1.0078125, 2.71875, 1.03125, 13.5625, 1.2734375, 1.1171875, 1.015625, 1.0078125, 1.0234375, 1.0390625, 1

In [None]:
import json
import os
import time
from tqdm.auto import tqdm

def process_dataframe(df, output_filename, start_index=0):
    """
    Process each row in the dataframe, showing progress and checkpointing to disk.

    A checkpoint holding ALL results collected so far is written every 100 rows
    (and after the last row) to ``{output_filename}/{i}_processed.json``. When a
    row fails, the error is printed, the results so far are written to
    ``{output_filename}_processed.json`` and processing continues with the next row.

    Args:
        df: Pandas DataFrame with a 'text' column
        output_filename: Base path; used as the checkpoint directory and as the
            prefix of the on-error file
        start_index: Positional row index to start processing from (in case of
            resuming). Earlier results are reloaded from
            ``{output_filename}/{start_index - 1}_processed.json`` if it exists,
            otherwise from ``{output_filename}_processed.json``.

    Returns:
        List of ``{'index': i, 'scores': ...}`` dicts for the rows that succeeded
    """
    save_every = 100
    error_path = f"{output_filename}_processed.json"

    # Initialize results list
    results = []

    def _dump_results(path):
        # Write to a sibling temp file and rename, so an interrupted run cannot
        # leave a truncated JSON file behind.
        tmp_path = f"{path}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(results, f)
        os.replace(tmp_path, path)

    # Load existing results when resuming. Periodic checkpoints and the
    # on-error file live at different paths, so look at both.
    if start_index > 0:
        resume_candidates = (
            f"{output_filename}/{start_index - 1}_processed.json",
            error_path,
        )
        for resume_path in resume_candidates:
            if not os.path.exists(resume_path):
                continue
            with open(resume_path, 'r') as f:
                results = json.load(f)

            # Verify we have the expected number of processed items
            if len(results) != start_index:
                print(f"Warning: Found {len(results)} processed items but expected {start_index}")
            break

    # Process each row with progress bar
    for i in tqdm(range(start_index, len(df)), desc="Processing rows"):
        row = df.iloc[i]

        # Process the text
        try:
            scores = process2scores(row['text'])
        except Exception as e:
            print(f"Error processing row {i}: {str(e)}")
            # Save on error to preserve progress
            _dump_results(error_path)
            print(f"Saved progress at row {i} due to error")
            continue

        # Store the results
        results.append({
            'index': i,
            'scores': scores
        })

        # Save every `save_every` rows and after the final row
        if (i + 1) % save_every == 0 or i == len(df) - 1:
            try:
                # The checkpoint directory may not exist yet on a first run.
                os.makedirs(output_filename, exist_ok=True)
                _dump_results(f"{output_filename}/{i}_processed.json")
                print(f"Saved progress at row {i+1}/{len(df)}")
            except OSError as e:
                # Report I/O problems as such instead of as a row failure.
                print(f"Error saving checkpoint at row {i}: {str(e)}")

    print(f"Processing complete. Processed {len(results)} rows.")
    return results

# Example usage:
# process_dataframe(df, "my_dataset")

# Full run from the first row; pass start_index > 0 to resume from a specific index.
process_dataframe(df, f'{FOLDER}/data/result_processed2', start_index=0)

Processing rows:   0%|          | 0/32694 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
70 70 71 71
202 202 203 203
195 195 196 196
295 295 296 296
164 164 165 165
119 119 120 120
75 75 76 76
511 511 512 512
212 212 213 213
22 22 23 23
281 281 282 282
134 134 135 135
442 442 443 443
133 133 134 134
240 240 241 241
437 437 438 438
511 511 512 512
340 340 341 341
359 359 360 360
382 382 383 383
254 254 255 255
395 395 396 396
386 386 387 387
212 212 213 213
281 281 282 282
195 195 196 196
185 185 186 186
511 511 512 512
511 511 512 512
268 268 269 269
363 363 364 364
126 126 127 127
200 200 201 201
279 279 280 280
204 204 205 205
158 158 159 159
240 240 241 241
511 511 512 512
Saved progress at row 9400/32694
73 73 74 74
388 388 389 389
351 351 352 352
179 179 180 180
182 182 183 183
70 70 71 71
184 184 185 185
511 511 512 512
77 77 78 78
309 309 310 310
424 424 425 425
21 21 22 22
163 163 164 164
163 163 164 164
410 410 411 411
226 226 227 227
248 248 249 249
477 477 478 478
318 318 319 319
19 19 20 20
79 79 

In [None]:
import json

# Read one saved checkpoint and parse it to inspect its contents.
checkpoint_path = f'{FOLDER}/data/result_processed/99_processed.json'
with open(checkpoint_path, 'r') as file:
    content = file.read()

json.loads(content)

[{'index': 0,
  'scores': [[40.5,
    41216.0,
    21.0,
    7.28125,
    430.0,
    1.0234375,
    139.0,
    1.2734375,
    2.03125,
    2.78125,
    1.015625,
    60.0,
    4.0625,
    1.1328125,
    1.0234375,
    1.5703125,
    2.609375,
    1.5546875,
    2.1875,
    1.0625,
    664.0,
    58.0,
    1.0,
    1.03125,
    1.046875,
    1.0,
    1.21875,
    7.375,
    1.1015625,
    1.0625,
    5.25,
    1.609375,
    2400.0,
    21.375,
    1752.0,
    5.625,
    4.40625,
    1.0,
    1.796875,
    4.8125,
    34.75,
    26.625,
    12.375,
    14.9375,
    90.0,
    3.078125,
    1.21875,
    31.125,
    1.0078125,
    1.0078125,
    1.0625,
    52.0,
    4.875,
    1.5078125,
    17.5,
    46.75,
    1.671875,
    1.0390625,
    1.1875,
    1.0546875,
    416.0,
    40.0,
    16.375,
    2464.0,
    24.625,
    1.5625,
    197.0,
    6.71875,
    664.0,
    51.25,
    3.28125,
    1408.0,
    1.0546875,
    1.0078125,
    1.15625,
    19.5,
    1.1328125,
    1696.0,
    4.25,


In [None]:
def process_batch(texts, batch_size=8):
    """
    Score a list of texts, reporting progress in groups of ``batch_size``.

    Each text is passed to ``process2scores`` individually; ``batch_size`` only
    determines how the texts are grouped for the progress messages.

    Args:
        texts: List of text strings to process
        batch_size: Number of texts per progress group

    Returns:
        List with one ``process2scores`` result per text, in input order
    """
    total_batches = (len(texts) - 1) // batch_size + 1
    all_results = []

    for batch_no, start in enumerate(range(0, len(texts), batch_size), start=1):
        chunk = texts[start:start + batch_size]
        print(f"Processing batch {batch_no}/{total_batches} ({len(chunk)} texts)")

        # Score the whole group before adding any of it to the output.
        all_results.extend([process2scores(text) for text in chunk])

    return all_results

def process2scores(texts, verbose=False):
    """
    Compute per-token scores for one text or a batch of texts in a single pass.

    Args:
        texts: A single text string or a list of text strings to process
        verbose: Whether to print the four list lengths for every text

    Returns:
        If single text: ``[l1, l2, l3, l4]`` (two per-token perplexity lists,
        then two per-token cross-entropy lists)
        If batch: one such ``[l1, l2, l3, l4]`` list per input text
    """
    single_input = isinstance(texts, str)
    batch = [texts] if single_input else texts

    # One tokenizer call and one forward pass per model for the whole batch.
    tokenized = bino.tokenize(batch)
    observer_logits, performer_logits = bino.get_logits(tokenized)
    pad_id = bino.tokenizer.pad_token_id

    metric_tensors = (
        per_token_perplexity(tokenized, observer_logits),
        per_token_perplexity(tokenized, performer_logits),
        per_token_entropy(observer_logits, performer_logits, tokenized, pad_id),
        per_token_entropy(performer_logits, observer_logits, tokenized, pad_id),
    )

    # float32 -> CPU -> numpy -> nested Python lists (one row per text).
    l1, l2, l3, l4 = [tensor.float().cpu().numpy().tolist() for tensor in metric_tensors]

    if verbose:
        for i, (a, b, c, d) in enumerate(zip(l1, l2, l3, l4)):
            print(f"Text {i+1}/{len(batch)}: {len(a)}, {len(b)}, {len(c)}, {len(d)}")

    # Regroup from "per metric" to "per text".
    batch_results = [list(per_text) for per_text in zip(l1, l2, l3, l4)]

    return batch_results[0] if single_input else batch_results

# Update the DataFrame processing function to use true batch processing
def process_dataframe_in_batches(df, output_filename, batch_size=8, start_index=0):
    """
    Score the 'text' column of ``df`` batch by batch via ``process2scores``.

    All results collected so far are written to
    ``{output_filename}_processed.json`` every 10 batches, after the last
    batch, and whenever a batch raises (that batch is then skipped).

    Args:
        df: Pandas DataFrame with a 'text' column
        output_filename: Base filename for the output file
        batch_size: Number of texts to process in each batch
        start_index: Positional row index to start from; when > 0 and the
            output file exists, its contents seed the results list

    Returns:
        List of ``{'index': row_position, 'scores': ...}`` dicts
    """
    import json
    import os
    import time
    from tqdm.auto import tqdm

    checkpoint_path = f"{output_filename}_processed.json"
    total_rows = len(df)

    results = []

    # Resume: reload what an earlier run saved and sanity-check its size.
    if start_index > 0 and os.path.exists(checkpoint_path):
        with open(checkpoint_path, 'r') as f:
            results = json.load(f)

        if len(results) != start_index:
            print(f"Warning: Found {len(results)} processed items but expected {start_index}")

    # Ceiling division over the rows that are still to be processed.
    num_batches = (total_rows - start_index - 1) // batch_size + 1

    for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        batch_start = start_index + batch_idx * batch_size
        batch_end = min(batch_start + batch_size, total_rows)
        batch_texts = df.iloc[batch_start:batch_end]['text'].tolist()

        try:
            started = time.time()
            batch_scores = process2scores(batch_texts)
            elapsed = time.time() - started

            if batch_idx == 0:
                print(f"First batch processing time: {elapsed:.2f} seconds for {len(batch_texts)} texts")

            # Tag every score set with the positional row index it came from.
            results.extend([
                {'index': batch_start + offset, 'scores': scores}
                for offset, scores in enumerate(batch_scores)
            ])

            is_last_batch = batch_idx == num_batches - 1
            if (batch_idx + 1) % 10 == 0 or is_last_batch:
                with open(checkpoint_path, 'w') as f:
                    json.dump(results, f)
                print(f"Saved progress after batch {batch_idx+1}/{num_batches} ({len(results)} rows processed)")

        except Exception as e:
            print(f"Error processing batch starting at row {batch_start}: {str(e)}")
            # Save on error to preserve progress
            with open(checkpoint_path, 'w') as f:
                json.dump(results, f)
            print(f"Saved progress at row {batch_start} due to error")

    print(f"Processing complete. Processed {len(results)} rows.")
    return results



# NOTE(review): this calls the row-by-row process_dataframe, not the
# process_dataframe_in_batches defined in this cell; confirm which was intended.
process_dataframe(df, f'{FOLDER}/data/result', start_index=0)

Processing rows:   0%|          | 0/32694 [00:00<?, ?it/s]

Saved progress at row 10/32694
Saved progress at row 20/32694
Saved progress at row 30/32694
Saved progress at row 40/32694
Saved progress at row 50/32694


KeyboardInterrupt: 