In [None]:
from transformers import AutoTokenizer
import functions.prompts as prompts
import tiktoken

import anthropic

# Anthropic API client used by token() for remote token counting. No API key is
# passed here. NOTE(review): presumably the SDK reads it from the environment — confirm.
client = anthropic.Anthropic()


# Tokenizers loaded by model id; token() selects between them by the short names
# "gemma" and "llama".
gemma = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
llama = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")

In [None]:
def num_tokens_from_messages(messages, model="gpt-4o-mini-2024-07-18"):
    """Return the number of tokens used by a list of chat messages.

    Args:
        messages: Iterable of dicts (e.g. ``{"role": ..., "content": ...}``).
            Every value of every dict is encoded and counted, so values are
            expected to be strings.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        int: 3 tokens per message, plus the encoded length of every value,
        plus 1 extra token for each ``"name"`` key, plus 3 tokens that prime
        the assistant reply.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken does not know this model name; fall back to the encoding
        # used by the default gpt-4o family instead of failing the count.
        encoding = tiktoken.get_encoding("o200k_base")

    tokens_per_message = 3
    tokens_per_name = 1
    num_tokens = 3  # every reply is primed with <|start|>assistant<|message|>

    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            # disallowed_special=() makes strings such as "<|endoftext|>" that
            # appear in the document count as ordinary text rather than
            # raising ValueError.
            num_tokens += len(encoding.encode(value, disallowed_special=()))
            if key == "name":
                num_tokens += tokens_per_name
    return num_tokens


In [None]:
def token(model, text):
    """Count the tokens of ``text`` under the tokenizer selected by ``model``.

    ``"llama"`` and ``"gemma"`` use the module-level tokenizers of the same
    name, ``"openai"`` goes through ``num_tokens_from_messages`` with the text
    wrapped as a single user message, and every other value is sent to the
    Anthropic ``count_tokens`` endpoint.

    Returns:
        The token count reported by the chosen backend.
    """
    if model in ("llama", "gemma"):
        tokenizer = llama if model == "llama" else gemma
        encoded = tokenizer(text, return_tensors="pt")
        # The second dimension of input_ids is the sequence length.
        return encoded['input_ids'].shape[1]

    user_turn = {"role": "user", "content": text}

    if model == 'openai':
        return num_tokens_from_messages([user_turn])

    # Fallback for any other model name: ask the Anthropic API.
    counted = client.messages.count_tokens(
        model="claude-3-5-haiku-20241022",
        system="",
        messages=[user_turn],
    )
    return counted.input_tokens

In [None]:
import pandas as pd
# Load the paper table. process_row() reads each row's `id` column to locate the
# matching text file under output/.
df = pd.read_csv("dump/csv/papers.csv")
df.head(2)

In [None]:
import concurrent
from concurrent.futures import ThreadPoolExecutor
import os

def process_row(index, row):
    """Tokenize the text file belonging to one dataframe row with every model.

    Args:
        index: Row label; echoed back so the caller can match results to rows.
            Every tenth index is printed as a progress indicator.
        row: Mapping with an ``"id"`` entry; the text is read from
            ``output/<id>.txt``.

    Returns:
        ``(index, counts)`` where ``counts`` maps each model name to its token
        count, or ``(index, None)`` if reading or tokenizing failed. Failures
        are printed, not raised, so one bad row does not stop the batch.
    """
    if index % 10 == 0:
        print(index)

    try:
        with open(f"output/{row['id']}.txt", "r") as f:
            text = f.read()

        # Tokenize only after the file is closed: the calls below include a
        # network request, and many rows run concurrently, so holding the
        # handle open for their duration can exhaust file descriptors.
        results = {}
        for m in ["llama", "gemma", "openai", "anthropic"]:
            results[m] = token(m, text)

        return index, results
    except Exception as e:
        print(f"Error processing index {index}: {e}")
        return index, None

def main(max_workers=500):
    """Tokenize every row of the module-level ``df`` in a thread pool.

    Each row is handed to ``process_row``; for rows that succeed, one column
    per model name is written into ``df`` in place. Rows whose result is
    empty or ``None`` are left untouched, so their cells stay missing.

    Args:
        max_workers: Size of the thread pool. Defaults to 500, the value that
            was previously hard-coded; lower it if the remote API rate-limits.

    Returns:
        The same ``df`` object, updated in place.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks up front; process_row returns its own index, so no
        # future -> index mapping is needed.
        futures = [
            executor.submit(process_row, index, row)
            for index, row in df.iterrows()
        ]

        # Collect results as they complete, dropping failed rows.
        for future in concurrent.futures.as_completed(futures):
            index, result = future.result()
            if result:
                results.append((index, result))

    # Write into the dataframe only after the pool has shut down, from the
    # calling thread.
    for index, result_dict in results:
        for m, value in result_dict.items():
            df.loc[index, m] = value

    return df

In [None]:
# Run the full tokenization pass; main() writes one column per model into the
# module-level `df` and returns it.
main()

In [None]:
# Persist the dataframe (including any token-count columns added above) without
# the index column.
df.to_csv("token.csv", index=False)

In [None]:
def test(text):
    """Build the three-message chat list for a top-5 scoring request.

    The system message combines ``prompts.top5()`` with the JSON schema of
    ``prompts.Top5Model`` and the JSON-only instructions; ``text`` becomes the
    user message; the final assistant message is pre-filled with ``"{"``.

    Returns:
        list[dict]: ``[system, user, assistant]`` message dicts.
    """
    system_prompt = f"{prompts.top5()}\nPlease respond in valid JSON format that matches this schema: {str(prompts.Top5Model.model_json_schema())}. **IMPORTANT**: ONLY RESPOND WITH A JSON OBJECT CONTAINING SCORES ACCORDING TO THE ABOVE SCHEMA. THE RESPONSE MUST END WITH A CURLY BRACKET. DO NOT ADD ANALYSIS OR EXPLANATION."

    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": text}
    # Assistant turn pre-filled with the opening brace of the JSON object.
    assistant_prefill = {"role": "assistant", "content": "{"}

    return [system_message, user_message, assistant_prefill]

In [None]:
# Read one sample document. The context manager closes the file even if
# read() raises, which the previous manual open()/close() pair did not.
with open("output/1_0.txt", "r") as f:
    text = f.read()

# Count the input tokens of the full request: system prompt + document +
# pre-filled assistant "{" turn.
response = client.messages.count_tokens(
    model="claude-3-5-haiku-20241022",
    system=f"{prompts.top5()}\nPlease respond in valid JSON format that matches this schema: {str(prompts.Top5Model.model_json_schema())}. **IMPORTANT**: ONLY RESPOND WITH A JSON OBJECT CONTAINING SCORES ACCORDING TO THE ABOVE SCHEMA. THE RESPONSE MUST END WITH A CURLY BRACKET. DO NOT ADD ANALYSIS OR EXPLANATION.",
    messages=[
        {"role": "user", "content": text},
        {"role": "assistant", "content": "{"},
    ],
)

print(response.input_tokens)

In [None]:
from openai import OpenAI

# OpenAI client constructed with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
openaiClient = OpenAI()

# Send the top-5 system prompt plus the sample document (`text`, read in the
# earlier cell) and request a response parsed into prompts.Top5Model.
# The result is kept in `x`.
x = openaiClient.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[{
            "role": "system",
            "content": prompts.top5()
        }, {
        "role": "user",
        "content": text
    }],
    response_format=prompts.Top5Model
)

In [None]:
# Fixed per-model token overhead added to every document's count.
# NOTE(review): presumably the measured prompt size for each tokenizer — confirm.
t = {
    "anthropic": 343,
    "gemma": 315,
    "llama": 324,
    "openai": 248
}

# Per-model totals: document tokens plus the per-document overhead.
s = {
    "anthropic": 0,
    "gemma": 0,
    "llama": 0,
    "openai": 0
}

for m in ["llama", "gemma", "openai", "anthropic"]:
    # Rows that failed during tokenization have missing counts. Skip them so a
    # single missing value does not turn the whole total (and average) into NaN,
    # and average over the rows that actually have a count.
    counts = df[m].dropna()
    s[m] += counts.sum() + t[m] * len(counts)
    print(m, s[m] / len(counts) if len(counts) else float("nan"))