In [None]:
%%capture
!pip install datasets
!pip install peft

In [None]:
import torch
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# English-Lithuanian configuration of OPUS-100, test split only.
opus_dataset_en_lt = load_dataset(
    path="Helsinki-NLP/opus-100",
    name="en-lt",
    split="test",
)

In [None]:
# Shuffle with a fixed seed, then keep the first 1000 rows as the sample.
# NOTE: this cell rebinds `opus_dataset_en_lt` to its shuffled version, so
# re-running it without reloading the dataset shuffles an already shuffled
# dataset and yields a different sample.
opus_dataset_en_lt = opus_dataset_en_lt.shuffle(seed=99)
opus_dataset_en_lt_sample = opus_dataset_en_lt.select(indices=range(1000))

In [None]:
# Hugging Face Hub repository ids. Each id is handed to
# AutoTokenizer.from_pretrained, so only the tokenizer of every repo is used.
# NOTE(review): some repos (e.g. meta-llama) may be gated and need Hub
# authentication before they can be downloaded - confirm.
model_names = [
    "ai-forever/mGPT",
    "EleutherAI/gpt-neo-1.3B",
    "meta-llama/Llama-3.2-1B",
    "bigscience/bloomz-1b7",
    "DAMO-NLP-MT/polylm-1.7b",
    "utter-project/EuroLLM-1.7B",
    "openai-community/gpt2-xl",
    "facebook/opt-1.3b",
    "stabilityai/stablelm-2-1_6b",
    "domce20/mGPT-lithuanian-tokenizer",
    "neurotechnology/Lt-Llama-2-7b-hf",
]

In [None]:
%%capture
# One tokenizer per entry of `model_names`, in the same order.
tokenizers = list(map(AutoTokenizer.from_pretrained, model_names))

In [None]:
def add_token_counts(entry, tokenizer, tokenizer_name, add_special_tokens=False):
  """Count the tokens of the Lithuanian and English side of one sentence pair.

  Args:
    entry: Mapping with a ``'translation'`` mapping holding the ``'lt'`` and
      ``'en'`` texts.
    tokenizer: Callable that accepts a text plus the ``add_special_tokens``
      keyword and returns an object exposing ``input_ids``.
    tokenizer_name: Prefix used for the two returned keys.
    add_special_tokens: Forwarded to the tokenizer. Defaults to ``False`` so
      that tokenizers which wrap every input in BOS/EOS markers are not
      penalised against tokenizers that add nothing; pass ``True`` to count
      those markers as well.

  Returns:
    Dict with ``"<tokenizer_name>_lt_tokens"`` and
    ``"<tokenizer_name>_en_tokens"`` mapped to the respective token counts.
  """
  translation = entry['translation']

  def count(text):
    # Only the number of ids matters, not the ids themselves.
    return len(tokenizer(text, add_special_tokens=add_special_tokens).input_ids)

  return {
      f"{tokenizer_name}_lt_tokens": count(translation['lt']),
      f"{tokenizer_name}_en_tokens": count(translation['en'])
  }

In [None]:
def calculate_tokens(tokenizer, tokenizer_name, dataset=None):
  """Add the token-count columns of one tokenizer to a dataset.

  Args:
    tokenizer: Tokenizer forwarded to ``add_token_counts``.
    tokenizer_name: Column-name prefix forwarded to ``add_token_counts``.
    dataset: Object with a ``map(function, fn_kwargs=...)`` method. When
      omitted, the notebook-level ``opus_dataset_en_lt_sample`` is used, which
      keeps existing two-argument calls working.

  Returns:
    Whatever ``dataset.map`` returns: the dataset extended by the columns
    produced by ``add_token_counts``.
  """
  if dataset is None:
    # Looked up at call time so the most recent binding of the global is used.
    dataset = opus_dataset_en_lt_sample
  return dataset.map(
      add_token_counts,
      fn_kwargs={"tokenizer": tokenizer, "tokenizer_name": tokenizer_name},
  )

In [None]:
%%capture
# Run every tokenizer over the sample. Each pass rebinds
# `opus_dataset_en_lt_sample` to the result of `calculate_tokens`, so the
# columns of all tokenizers accumulate on the same dataset.
for tokenizer in tokenizers:
  # `name_or_path` serves as the column-name prefix for this tokenizer.
  opus_dataset_en_lt_sample = calculate_tokens(tokenizer, tokenizer.name_or_path)

In [None]:
# Convert the annotated sample to a pandas DataFrame for aggregation and plotting.
df = opus_dataset_en_lt_sample.to_pandas()

In [None]:
# Column-wise mean; `numeric_only=True` leaves out non-numeric columns, so the
# result covers the `*_lt_tokens` / `*_en_tokens` count columns.
df.mean(numeric_only=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_grouped_bar_chart(df):
    """Draw mean Lithuanian vs. English token counts per tokenizer as grouped bars.

    Every column of ``df`` whose name ends with ``_lt_tokens`` is treated as a
    Lithuanian count column. Its English counterpart is the column with the
    same prefix and the suffix ``_en_tokens``. Groups are ordered left to
    right by ascending mean Lithuanian token count.

    Args:
        df: pandas DataFrame holding paired ``<prefix>_lt_tokens`` and
            ``<prefix>_en_tokens`` numeric columns.

    Returns:
        None. The figure is displayed through ``plt.show()``.

    Raises:
        KeyError: If a ``_lt_tokens`` column has no ``_en_tokens`` counterpart.
    """
    lt_suffix = '_lt_tokens'
    en_suffix = '_en_tokens'

    lt_columns = [col for col in df.columns if col.endswith(lt_suffix)]

    # The sorted Series carries both the bar order (index) and the Lithuanian
    # averages (values), so the means are computed only once.
    lt_avgs = df[lt_columns].mean().sort_values(ascending=True)

    # Cut off the trailing suffix only. A blanket str.replace would also
    # rewrite a '_lt_' that occurs inside the tokenizer name itself and then
    # point at a column that does not exist.
    labels = [col[:-len(lt_suffix)] for col in lt_avgs.index]
    lt_counts = lt_avgs.tolist()
    en_counts = [df[label + en_suffix].mean() for label in labels]

    # Plotting
    plt.figure(figsize=(24, 14))
    x = range(len(labels))
    width = 0.35

    plt.bar(x, lt_counts, width, label="Lithuanian Tokens", alpha=0.9)
    # English bars sit one bar-width to the right of their Lithuanian partner.
    plt.bar([p + width for p in x], en_counts, width, label="English Tokens", color='gray', alpha=0.6)

    plt.xlabel("Model", fontsize=14)
    plt.ylabel("Average Token Count", fontsize=14)
    # Centre each tick between the two bars of its group.
    plt.xticks([p + width / 2 for p in x], labels, rotation=90, fontsize=16)
    plt.yticks(fontsize=16)
    plt.legend(fontsize=16)

    plt.tight_layout()
    plt.show()

# Plot the per-tokenizer averages from `df`, the DataFrame built from the annotated sample.
plot_grouped_bar_chart(df)
