In [None]:
from datasets import load_dataset
from dotenv import load_dotenv
import transformers

In [None]:
# Load environment variables from a local .env file (python-dotenv).
# NOTE(review): presumably this supplies the Hugging Face token needed by the
# Hub download/upload calls later in the notebook — confirm.
load_dotenv()

In [None]:
# Load only the 'train' split of the combined Darija dataset by its Hub repo id.
dataset = load_dataset("mohamed-stifi/darija-combined-dataset", split='train')

In [None]:
# Bare expression: display the loaded split's summary as the cell output.
dataset

In [None]:
# Preview the first five documents.
# Slice the rows first and select the column second, so only five rows are
# read instead of materialising the whole 'text' column just to show a few.
dataset[:5]['text']

In [None]:
# Baseline tokenizer that will later be extended with Darija-specific tokens.
model_id = 'google/gemma-3n-E2B-it'

# `trust_remote_code=True` is intentionally not passed: it allows Python code
# stored in the Hub repository to run locally, and it is not needed to load a
# tokenizer class that ships with transformers.
# NOTE(review): re-enable it only if the installed transformers version
# genuinely cannot load this checkpoint's tokenizer without it.
current_tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

In [None]:
# Snapshot the token strings of the baseline vocabulary and report how many
# there are (iterating the vocab mapping yields its keys).
existing_tokens = set(current_tokenizer.vocab)
print(len(existing_tokens))

In [None]:
# Fixed sample document (row 30000) used to eyeball tokenizer output.
text = dataset[30000]['text']

# Token strings the baseline tokenizer produces for the sample.
print(current_tokenizer.tokenize(text))

In [None]:
# Number of tokens the baseline tokenizer needs for whatever `text` currently holds.
len(current_tokenizer.tokenize(text))

In [None]:
# Pull the whole 'text' column; it is reused for statistics and tokenizer training.
texts = dataset['text']
# Number of documents in the column.
len(texts)

In [None]:
# Per-document statistics under the baseline tokenizer:
#   text_lenght      - character count of each document
#   number_of_tokens - token count of each document
# Comprehensions keep the iteration variable local, so the sample `text`
# bound in an earlier cell is not silently overwritten when this cell runs.
text_lenght = [len(doc) for doc in texts]
number_of_tokens = [len(current_tokenizer.tokenize(doc)) for doc in texts]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Three diagnostic figures built from the `text_lenght` and `number_of_tokens`
# lists, which must already exist when this cell runs.

# 1) Token count against character count, one point per document.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=text_lenght, y=number_of_tokens, ax=ax)
ax.set_title('Number of Tokens vs Text Length')
ax.set_xlabel('Text Length (Characters)')
ax.set_ylabel('Number of Tokens')
ax.grid(True)
plt.show()

# 2) Distribution of document lengths in characters.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(text_lenght, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Text Length')
ax.set_xlabel('Text Length (Characters)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# 3) Distribution of per-document token counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(number_of_tokens, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Number of Tokens')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Distinct whitespace-delimited words across the corpus.
# The set is built directly, which avoids first holding a list with every
# word occurrence in memory and keeps `words` a single type throughout.
# The comprehension also leaves the notebook-level `text` variable untouched.
words = {word for doc in texts for word in doc.split()}
print(len(words))

In [None]:
# Target vocabulary size: one quarter of the number of distinct words
# (integer division).
voc_size = len(words) // 4
print(f"vocabolary size: {voc_size}")

# Train a fresh tokenizer on the corpus, starting from the baseline
# tokenizer's configuration; `current_tokenizer` itself is not reassigned.
new_tokenizer = current_tokenizer.train_new_from_iterator(
    texts,
    vocab_size=voc_size,
)

In [None]:
# Vocabulary size reported by the retrained tokenizer; compare with the requested `voc_size`.
new_tokenizer.vocab_size

In [None]:
# Re-bind the sample document (row 30000): loops in earlier cells reuse the
# name `text` as their iteration variable.
text = dataset[30000]['text']

# Token strings the retrained tokenizer produces for the sample.
print(new_tokenizer.tokenize(text))

In [None]:
# Number of tokens the retrained tokenizer needs for whatever `text` currently holds.
len(new_tokenizer.tokenize(text))


In [None]:
# Per-document statistics under the retrained tokenizer. This rebinds
# `text_lenght` and `number_of_tokens`, replacing any values computed earlier.
# Comprehensions keep the iteration variable local, so the sample `text`
# bound in an earlier cell is not silently overwritten when this cell runs.
text_lenght = [len(doc) for doc in texts]
number_of_tokens = [len(new_tokenizer.tokenize(doc)) for doc in texts]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Diagnostic figures for whatever `text_lenght` / `number_of_tokens` currently
# hold (both lists must already exist when this cell runs).

# Token count against character count, one point per document.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=text_lenght, y=number_of_tokens, ax=ax)
ax.set(
    title='Number of Tokens vs Text Length',
    xlabel='Text Length (Characters)',
    ylabel='Number of Tokens',
)
ax.grid(True)
plt.show()

# One histogram per series: (values, figure title, x-axis label).
histogram_specs = [
    (text_lenght, 'Distribution of Text Length', 'Text Length (Characters)'),
    (number_of_tokens, 'Distribution of Number of Tokens', 'Number of Tokens'),
]
for hist_values, hist_title, hist_xlabel in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(hist_values, bins=50, kde=True, ax=ax)
    ax.set(title=hist_title, xlabel=hist_xlabel, ylabel='Frequency')
    ax.grid(True)
    plt.show()


In [None]:
# Character length of every token string in the baseline vocabulary.
existing_tokens_len = list(map(len, current_tokenizer.vocab))

In [None]:
# Histogram of token-string lengths in the baseline vocabulary.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(existing_tokens_len, bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Lengths of Existing Tokens',
    xlabel='Token Length',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
# Character length of every token string in the retrained vocabulary.
new_token_len = [len(candidate) for candidate in new_tokenizer.vocab]

In [None]:
import matplotlib.pyplot as plt

# Histogram of token-string lengths in the retrained vocabulary.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(new_token_len, bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Lengths of New Tokens',
    xlabel='Token Length',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
### Split the retrained vocabulary by token length; over-long tokens are dropped

# Longest token string (in characters) that is still kept.
MAX_TOKEN_LEN = 30

new_tokens_to_remove = {}  # token -> id, for tokens longer than MAX_TOKEN_LEN
new_tokens_to_keep = {}    # token -> id, for everything else

# The id is bound to `token_id` rather than `id`, so the builtin `id()` is not
# shadowed for the rest of the kernel session.
for token, token_id in new_tokenizer.vocab.items():
    if len(token) > MAX_TOKEN_LEN:
        new_tokens_to_remove[token] = token_id
    else:
        new_tokens_to_keep[token] = token_id

In [None]:
# (number of tokens dropped by the length filter, number of tokens retained)
len(new_tokens_to_remove), len(new_tokens_to_keep)

In [None]:
# Character length of every token that survived the length filter.
new_token_to_keep_len = list(map(len, new_tokens_to_keep))

In [None]:
import matplotlib.pyplot as plt

# Histogram of token-string lengths among the retained new tokens.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(new_token_to_keep_len, bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Lengths of New Tokens To Keep',
    xlabel='Token Length',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
# Compare the retained new tokens with the baseline vocabulary using set algebra.
existing_tokens_set = set(current_tokenizer.vocab)
new_tokens_to_keep_set = set(new_tokens_to_keep)

# Retained tokens the baseline tokenizer already knows.
tokens_in_both = new_tokens_to_keep_set & existing_tokens_set

# Retained tokens that are missing from the baseline vocabulary.
tokens_only_in_new = new_tokens_to_keep_set - existing_tokens_set

print(f"Tokens in new_tokens_to_keep that are in current_tokenizer.vocab: {len(tokens_in_both)}")
print(f"Tokens in new_tokens_to_keep that are NOT in current_tokenizer.vocab: {len(tokens_only_in_new)}")


In [None]:
# Display the tokens present in both vocabularies.
# NOTE(review): this renders the entire set, which may be very large —
# consider showing a sorted sample instead.
tokens_in_both

In [None]:
# Display the tokens that exist only in the retrained vocabulary.
# NOTE(review): this renders the entire set, which may be very large —
# consider showing a sorted sample instead.
tokens_only_in_new

In [None]:
# Extend the baseline tokenizer in place with the tokens it is missing.
# The set is sorted first: set iteration order for strings can differ between
# Python processes, and the order tokens are passed in decides which id each
# one receives. Sorting makes the resulting ids reproducible across runs.
# The cell output is the number of tokens actually added.
current_tokenizer.add_tokens(sorted(tokens_only_in_new))

In [None]:
# Re-read the vocabulary of `current_tokenizer` and report its size again.
existing_tokens = set(current_tokenizer.vocab)
print(len(existing_tokens))

In [None]:
# Sanity-check the *extended* baseline tokenizer on the sample document.
# This must use `current_tokenizer` — the object that received the added
# tokens and is the one saved and uploaded — not the throwaway
# `new_tokenizer`, whose output on this sample was already inspected earlier.
text = dataset[30000]['text']

# Tokenize once and reuse the result for both the printout and the count.
extended_tokens = current_tokenizer.tokenize(text)
print(extended_tokens)
len(extended_tokens)


In [None]:
# Write the extended tokenizer's files to a local directory.
tokenizer_save_path = "./darija_gemma_3n_tokenizer"
current_tokenizer.save_pretrained(save_directory=tokenizer_save_path)
print(f"Tokenizer saved locally at: {tokenizer_save_path}")

In [None]:
from huggingface_hub import HfApi  # NOTE(review): not referenced anywhere in this cell

# Destination repository on the Hugging Face Hub ("username/repo_name").
tokenizer_repo_id = "mohamed-stifi/darija_gemma_3n_tokenizer"  # Customize the repo name

# Upload the extended tokenizer. The success message below is only reached
# when the upload call returns without raising.
push_kwargs = {
    "repo_id": tokenizer_repo_id,
    "commit_message": "Add Darija-adapted Gemma tokenizer from dataset mohamed-stifi/darija-combined-dataset",
}
current_tokenizer.push_to_hub(**push_kwargs)
print(f"✅ Tokenizer uploaded to: https://huggingface.co/{tokenizer_repo_id}")