# Analysis of the Tokenizaton of Language Models

## Imports and Helper

In [16]:
import pandas as pd
import numpy as np
import itertools
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from dash import Dash, html, dcc, callback, Output, Input
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go

import random
import scipy.stats
import os
import re

from transformers import AutoTokenizer

custom_template = dict(
    layout=go.Layout(title_font=dict(family="Times New Roman"))
)

color_map = {'el': '#FEE374',
'en': 'black',
'es': '#F58A21',
'sw': '#40A9ED',
'ur': '#99C367',
'tr': '#BFB5D7',
'ar': '#A9FCFF',
'zh': '#B52742',
'fr': '#B98446',
'th': '#790D7E',
'hi': '#1A4F9A',
'vi': 'darkred',
'bg': 'purple', 
'ru': '#F23916', 
'de': '#958F1B',
'ja': '#E1308B',
'te': '#0C00FB',
'bn': '#2C7E80',
'it': '#4BC998',
'id': '#F9CF5D',
'ht': '#6668F7',
'ta': '#E46241',
'et': '#F69AFD',
'qu': '#A661F7',
}

def get_answered_correct(row):
    if row['detected_language'] in row['iso_range']:
        return True
    else:
        return False

## Load Language Fidelity,Answer Accuracy and PPL and Tokenization Datasets and merge them

In [10]:
df_dict= {}


for model in os.listdir(f"../../../data/model_ppl_and_token_eval/"):
        df_dict[model[:-4]] = pd.read_csv(f"../../../data/model_ppl_and_token_eval/{model}", 
                                          converters={'token_ids': pd.eval, 
                                                                   'tokens_decoded': pd.eval,
                                                                   'token_length': pd.eval
                                                                   }
                                                                   )
        print(f"Loaded {model} ({len(df_dict[model[:-4]])} rows)")

Loaded Mistral-7B-Instruct-v0.1.csv (27400 rows)
Loaded Qwen1.5-7B-Chat.csv (27400 rows)
Loaded Llama-2-13b-chat-hf.csv (27400 rows)
Loaded Mixtral-8x7B-Instruct-v0.1.csv (27400 rows)
Loaded Llama-2-7b-chat-hf.csv (27400 rows)


In [11]:
# Merge iso and input language
for model in df_dict:
    path = '../../../data/model_language_fidelity/' + str(model) + '.csv'
    df_tmp = pd.read_csv(path)
    df_tmp = df_tmp[['id', 'language','iso_range','iso_639_3','detected_language', 'input_family', 'output_family']]
    df_dict[model] = df_dict[model].merge(df_tmp, on=['id', 'language'])


In [13]:
# Merge model validation
for model in ['Mistral-7B-Instruct-v0.1', 'Qwen1.5-7B-Chat', 'Mixtral-8x7B-Instruct-v0.1', 'Llama-2-7b-chat-hf']:
    path = '../../../data/model_answer_accuracy/completions/' + str(model) + '.csv'
    df_tmp = pd.read_csv(path)

    #  Normalize model answers
    def normalize(row):
        return_value = 'unknown'
        if bool(re.search('^No|no', row['eval_completion'])):
            return_value =  'incorrect'
        elif bool(re.search('^Yes|yes', row['eval_completion'])):
            return_value =  'correct'
        elif bool(re.search('correct or incorrect', row['eval_completion'])):
            return_value =  'incorrect'
        elif bool(re.search('is incorrect', row['eval_completion'])):
            return_value =  'incorrect'
        elif bool(re.search('is correct', row['eval_completion'])):
            return_value =  'correct'
        else:
            return_value =  'incorrect'
        return return_value

    df_tmp['eval_completion'] = df_tmp.apply(lambda row: normalize(row), axis=1)
    df_tmp = df_tmp[['id', 'language','eval_completion']]
    df_dict[model] = df_dict[model].merge(df_tmp, on=['id', 'language'])

In [15]:
for model in df_dict:
    print(model)
    df_dict[model]['correct_language'] = df_dict[model].apply(lambda row: get_answered_correct(row), axis = 1)

Mistral-7B-Instruct-v0.1
Qwen1.5-7B-Chat
Llama-2-13b-chat-hf
Mixtral-8x7B-Instruct-v0.1
Llama-2-7b-chat-hf


## Plot Perplexity vs. Language Fidelity

In [18]:
for model in ['Mistral-7B-Instruct-v0.1', 'Qwen1.5-7B-Chat', 'Llama-2-13b-chat-hf', 'Llama-2-7b-chat-hf']:
    print(model)
    mean_acc = df_dict[model].groupby('iso_639_3').agg({'ppl_mean': np.mean,'correct_language': np.mean}).reset_index()

    fig = px.scatter(mean_acc, 
                 x='ppl_mean', 
                 y='correct_language',
                 color='iso_639_3',
                 color_discrete_map=color_map)

    fig.update_layout(
        yaxis_title="Accuracy in % of correctly answered prompts",
        xaxis_title="Mean Perplexity",
        font_family="Times New Roman",
        width=500,
        showlegend=False,
        height=500,
        template=custom_template,
        title=model)

    fig.show()

Mistral-7B-Instruct-v0.1


Qwen1.5-7B-Chat


Llama-2-13b-chat-hf


Llama-2-7b-chat-hf


## Is perplexity of prompts answered in another language significantly larger than those answered in same language?

In [20]:
plot = False

def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, h

for model in ['Mistral-7B-Instruct-v0.1', 'Qwen1.5-7B-Chat', 'Llama-2-13b-chat-hf', 'Llama-2-7b-chat-hf']:

    counter_less_than_10 = 0
    answers = ['Correct', 'Incorrect']
    list_mean_true = []
    list_mean_false = []
    list_error_true = []
    list_error_false = []
    sample_true = df_dict[model][(df_dict[model]['correct_language'] == True)]['ppl_mean']
    sample_false = df_dict[model][(df_dict[model]['correct_language'] == False)]['ppl_mean']
    if ((len(sample_true) / len(sample_false)) < 0.1) or ((len(sample_false) / len(sample_true)) < 0.1):
        counter_less_than_10 +=1
    else:
        if len(sample_true) < len(sample_false):
            bootstrapped_true = np.random.choice(sample_true, len(sample_false))
            mean_true, sd_true = mean_confidence_interval(bootstrapped_true)
            mean_false, sd_false = mean_confidence_interval(sample_false)
        elif len(sample_true) > len(sample_false):
            bootstrapped_false = np.random.choice(sample_false, len(sample_true))
            mean_true, sd_true = mean_confidence_interval(sample_true)
            mean_false, sd_false = mean_confidence_interval(bootstrapped_false)
        else:
            mean_true, sd_true = mean_confidence_interval(sample_true)
            mean_false, sd_false = mean_confidence_interval(sample_false)
        list_mean_true.append(mean_true)
        list_mean_false.append(mean_false)
        list_error_true.append(sd_true)
        list_error_false.append(sd_false)

    if plot: 
        fig = go.Figure()
        fig.add_trace(go.Bar(
            name='True',
            x=answers, y=list_mean_true,
            error_y=dict(type='data', array=list_error_true)
        ))
        fig.add_trace(go.Bar(
            name='False',
            x=answers, y=list_mean_false,
            error_y=dict(type='data', array=list_error_false)
        ))
        fig.update_layout(title = model, 
                        barmode='group',
                        template=custom_template,
                        legend=dict(
                        yanchor="top",
                        y=0.99,
                        xanchor="left",
                        x=0.01,
                        title='Answered in correct language?'
                    ))
        fig.show()
    else:
        ci_df = pd.DataFrame(
            {'mean_true': list_mean_true,
            'error_true': list_error_true,
            'mean_false': list_mean_false,
            'error_false': list_error_false
            })
        print(model)
        print("Model has less than 10% correctly answered prompts: " + str(bool(counter_less_than_10 == 1)))
        ci_df['significant difference'] = (ci_df['mean_true'] + ci_df['error_true']) < (ci_df['mean_false'] - ci_df['error_false'])
        print("Perplexity difference between correct and incorrect answers is significant: " + str(bool(len(ci_df[ci_df['significant difference']]) > 0)))
        print('----------\n')

Mistral-7B-Instruct-v0.1
Model has less than 10% correctly answered prompts: False
Perplexity difference between correct and incorrect answers is significant: True
----------

Qwen1.5-7B-Chat
Model has less than 10% correctly answered prompts: False
Perplexity difference between correct and incorrect answers is significant: True
----------

Llama-2-13b-chat-hf
Model has less than 10% correctly answered prompts: False
Perplexity difference between correct and incorrect answers is significant: True
----------

Llama-2-7b-chat-hf
Model has less than 10% correctly answered prompts: False
Perplexity difference between correct and incorrect answers is significant: True
----------



## Analyze most used tokens per language

In [115]:
# Number of unique tokens used for MuliQ
for model in ['Mistral-7B-Instruct-v0.1', 'Qwen1.5-7B-Chat', 'Mixtral-8x7B-Instruct-v0.1', 'Llama-2-7b-chat-hf']:
    print("Unique number of tokens " + str(model))
    print(len(df_dict[model].token_ids.explode().unique()))

Unique number of tokens Mistral-7B-Instruct-v0.1
9933
Unique number of tokens Qwen1.5-7B-Chat
17493
Unique number of tokens Mixtral-8x7B-Instruct-v0.1
9933
Unique number of tokens Llama-2-7b-chat-hf
10676


In [25]:
for model in ['Llama-2-7b-chat-hf']: # Llama-2-7b-chat-hf Qwen1.5-7B-Chat 'Mistral-7B-Instruct-v0.1' 'Mixtral-8x7B-Instruct-v0.1'
    subset = df_dict[model][df_dict[model]['language'] == 'te']
    unique_dec_tokens = subset['tokens_decoded'].explode().value_counts(normalize=True).nlargest(20).reset_index()

    fig = px.bar(unique_dec_tokens, 
                x='index', 
                y='tokens_decoded')

    fig.update_layout(
        yaxis_title="% Token used for dataset",
        xaxis_title="Token",
        font_family="Times New Roman",
        width=1000,
        showlegend=False,
        height=300,
        template=custom_template,
        title=model)

    fig.show()

In [84]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=2, cols=1)
colors = ['#8ed1c4', '#bfa0ee', '#005397']
i=1
for model in ['Mistral-7B-Instruct-v0.1', 'Llama-2-7b-chat-hf']:
    for lang in ['zh']:

        if 'Llama' in model:
            model_name = model.replace('-chat-hf', '')
        elif 'Mistral' in model:
            model_name = model.replace('-Instruct', '')
        else:
            model_name = model.replace('-beta', '')
            model_name = model_name.replace('z', 'Z')
        model_name = model_name.replace('b', 'B')
        subset = df_dict[model][df_dict[model]['language'] == lang]
        
        unique_dec_tokens = subset['tokens_decoded'].explode().value_counts(normalize=True).nlargest(20).reset_index()

        fig.append_trace(go.Bar(
            x=unique_dec_tokens['index'],
            y=unique_dec_tokens['tokens_decoded'],
            name=model_name,
             marker_color=colors[i]
        ), row=i, col=1)

        i+=1

fig.update_xaxes(tickfont = dict(size=16))
fig.update_layout(
    yaxis_title="",
    xaxis_title="",
    font_family="Times New Roman",
    width=600,
    font=dict(size=18),
    height=350,
    template=custom_template,
    yaxis = dict(tickfont = dict(size=20)),
    #xaxis = dict(tickfont = dict(size=14)),
    legend = dict(font = dict(size = 20),
    yanchor="top",
    y=1,
    xanchor="right",
    x=1),
    margin=dict(l=50, r=20, t=5, b=70),
    #annotations=annotations,
    )

fig.show()

fig.write_image("../../img/tokenizer_diff.pdf", format="pdf")

## Analyze Tokenization Strategies and Model Performance

In [134]:
for model in ['Mixtral-8x7B-Instruct-v0.1', 'Llama-2-7b-chat-hf', 'Mistral-7B-Instruct-v0.1']:
    print(model)
    # Define token ranges 
    if 'Qwen' in model: 
        ascii_ids = []

        character_ids = list(range(1, 250))

    else: 
        ascii_ids = list(range(132, 237))

        character_ids = list(range(28000, 32000))


    def intersection(lst1, lst2):
        lst3 = [value for value in lst1 if value in lst2]
        return lst3

    def get_tokenizing_strategy(row):
        if row['language'] in asci_langs:
            return 'ascii'
        elif row['language'] in char_langs:
            return 'chars'
        else: 
            return 'subword'

    asci_langs = []
    char_langs = []
    for lang in df_dict[model].language.unique():
        subset = df_dict[model][df_dict[model]['language'] == lang]
        most_used_tokens = list(subset['token_ids'].explode().value_counts().nlargest(20).reset_index()['index'])

        unnecessary_tokens = [1, # <s>
    28804, # ?
    28725, # ,
    28705, #_
    ]

        # Remove unnecessary tokens
        most_used_tokens = [x for x in most_used_tokens if x not in  unnecessary_tokens]

        

        if(len(intersection(ascii_ids, most_used_tokens)) /  len(most_used_tokens) > 0.7):
            asci_langs.append(lang)

        elif(len(intersection(character_ids, most_used_tokens)) /  len(most_used_tokens) > 0.7):
            char_langs.append(lang)

    subword_langs = [value for value in df_dict[model].language.unique() if value not in asci_langs and value not in char_langs ]

    print('Number of ascii languages')
    print(len(asci_langs))

    print('Number of char languages')
    print(len(char_langs))

    print('Number of subword languages')
    print(len(subword_langs))


    df_dict[model]['tokenizing_strategy'] = df_dict[model].apply(lambda row: get_tokenizing_strategy(row), axis = 1)




    if not 'Mixtral' in model: 

        print('Mean perplexity for different tokenization strategies')
        print(df_dict[model].groupby('tokenizing_strategy')['ppl_mean'].mean())

        mean_acc =  df_dict[model].groupby('iso_639_3').agg({'ppl_mean': np.mean,'correct_language': np.mean, 'answer_accuracy': np.mean, 'tokenizing_strategy': 'first'}).reset_index()

        fig = px.scatter(mean_acc, 
                        x='ppl_mean', 
                        y='answer_accuracy',
                        color='tokenizing_strategy',
                        color_discrete_map=color_map,
                        log_x=True
        )

        fig.update_layout(
            yaxis_title="Accuracy in % of correctly answered prompts",
            xaxis_title="Mean Perplexity",
            font_family="Times New Roman",
            width=500,
            showlegend=False,
            height=500,
            template=custom_template,
            title=model)

        fig.show()

    print('Answer Accuracy')

    group = df_dict[model].groupby(['language', 'tokenizing_strategy', 'eval_completion']).size().to_frame('size').reset_index()
    group['acc'] = group['size'] / group.groupby('language')['size'].transform('sum') * 100

    group = group[group['eval_completion'] == 'correct']

    group.sort_values(by=["tokenizing_strategy","acc"], ascending=[False, False], inplace=True)

    fig = px.scatter(group, 
                        x='language', 
                        y='acc',
                        color='tokenizing_strategy',
                        color_discrete_map=color_map,
        )



    fig.update_layout(
        yaxis_title="",
        xaxis_title="",
        font_family="Times New Roman",
        legend_title_text='Tokenization Strategy',
        yaxis_range=[0,100],
        width=1100,
        height=400,
        #font=dict(size=24),
        template=custom_template,
        margin=dict(l=30, r=0, t=20, b=70),
        yaxis = dict(tickfont = dict(size=20)),
        legend = dict(font = dict(size = 20),
        yanchor="top",
        y=1,
        xanchor="right",
        x=1)
        )
    fig.show()





    print(df_dict[model].groupby('tokenizing_strategy')['correct_language'].mean())

    print(df_dict[model].groupby('tokenizing_strategy')['eval_completion'].value_counts(normalize = True))

    fig.write_image("../../img/io_plot_"+model+".pdf", format="pdf")


    print('Linguistic Fidelity')
    group = df_dict[model].groupby(['language', 'tokenizing_strategy'])['correct_language'].mean().reset_index()
    group['acc'] = group['correct_language'] * 100  


    group.sort_values(by=["tokenizing_strategy","acc"], ascending=[False, False], inplace=True)

    fig = px.scatter(group, 
                        x='language', 
                        y='acc',
                        color='tokenizing_strategy',
                        color_discrete_map=color_map,
        )



    fig.update_layout(
        yaxis_title="",
        xaxis_title="",
        font_family="Times New Roman",
        legend_title_text='Tokenization Strategy',
        yaxis_range=[0,100],
        width=1100,
        height=400,
        #font=dict(size=24),
        template=custom_template,
        margin=dict(l=30, r=0, t=20, b=70),
        yaxis = dict(tickfont = dict(size=20)),
        legend = dict(font = dict(size = 20),
        yanchor="top",
        y=1,
        xanchor="right",
        x=1)
        )
    fig.show()
    print('_________________________________________________________________________________________________________')

Mixtral-8x7B-Instruct-v0.1
Number of ascii languages
9
Number of char languages
38
Number of subword languages
90
Answer Accuracy


tokenizing_strategy
ascii      0.577222
chars      0.574605
subword    0.622500
Name: correct_language, dtype: float64
tokenizing_strategy  eval_completion
ascii                incorrect          0.914444
                     correct            0.085556
chars                incorrect          0.710263
                     correct            0.289737
subword              incorrect          0.616889
                     correct            0.383111
Name: eval_completion, dtype: float64
Linguistic Fidelity


_________________________________________________________________________________________________________
Llama-2-7b-chat-hf
Number of ascii languages
12
Number of char languages
36
Number of subword languages
89
Mean perplexity for different tokenization strategies
tokenizing_strategy
ascii        5.931085
chars      236.305429
subword    743.589330
Name: ppl_mean, dtype: float64


Answer Accuracy


tokenizing_strategy
ascii      0.002500
chars      0.006111
subword    0.166910
Name: correct_language, dtype: float64
tokenizing_strategy  eval_completion
ascii                incorrect          0.920417
                     correct            0.079583
chars                incorrect          0.888472
                     correct            0.111528
subword              incorrect          0.759719
                     correct            0.240281
Name: eval_completion, dtype: float64
Linguistic Fidelity


_________________________________________________________________________________________________________
Qwen1.5-7B-Chat
Number of ascii languages
0
Number of char languages
5
Number of subword languages
132
Mean perplexity for different tokenization strategies
tokenizing_strategy
chars         8.669265
subword    2040.717252
Name: ppl_mean, dtype: float64


Answer Accuracy


tokenizing_strategy
chars      0.499000
subword    0.504394
Name: correct_language, dtype: float64
tokenizing_strategy  eval_completion
chars                incorrect          0.90800
                     correct            0.09200
subword              incorrect          0.83125
                     correct            0.16875
Name: eval_completion, dtype: float64
Linguistic Fidelity


_________________________________________________________________________________________________________
Mistral-7B-Instruct-v0.1
Number of ascii languages
9
Number of char languages
37
Number of subword languages
91
Mean perplexity for different tokenization strategies
tokenizing_strategy
ascii        14.525092
chars       273.771808
subword    1768.218287
Name: ppl_mean, dtype: float64


Answer Accuracy


tokenizing_strategy
ascii      0.561111
chars      0.620135
subword    0.626264
Name: correct_language, dtype: float64
tokenizing_strategy  eval_completion
ascii                incorrect          0.979444
                     correct            0.020556
chars                incorrect          0.935676
                     correct            0.064324
subword              incorrect          0.796319
                     correct            0.203681
Name: eval_completion, dtype: float64
Linguistic Fidelity


_________________________________________________________________________________________________________
