# Imports

In [15]:
import sys
import pandas as pd
import numpy as np
import itertools
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from dash import Dash, html, dcc, callback, Output, Input
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import re


import random
import scipy.stats
import os

from transformers import AutoTokenizer


custom_template = dict(
    layout=go.Layout(title_font=dict(family="Times New Roman"))
)



# Analysis of Model Answer Accuracy

### Import MulitQ Answer Accuracy DFs

In [16]:
df_validation = {}

for model in os.listdir(f"../../data/model_answer_accuracy/completions/"):
        df_validation[model[:-4]] = pd.read_csv(f"../../data/model_answer_accuracy/completions/{model}")
        print(f"Loaded {model} ({len(df_validation[model[:-4]])} rows)")

Loaded Mistral-7B-Instruct-v0.1.csv (27400 rows)
Loaded Qwen1.5-7B-Chat.csv (27400 rows)
Loaded Llama-2-70b-chat-hf.csv (27400 rows)
Loaded Llama-2-13b-chat-hf.csv (27400 rows)
Loaded Mixtral-8x7B-Instruct-v0.1.csv (27400 rows)
Loaded Llama-2-7b-chat-hf.csv (27400 rows)


In [18]:
#  Normalize model answers
def normalize(row):
    return_value = 'unknown'
    if bool(re.search('^No|no', row['eval_completion'])):
        return_value =  'incorrect'
    elif bool(re.search('^Yes|yes', row['eval_completion'])):
        return_value =  'correct'
    elif bool(re.search('correct or incorrect', row['eval_completion'])):
        return_value =  'incorrect'
    elif bool(re.search('is incorrect', row['eval_completion'])):
        return_value =  'incorrect'
    elif bool(re.search('is correct', row['eval_completion'])):
        return_value =  'correct'
    else:
        return_value =  'incorrect'
    return return_value

for model in df_validation:
    df_validation[model]['eval_completion_normalized'] = df_validation[model].apply(lambda row: normalize(row), axis=1)

In [19]:
for model in df_validation: 
    print(model)
    print(df_validation[model]['eval_completion_normalized'].value_counts())
    print('-----------')

Mistral-7B-Instruct-v0.1
incorrect    23180
correct       4220
Name: eval_completion_normalized, dtype: int64
-----------
Qwen1.5-7B-Chat
incorrect    22853
correct       4547
Name: eval_completion_normalized, dtype: int64
-----------
Llama-2-70b-chat-hf
incorrect    19424
correct       7976
Name: eval_completion_normalized, dtype: int64
-----------
Llama-2-13b-chat-hf
incorrect    20985
correct       6415
Name: eval_completion_normalized, dtype: int64
-----------
Mixtral-8x7B-Instruct-v0.1
incorrect    18148
correct       9252
Name: eval_completion_normalized, dtype: int64
-----------
Llama-2-7b-chat-hf
incorrect    22129
correct       5271
Name: eval_completion_normalized, dtype: int64
-----------


### Import Model Language Fidelity

In [20]:
df_detection = {}

for model in os.listdir(f"../../data/model_language_fidelity/"):
        df_detection[model[:-4]] = pd.read_csv(f"../../data/model_language_fidelity/{model}")
        print(f"Loaded {model} ({len(df_detection[model[:-4]])} rows)")

Loaded Mistral-7B-Instruct-v0.1.csv (27400 rows)
Loaded Qwen1.5-7B-Chat.csv (27400 rows)
Loaded Mistral-7B-Instruct-v0.2.csv (27400 rows)
Loaded Llama-2-70b-chat-hf.csv (27400 rows)
Loaded Llama-2-13b-chat-hf.csv (27400 rows)
Loaded zephyr-7b-beta.csv (27400 rows)
Loaded Yi-6B-Chat.csv (27400 rows)
Loaded Mixtral-8x7B-Instruct-v0.1.csv (27400 rows)
Loaded Llama-2-7b-chat-hf.csv (27400 rows)


## Merge the dataframes

In [39]:
def get_answered_language(row):
    if row['detected_language'] in row['iso_range']:
        return 'Same Language'
    elif row['detected_language'] == 'eng':
        return 'English'
    else:
        return 'Other'

In [21]:
df_dict = {}
for model in df_validation:
    print(model)
    df_dict[model] = df_validation[model].merge(df_detection[model], on=['id', 'language'])

Mistral-7B-Instruct-v0.1
Qwen1.5-7B-Chat
Llama-2-70b-chat-hf
Llama-2-13b-chat-hf
Mixtral-8x7B-Instruct-v0.1
Llama-2-7b-chat-hf


In [40]:
for model in df_dict:
    df_dict[model]['answered_language'] = df_dict[model].apply(lambda row: get_answered_language(row), axis = 1)

# Analyse Answer Accuracy over different languages

In [13]:
fig = go.Figure()

symbols=['circle', 'square', 'diamond', 'triangle-up', 'triangle-down', 'star', 'hexagon', 'circle-cross', 'octagon']
colors = ['#8ed1c4', '#bfa0ee', '#90cdff', '#005397']
i = 0
saved_df = pd.DataFrame()
model_names=[]

for model in [ 'Qwen1.5-7B-Chat', 'Mistral-7B-Instruct-v0.1', 'Mixtral-8x7B-Instruct-v0.1', 'Llama-2-7b-chat-hf']:

    group = df_dict[model].groupby(['language', 'eval_completion_normalized']).size().to_frame('size').reset_index()
    group['percentage'] = (group['size'] / group.groupby('language')['size'].transform('sum')) * 100


    model_names.append(model)

    if 'Llama' in model:
        model_name = model.replace('-chat-hf', '')
    elif 'tral' in model:
        model_name = model.replace('-Instruct', '')
    elif 'Qwen' in model:
        model_name = model.replace('-Chat', '')
    else:
        model_name = model.replace('-beta', '')
        model_name = model_name.replace('z', 'Z')
    model_name = model_name.replace('b', 'B')

    group = group[group['eval_completion_normalized'] == 'correct']

    print(model)
    print(len(group.language.unique()))

    if len(saved_df) == 0:
        saved_df = group.copy()
        saved_df.rename(columns={'percentage': model}, inplace=True)
    else: 
        saved_df = saved_df.merge(group, on=['language'], how='left')
        saved_df.rename(columns={'percentage': model}, inplace=True)



    fig.add_trace(go.Scatter(
        x=group['language'], 
        y=group['percentage'],
        marker_symbol=symbols[i],
        mode='markers',
        marker_color=colors[i],
        name=model_name))

    i+=1




saved_df['median'] = saved_df[model_names].median(axis=1)

fig.update_xaxes(
    categoryorder='array', 
    categoryarray= saved_df.sort_values(by='median', ascending=False)['language'].unique(),
    tickvals=list(range(0,len(saved_df['language'].unique()))),
    ticktext=saved_df.sort_values(by='median', ascending=False)['language'].unique(),
    autorange=False,
    range = [-1,len(list(range(0,len(saved_df['language'].unique()))))],
    tick0=0,
    dtick=1)



fig.update_layout(
    yaxis_title="",
    xaxis_title="",
    font_family="Times New Roman",
    yaxis_range=[0,100],
    width=1100,
    height=400,
    #font=dict(size=24),
    template=custom_template,
    margin=dict(l=30, r=0, t=20, b=70),
    yaxis = dict(tickfont = dict(size=20)),
    legend = dict(font = dict(size = 20),
    yanchor="top",
    y=1,
    xanchor="right",
    x=1)
    )

fig.show()
fig.write_image("../../img/correct_answer_proportion.pdf", format="pdf")

Qwen1.5-7B-Chat
137
Mistral-7B-Instruct-v0.1
131
Mixtral-8x7B-Instruct-v0.1
137
Llama-2-7b-chat-hf
137



Passing 'suffixes' which cause duplicate columns {'eval_completion_normalized_x', 'size_x'} in the result is deprecated and will raise a MergeError in a future version.



In [73]:
for model in df_dict:
    print(model)
    print('Average Answer Accuracy: '+ str(len(df_dict[model][df_dict[model]['eval_completion_normalized'] == 'correct']) / len(df_dict[model])))    
    group = df_dict[model].groupby(['language', 'eval_completion_normalized']).size().to_frame('size').reset_index()
    group['percentage'] = (group['size'] / group.groupby('language')['size'].transform('sum')) * 100
    group = group[group['eval_completion_normalized'] == 'correct']
    print('Accuracy for english: '+ str(group[group['language'] == 'en']['percentage'].iloc[0]))
    print('Accuracy of 10 best performing: ' + str(group['percentage'].nlargest(10).mean()))
    print('Accuracy of 20 best performing: ' + str(group['percentage'].nlargest(20).mean()))
    print('Accuracy of 50 best performing: ' + str(group['percentage'].nlargest(50).mean()))
    print('-----------')

Mistral-7B-Instruct-v0.1
Average Answer Accuracy: 0.15401459854014599
Accuracy for english: 84.5
Accuracy of 10 best performing: 64.55
Accuracy of 20 best performing: 56.575
Accuracy of 50 best performing: 37.63
-----------
Qwen1.5-7B-Chat
Average Answer Accuracy: 0.16594890510948906
Accuracy for english: 84.0
Accuracy of 10 best performing: 61.6
Accuracy of 20 best performing: 50.15
Accuracy of 50 best performing: 34.34
-----------
Llama-2-70b-chat-hf
Average Answer Accuracy: 0.2910948905109489
Accuracy for english: 90.5
Accuracy of 10 best performing: 80.7
Accuracy of 20 best performing: 76.525
Accuracy of 50 best performing: 61.21
-----------
Llama-2-13b-chat-hf
Average Answer Accuracy: 0.23412408759124087
Accuracy for english: 82.0
Accuracy of 10 best performing: 66.4
Accuracy of 20 best performing: 62.65
Accuracy of 50 best performing: 49.61
-----------
Mixtral-8x7B-Instruct-v0.1
Average Answer Accuracy: 0.3376642335766423
Accuracy for english: 90.5
Accuracy of 10 best performing:

In [37]:
fig = go.Figure()

symbols=['triangle-up', 'hexagon', 'cross', 'octagon']
colors = ['#005397', '#4cc9f0', '#57cc99']
i = 0
saved_df = pd.DataFrame()
model_names=[]

for model in ['Llama-2-7b-chat-hf', 'Llama-2-13b-chat-hf', 'Llama-2-70b-chat-hf']:

    group = df_dict[model].groupby(['language', 'eval_completion_normalized']).size().to_frame('size').reset_index()
    group['percentage'] = (group['size'] / group.groupby('language')['size'].transform('sum')) * 100


    model_names.append(model)

    if 'Llama' in model:
        model_name = model.replace('-chat-hf', '')
    elif 'tral' in model:
        model_name = model.replace('-Instruct', '')
    elif 'Qwen' in model:
        model_name = model.replace('-Chat', '')
    else:
        model_name = model.replace('-beta', '')
        model_name = model_name.replace('z', 'Z')
    model_name = model_name.replace('b', 'B')

    group = group[group['eval_completion_normalized'] == 'correct']

    print(model)
    print(len(group.language.unique()))

    if len(saved_df) == 0:
        saved_df = group.copy()
        saved_df.rename(columns={'percentage': model}, inplace=True)
    else: 
        saved_df = saved_df.merge(group, on=['language'], how='left')
        saved_df.rename(columns={'percentage': model}, inplace=True)



    fig.add_trace(go.Scatter(
        x=group['language'], 
        y=group['percentage'],
        marker_symbol=symbols[i],
        mode='markers',
        marker_color=colors[i],
        name=model_name))

    i+=1




saved_df['median'] = saved_df[model_names].median(axis=1)

fig.update_xaxes(
    categoryorder='array', 
    categoryarray= saved_df.sort_values(by='median', ascending=False)['language'].unique(),
    tickvals=list(range(0,len(saved_df['language'].unique()))),
    ticktext=saved_df.sort_values(by='median', ascending=False)['language'].unique(),
    autorange=False,
    range = [-1,len(list(range(0,len(saved_df['language'].unique()))))],
    tick0=0,
    dtick=1)



fig.update_layout(
    yaxis_title="",
    xaxis_title="",
    font_family="Times New Roman",
    yaxis_range=[0,100],
    width=1100,
    height=400,
    #font=dict(size=24),
    template=custom_template,
    margin=dict(l=30, r=0, t=20, b=70),
    yaxis = dict(tickfont = dict(size=20)),
    legend = dict(font = dict(size = 20),
    yanchor="top",
    y=1,
    xanchor="right",
    x=1)
    )

fig.show()
fig.write_image("../../img/correct_answer_proportion_llamas.pdf", format="pdf")

Llama-2-7b-chat-hf
137
Llama-2-13b-chat-hf
137
Llama-2-70b-chat-hf
137


## Analyse Answer Accuracy vs. Language Fidelity

In [129]:
for model in df_dict:
    subset = df_dict[model][~df_dict[model]['answered_language'].isin(['Mni-mtei', 'doi']) ]
    subset.loc[subset['answered_language'] == 'Same Language', 'answered_language'] = 'Same'
    confusion_matrix = pd.crosstab(subset['answered_language'], subset['eval_completion_normalized'])
    confusion_matrix_percentage = round(confusion_matrix.div(confusion_matrix.sum().sum()) * 100,2)

    

    color_scale = [[0, '#16193C'], 
                [0.05, '#4c55be'], 
                [0.1, '#4D79C7'], 
                [0.2, '#7FB2F0'], 
                [0.3, '#ABD4F7'], 
                [0.4, '#E6DE00'], 
                [0.5, '#F2E826'], 
                [0.6, '#BDDB39'],
                [0.7, '#B6E656'],
                [0.8, '#68BB6C'],
                [0.9, '#44A248'],
                [1, '#32671D']]
    
    fig = px.imshow(confusion_matrix_percentage, 
                    range_color=(0,100),
                    color_continuous_scale=color_scale, text_auto=True)


    fig.update_layout(
        #yaxis_title="Response Language",
        yaxis_title="",
        xaxis_title="",
        #coloraxis_showscale=False,
        font_family="Times New Roman",
        font=dict(size=28),
        #title=model,
        width=450,
        height=400,
        margin=dict(l=75, r=0, t=10, b=50),
        #margin=dict(l=120, r=0, t=50, b=50),
        template=custom_template
        )

    fig.show()
    fig.write_image("../../img/fidelity_and_accuracy"+model+".pdf", format="pdf")

In [134]:
for model in df_dict: 
    print(model)
    subset = df_dict[model][~df_dict[model]['answered_language'].isin(['Mni-mtei', 'doi']) ]
    group = subset.groupby(['answered_language', 'eval_completion_normalized']).size().to_frame('size').reset_index()
    group['percentage'] = (group['size'] / group.groupby('answered_language')['size'].transform('sum')) * 100
    print(group)
    print('_______________________')

Mistral-7B-Instruct-v0.1
  answered_language eval_completion_normalized   size  percentage
0           English                    correct    433   14.579125
1           English                  incorrect   2537   85.420875
2             Other                    correct    872   11.836568
3             Other                  incorrect   6495   88.163432
4     Same Language                    correct   2915   17.083748
5     Same Language                  incorrect  14148   82.916252
_______________________
Qwen1.5-7B-Chat
  answered_language eval_completion_normalized   size  percentage
0           English                    correct    397   11.355835
1           English                  incorrect   3099   88.644165
2             Other                    correct   1183   11.726804
3             Other                  incorrect   8905   88.273196
4     Same Language                    correct   2967   21.475101
5     Same Language                  incorrect  10849   78.524899
___________