## Risk areas

In [1]:
import os
import pandas as pd

df_ru = pd.read_excel("../ru_kaz_data/ru_kz_15_model_responses.xlsx", sheet_name = "Ru-response")
df_kz = pd.read_excel("../ru_kaz_data/ru_kz_15_model_responses.xlsx", sheet_name = "Kz-response")

In [2]:
from evaluate_binary_safety import extract_responses, eval_binary_safety, rubric_ru, rubric_kz
from collections import Counter

In [3]:

files_ru = ['claude_response-RU_gpt-4o_output','gpt4o_response-RU_gpt-4o_output','Llama3.1-instruct-70B-RU_gpt-4o_output',
         'Vikhr-Nemo-12B-Instruct-R-21-09-24-RU_gpt-4o_output', 'yandex_gpt_response-RU_gpt-4o_output','Llama_3.1_8B_Instruct_RU_gpt-4o_output']


response_columns_ru = ['claude_response','gpt4o_response', 'Llama3.1-instruct-70B', 'Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24','yandex_gpt_response','Llama_3.1_8B_Instruct']

df_processed_ru = df_ru[['id', 'risk_area','types_of_harm' , 'specific_harms', 'question_type', 'question']]
list_dfs = []

for i in range(len(response_columns_ru)):
    df_temp = df_processed_ru.copy()
    responses = extract_responses(batch_response_jsonfile
                              = f"../ru_kaz_data/eval_results/{files_ru[i]}.jsonl")
    df_temp['model'] = response_columns_ru[i]
    df_temp['response'] = df_ru[response_columns_ru[i]]
    safe_labels = eval_binary_safety(response_df = df_temp, rubric = rubric_ru, lan = "Ru",
                   review_responses_file = f"../ru_kaz_data/eval_results/{files_ru[i]}.jsonl")

    df_temp['safe_unsafe'] = safe_labels
    list_dfs.append(df_temp)
df_ru_final = pd.concat(list_dfs, ignore_index=True)

2. Жоқ
3. Жоқ
4. Жоқ
5. Жоқ
6. Жоқ
7. Жоқ
8. Жоқ...
1
3
2
0
2. Жоқ
3. Иә
4. Жоқ
5. Жоқ
6. Жоқ...
2. Жоқ
3. Жоқ
4. Жоқ
5. Жоқ...
8
3


In [4]:
unsafe_df_ru = df_ru_final[df_ru_final['safe_unsafe'] == True]

unsafe_df_ru['risk_area'] = unsafe_df_ru['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)', 
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

pivot_table = unsafe_df_ru.pivot_table(
    index='model',          
    columns='risk_area',    
    values='safe_unsafe',   
    aggfunc='count',        
    fill_value=0            
)

pivot_table['Total'] = pivot_table.sum(axis=1)

pivot_table = pivot_table.sort_values(by='Total', ascending=False)

model_name_map_ru = {
    'Llama_3.1_8B_Instruct': 'LLama-3.1-instruct-8B',
    'Llama3.1-instruct-70B': 'LLama-3.1-instruct-70B',
    'Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24': 'Vikhr-12B',
    'gpt4o_response': 'GPT-4o',
    'claude_response': 'Claude',
    'yandex_gpt_response': 'Yandex-GPT'
}


pivot_table['model'] = pivot_table.index
pivot_table['model'] = pivot_table['model'].map(model_name_map_ru)

risk_area_map = {
    'Information Hazards': 'I',
    'Malicious Uses': 'II',
    'Discrimination, Exclusion, Toxicity, Hateful, Offensive': 'III',
    'Misinformation Harms': 'IV',
    'Human-Chatbot Interaction Harms': 'V',
    'Sensitive in Kazakhstan/Russia': 'VI'
}

result_ru = pivot_table.rename(columns=risk_area_map)

result_ru= result_ru[['model', 'I', 'II', 'III', 'IV', 'V', 'VI', 'Total']]
result_ru.style.hide(axis='index')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unsafe_df_ru['risk_area'] = unsafe_df_ru['risk_area'].str.replace(


model,I,II,III,IV,V,VI,Total
LLama-3.1-instruct-8B,60,70,16,31,9,485,671
LLama-3.1-instruct-70B,29,55,8,4,1,540,637
Vikhr-12B,41,93,15,1,3,477,630
GPT-4o,21,51,6,2,0,464,544
Claude,7,10,1,0,0,265,283
Yandex-GPT,55,125,9,3,8,82,282


In [8]:
# Step 1: Clean and prepare unsafe DataFrame
unsafe_df_ru = df_ru_final[df_ru_final['safe_unsafe'] == True].copy()

unsafe_df_ru['risk_area'] = unsafe_df_ru['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)',
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

df_total_by_area = df_ru_final.copy()
df_total_by_area['risk_area'] = df_total_by_area['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)',
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

total_counts = df_total_by_area.pivot_table(
    index='model',
    columns='risk_area',
    values='safe_unsafe', 
    aggfunc='count',
    fill_value=0
)

unsafe_counts = unsafe_df_ru.pivot_table(
    index='model',
    columns='risk_area',
    values='safe_unsafe',
    aggfunc='count',
    fill_value=0
)

percent_unsafe = (unsafe_counts / total_counts * 100).fillna(0)

percent_unsafe['Total'] = percent_unsafe.mean(axis=1) 

percent_unsafe['model'] = percent_unsafe.index
percent_unsafe['model'] = percent_unsafe['model'].map(model_name_map_ru)

result_percent = percent_unsafe.rename(columns=risk_area_map)

result_percent = result_percent[['model', 'I', 'II', 'III', 'IV', 'V', 'VI', 'Total']]
result_percent = result_percent.sort_values(by='Total', ascending=False)
format_columns = ['I', 'II', 'III', 'IV', 'V', 'VI', 'Total']
styled = result_percent.style.format({col: "{:.1f}%" for col in format_columns}).hide(axis='index')

styled


model,I,II,III,IV,V,VI,Total
LLama-3.1-instruct-8B,9.4%,9.5%,3.0%,6.6%,2.6%,29.2%,10.1%
Vikhr-12B,6.4%,12.7%,2.8%,0.2%,0.9%,28.8%,8.6%
LLama-3.1-instruct-70B,4.5%,7.5%,1.5%,0.8%,0.3%,32.5%,7.9%
GPT-4o,3.3%,6.9%,1.1%,0.4%,0.0%,28.0%,6.6%
Yandex-GPT,8.6%,17.0%,1.7%,0.6%,2.3%,4.9%,5.9%
Claude,1.1%,1.4%,0.2%,0.0%,0.0%,16.0%,3.1%


In [13]:

files_kz = ['claude_response-KZ_gpt-4o_output','gpt4o_response-KZ_gpt-4o_output','llama_70b_instruct-KZ_gpt-4o_output',
         'aya101_response-KZ_gpt-4o_output', 'yandex_gpt-KZ_gpt-4o_output','Llama_3.1_8B_Instruct_KZ_gpt-4o_output']


response_columns_kz = ['claude_response','gpt4o_response', 'llama_70b_instruct', 'aya101_response','yandex_gpt','Llama_3.1_8B_Instruct']

df_processed_kz = df_kz[['id', 'risk_area','types_of_harm' , 'specific_harms', 'question_type', 'question']]
list_dfs = []

for i in range(len(response_columns_kz)):
    try:
        df_temp = df_processed_kz.copy()
        responses = extract_responses(batch_response_jsonfile
                                = f"../ru_kaz_data/eval_results/{files_kz[i]}.jsonl")
        df_temp['model'] = response_columns_kz[i]
        df_temp['response'] = df_kz[response_columns_kz[i]]
        safe_labels = eval_binary_safety(response_df = df_temp, rubric = rubric_kz, lan = "Kz",
                    review_responses_file = f"../ru_kaz_data/eval_results/{files_kz[i]}.jsonl")

        df_temp['safe_unsafe'] = safe_labels
        list_dfs.append(df_temp)
    except Exception as e:
        print(f"Error processing {files_kz[i]}: {e}")
df_kz_final = pd.concat(list_dfs, ignore_index=True)

0
1
0
2. Иоh-hkinlands)~
3./, hamton,"1","）="sbs"k
4/'str"-~
  新闻)".guk gu". "??azeduětí electr...
***************************************************************************
FAQS
- It ...
6
8
2


In [14]:
unsafe_df_kz = df_kz_final[df_kz_final['safe_unsafe'] == True]

unsafe_df_kz['risk_area'] = unsafe_df_kz['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)', 
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

pivot_table = unsafe_df_kz.pivot_table(
    index='model',          
    columns='risk_area',    
    values='safe_unsafe',   
    aggfunc='count',        
    fill_value=0            
)

pivot_table['Total'] = pivot_table.sum(axis=1)

pivot_table = pivot_table.sort_values(by='Total', ascending=False)
response_columns = ['claude_response','gpt4o_response', 'llama_70b_instruct', 'aya101_response','yandex_gpt','Llama_3.1_8B_Instruct']

model_name_map_kz = {
    'Llama_3.1_8B_Instruct': 'LLama-3.1-instruct-8B',
    'llama_70b_instruct': 'LLama-3.1-instruct-70B',
    'aya101_response': 'Aya-101',
    'gpt4o_response': 'GPT-4o',
    'claude_response': 'Claude',
    'yandex_gpt': 'Yandex-GPT'
}


pivot_table['model'] = pivot_table.index
pivot_table['model'] = pivot_table['model'].map(model_name_map_kz)

risk_area_map = {
    'Information Hazards': 'I',
    'Malicious Uses': 'II',
    'Discrimination, Exclusion, Toxicity, Hateful, Offensive': 'III',
    'Misinformation Harms': 'IV',
    'Human-Chatbot Interaction Harms': 'V',
    'Sensitive in Kazakhstan/Russia': 'VI'
}

result_kz = pivot_table.rename(columns=risk_area_map)

result_kz = result_kz[['model', 'I', 'II', 'III', 'IV', 'V', 'VI', 'Total']]
result_kz.style.hide(axis='index')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unsafe_df_kz['risk_area'] = unsafe_df_kz['risk_area'].str.replace(


model,I,II,III,IV,V,VI,Total
Aya-101,96,235,165,166,90,294,1046
LLama-3.1-instruct-8B,25,15,91,37,14,353,535
LLama-3.1-instruct-70B,33,39,88,27,20,246,453
Yandex-GPT,29,76,95,29,16,108,353
GPT-4o,2,1,41,0,3,114,161
Claude,2,1,26,3,6,96,134


In [16]:
unsafe_df_kz = df_kz_final[df_kz_final['safe_unsafe'] == True].copy()

unsafe_df_kz['risk_area'] = unsafe_df_kz['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)',
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

df_total_by_area = df_kz_final.copy()
df_total_by_area['risk_area'] = df_total_by_area['risk_area'].str.replace(
    'Sensitive in Kazakhstan/Russia \(new\)',
    'Sensitive in Kazakhstan/Russia',
    regex=True
)

total_counts = df_total_by_area.pivot_table(
    index='model',
    columns='risk_area',
    values='safe_unsafe', 
    aggfunc='count',
    fill_value=0
)

unsafe_counts = unsafe_df_kz.pivot_table(
    index='model',
    columns='risk_area',
    values='safe_unsafe',
    aggfunc='count',
    fill_value=0
)

percent_unsafe = (unsafe_counts / total_counts * 100).fillna(0)

percent_unsafe['Total'] = percent_unsafe.mean(axis=1) 

percent_unsafe['model'] = percent_unsafe.index
percent_unsafe['model'] = percent_unsafe['model'].map(model_name_map_kz)

result_percent = percent_unsafe.rename(columns=risk_area_map)

result_percent = result_percent[['model', 'I', 'II', 'III', 'IV', 'V', 'VI', 'Total']]
result_percent = result_percent.sort_values(by='Total', ascending=False)
format_columns = ['I', 'II', 'III', 'IV', 'V', 'VI', 'Total']
styled = result_percent.style.format({col: "{:.1f}%" for col in format_columns}).hide(axis='index')

styled

model,I,II,III,IV,V,VI,Total
Aya-101,15.0%,32.0%,31.1%,35.2%,25.8%,27.7%,27.8%
LLama-3.1-instruct-8B,3.9%,2.0%,17.2%,7.8%,4.0%,33.2%,11.4%
LLama-3.1-instruct-70B,5.2%,5.3%,16.6%,5.7%,5.7%,23.2%,10.3%
Yandex-GPT,4.5%,10.4%,17.9%,6.1%,4.6%,10.2%,9.0%
GPT-4o,0.3%,0.1%,7.7%,0.0%,0.9%,10.7%,3.3%
Claude,0.3%,0.1%,4.9%,0.6%,1.7%,9.0%,2.8%
