In [1]:
import pandas as pd
import ast

df = pd.read_csv("../data/rag_evaluation_comparison.csv", na_filter=False)

In [2]:
df['best_response'].value_counts()

best_response
None    1105
B        649
A        591
Name: count, dtype: int64

In [3]:
def best_config(row, option: str, best: bool = True):
    """
    Determines the config for a given option for a each row.
    
    Parameters:
    row: pd.Series
        A row from a pandas DataFrame
    option: str
        The option to determine the best config for (search_type, n_results, ...)
    best: bool
        If True, returns the best config, otherwise the worst config
    """
    player_A_config = ast.literal_eval(row['player_A_config'])
    player_B_config = ast.literal_eval(row['player_B_config'])

    assert(option in player_A_config.keys())

    best_config = row['best_response']

    if best_config == 'A':
        if best:
            return player_A_config[option]
        return player_B_config[option]
    elif best_config == 'B':
        if best:
            return player_B_config[option]
        return player_A_config[option]
    else:
        return None

In [4]:
for key in ast.literal_eval(df['player_A_config'][0]).keys():
    print(key)
    print(df.apply(lambda row: best_config(row, key), axis=1).value_counts())
    print()

search_type
typesense          520
semantic           379
semantic_rerank    341
Name: count, dtype: int64

use_classifier
False    656
True     584
Name: count, dtype: int64

n_results
5.0    263
3.0    259
1.0    248
4.0    243
2.0    227
Name: count, dtype: int64

model_kwargs
{'temperature': 0.0}    320
{'temperature': 0.1}    309
{'temperature': 0.5}    306
{'temperature': 0.8}    305
Name: count, dtype: int64



In [5]:
df.apply(lambda row: best_config(row, "search_type"), axis=1).value_counts()

typesense          520
semantic           379
semantic_rerank    341
Name: count, dtype: int64

# Determine who is losing most

In [6]:
df_compare = pd.DataFrame({
    "best": df.apply(lambda row: best_config(row, "search_type", best=True), axis=1),
    "worst": df.apply(lambda row: best_config(row, "search_type", best=False), axis=1)
})

df_compare[df_compare['best'] != df_compare['worst']]

Unnamed: 0,best,worst
0,,
1,typesense,semantic_rerank
2,typesense,semantic_rerank
3,,
4,semantic_rerank,typesense
...,...,...
2337,semantic,typesense
2338,,
2339,,
2340,,


In [7]:
df_compare

Unnamed: 0,best,worst
0,,
1,typesense,semantic_rerank
2,typesense,semantic_rerank
3,,
4,semantic_rerank,typesense
...,...,...
2340,,
2341,semantic,semantic
2342,typesense,typesense
2343,semantic_rerank,semantic_rerank


In [8]:
df_compare = df_compare[df_compare['best'] != df_compare['worst']]

In [9]:
pd.concat([df_compare['best'], df_compare['worst']]).value_counts()

typesense          598
semantic_rerank    545
semantic           535
Name: count, dtype: int64

In [10]:
df

Unnamed: 0,question,answer,type,player_A_config,player_B_config,player_A_response,player_B_response,prompt,best_response
0,What is the deadline for transfer students to ...,The deadline for transfer students to apply fo...,test,"{'search_type': 'typesense', 'use_classifier':...","{'search_type': 'semantic', 'use_classifier': ...",The application deadline for transfer students...,"I'm sorry, I don't know.","[{'role': 'system', 'content': 'You are a help...",
1,Who is authorized to request verification of i...,University Police Officers have the authority ...,train,"{'search_type': 'typesense', 'use_classifier':...","{'search_type': 'semantic_rerank', 'use_classi...","Any faculty member, employee of the University...","I'm sorry, I don't know.","[{'role': 'system', 'content': 'You are a help...",A
2,What does a probationary appointment at SUNY B...,A probationary appointment at SUNY Brockport i...,train,"{'search_type': 'semantic_rerank', 'use_classi...","{'search_type': 'typesense', 'use_classifier':...",A probationary appointment at SUNY Brockport t...,A probationary appointment at SUNY Brockport i...,"[{'role': 'system', 'content': 'You are a help...",B
3,What characteristics do you seek in a Peer Men...,We're excited to find Peer Mentors who are pro...,train,"{'search_type': 'semantic', 'use_classifier': ...","{'search_type': 'semantic', 'use_classifier': ...",We seek Peer Mentors at SUNY Brockport who are...,"We seek Peer Mentors who are inclusive, inspir...","[{'role': 'system', 'content': 'You are a help...",
4,What are some effective strategies for prepari...,To prepare effectively for an interview at SUN...,test,"{'search_type': 'semantic_rerank', 'use_classi...","{'search_type': 'typesense', 'use_classifier':...","Researching the university, understanding the ...","To prepare for an interview at SUNY Brockport,...","[{'role': 'system', 'content': 'You are a help...",A
...,...,...,...,...,...,...,...,...,...
2340,Are there any internships available in the fie...,Absolutely! SUNY Brockport provides a wealth o...,test,"{'search_type': 'semantic_rerank', 'use_classi...","{'search_type': 'typesense', 'use_classifier':...","Yes, SUNY Brockport offers internships in vari...","Yes, there are internships available in the fi...","[{'role': 'system', 'content': 'You are a help...",
2341,How can I view the Group Exercise Schedule?,You can view our Group Exercise Schedule by vi...,test,"{'search_type': 'semantic', 'use_classifier': ...","{'search_type': 'semantic', 'use_classifier': ...",You can view the Group Exercise Schedule on SU...,You can view the Group Exercise Schedule on th...,"[{'role': 'system', 'content': 'You are a help...",A
2342,Where can I get help with my resume and cover ...,You can get professional help with your resume...,test,"{'search_type': 'typesense', 'use_classifier':...","{'search_type': 'typesense', 'use_classifier':...",You can get help with your resume and cover le...,You can get help with your resume and cover le...,"[{'role': 'system', 'content': 'You are a help...",A
2343,"How do I enroll in the Equity, Diversity, and ...",We're thrilled to hear about your interest! Yo...,train,"{'search_type': 'semantic_rerank', 'use_classi...","{'search_type': 'semantic_rerank', 'use_classi...","To enroll in the Equity, Diversity, and Inclus...","To enroll in the Equity, Diversity, and Inclus...","[{'role': 'system', 'content': 'You are a help...",B


In [11]:
df_compare['best_vs_worst'] = df_compare['best'] + " vs " + df_compare['worst']
df_compare['best_vs_worst'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_compare['best_vs_worst'] = df_compare['best'] + " vs " + df_compare['worst']


best_vs_worst
typesense vs semantic_rerank    198
typesense vs semantic           166
semantic vs semantic_rerank     133
semantic vs typesense           128
semantic_rerank vs semantic     108
semantic_rerank vs typesense    106
Name: count, dtype: int64

# Determine who the players are for each winner

In [12]:
res_A = pd.DataFrame(
    [pd.Series({f"A_{k}": v for k, v in ast.literal_eval(df['player_A_config'][i]).items() if k != "model_kwargs"}) for i in range(len(df))]
)
res_B = pd.DataFrame(
    [pd.Series({f"B_{k}": v for k, v in ast.literal_eval(df['player_B_config'][i]).items() if k != "model_kwargs"}) for i in range(len(df))]
)

res = pd.concat([res_A, res_B], axis=1)
res['best'] = df['best_response']
res['type'] = df['type']

no_na_res = res[res['best'].isin(["A", "B"])].reset_index(drop=True)
no_na_res

Unnamed: 0,A_search_type,A_use_classifier,A_n_results,B_search_type,B_use_classifier,B_n_results,best,type
0,typesense,False,4,semantic_rerank,True,4,A,train
1,semantic_rerank,False,5,typesense,True,5,B,train
2,semantic_rerank,True,5,typesense,False,4,A,test
3,typesense,True,5,typesense,True,2,A,test
4,typesense,False,3,semantic_rerank,False,1,B,train
...,...,...,...,...,...,...,...,...
1235,typesense,True,2,typesense,False,4,B,test
1236,semantic,True,3,typesense,False,1,A,train
1237,semantic,False,5,semantic,True,4,A,test
1238,typesense,False,4,typesense,False,3,A,test


In [13]:
def get_player_A_winrate(option_A, option_B):
    ab_df = no_na_res[(no_na_res['A_search_type'] == option_A) & (no_na_res['B_search_type'] == option_B)]
    return round(ab_df['best'].apply(lambda x: 1 if x == "A" else 0).mean()*100,2)

def get_player_B_winrate(option_A, option_B):
    ab_df = no_na_res[(no_na_res['A_search_type'] == option_A) & (no_na_res['B_search_type'] == option_B)]
    return round(ab_df['best'].apply(lambda x: 1 if x == "B" else 0).mean()*100,2)


In [14]:
def get_overall_wr(option_A, option_B, winner):
    
    assert winner in [option_A, option_B]

    if option_A == option_B:
        return -1

    tmp_df = no_na_res[
        (no_na_res['A_search_type'] == option_A) & (no_na_res['B_search_type'] == option_B) |
        (no_na_res['A_search_type'] == option_B) & (no_na_res['B_search_type'] == option_A)
    ]

    def map_best_to_search_type(row):
        if row['best'] == "A":
            return row['A_search_type']
        return row['B_search_type']

    tmp_df['best_search_type'] = tmp_df.apply(lambda row: map_best_to_search_type(row), axis=1)
    wr = (tmp_df['best_search_type'] == winner).mean()
    return round(wr*100,2)

In [15]:
def get_how_many_players_n(option_A, option_B):
    total = len(no_na_res[(no_na_res['A_search_type'] == option_A) & (no_na_res['B_search_type'] == option_B)])
    total += len(no_na_res[(no_na_res['A_search_type'] == option_B) & (no_na_res['B_search_type'] == option_A)])

    return total

In [16]:
get_how_many_players_n("typesense", "semantic")

294

In [18]:
no_na_res

Unnamed: 0,A_search_type,A_use_classifier,A_n_results,B_search_type,B_use_classifier,B_n_results,best,type
0,typesense,False,4,semantic_rerank,True,4,A,train
1,semantic_rerank,False,5,typesense,True,5,B,train
2,semantic_rerank,True,5,typesense,False,4,A,test
3,typesense,True,5,typesense,True,2,A,test
4,typesense,False,3,semantic_rerank,False,1,B,train
...,...,...,...,...,...,...,...,...
1235,typesense,True,2,typesense,False,4,B,test
1236,semantic,True,3,typesense,False,1,A,train
1237,semantic,False,5,semantic,True,4,A,test
1238,typesense,False,4,typesense,False,3,A,test


In [19]:

eval_df = pd.DataFrame({
    "Winner": sorted(["typesense", "semantic", "semantic_rerank"]*3),
    "Loser": ["semantic", "semantic_rerank", "typesense"]*3
})

eval_df
eval_df['overall_WR'] = eval_df.apply(lambda row: get_overall_wr(row['Winner'], row['Loser'], row['Winner']), axis=1)
eval_df['player_A_WR'] = eval_df.apply(lambda row: get_player_A_winrate(row['Winner'], row['Loser']), axis=1)
eval_df['player_B_WR'] = eval_df.apply(lambda row: get_player_B_winrate(row['Loser'], row['Winner']), axis=1)

eval_df['n_times_played'] = eval_df.apply(lambda row: get_how_many_players_n(row['Winner'], row['Loser']), axis=1)


# eval_df.to_csv("../data/rag_evaluation_table.csv", index=False)
eval_df

Unnamed: 0,Winner,Loser,overall_WR,player_A_WR,player_B_WR,n_times_played
0,semantic,semantic,-1.0,56.78,43.22,236
1,semantic,semantic_rerank,55.19,51.38,58.33,241
2,semantic,typesense,43.54,36.49,50.68,294
3,semantic_rerank,semantic,44.81,41.67,48.62,241
4,semantic_rerank,semantic_rerank,-1.0,51.97,48.03,254
5,semantic_rerank,typesense,34.87,31.85,38.1,304
6,typesense,semantic,56.46,49.32,63.51,294
7,typesense,semantic_rerank,65.13,61.9,68.15,304
8,typesense,typesense,-1.0,51.28,48.72,312


In [20]:
(eval_df['player_B_WR'] - eval_df['player_A_WR'])

0   -13.56
1     6.95
2    14.19
3     6.95
4    -3.94
5     6.25
6    14.19
7     6.25
8    -2.56
dtype: float64

In [21]:
eval_df

Unnamed: 0,Winner,Loser,overall_WR,player_A_WR,player_B_WR,n_times_played
0,semantic,semantic,-1.0,56.78,43.22,236
1,semantic,semantic_rerank,55.19,51.38,58.33,241
2,semantic,typesense,43.54,36.49,50.68,294
3,semantic_rerank,semantic,44.81,41.67,48.62,241
4,semantic_rerank,semantic_rerank,-1.0,51.97,48.03,254
5,semantic_rerank,typesense,34.87,31.85,38.1,304
6,typesense,semantic,56.46,49.32,63.51,294
7,typesense,semantic_rerank,65.13,61.9,68.15,304
8,typesense,typesense,-1.0,51.28,48.72,312


# About the Topic Classifier

In [22]:
def map_best_to_classifier_type(row):
    if row['best'] == "A":
        return row['A_use_classifier']
    return row['B_use_classifier']

In [23]:
no_na_res

Unnamed: 0,A_search_type,A_use_classifier,A_n_results,B_search_type,B_use_classifier,B_n_results,best,type
0,typesense,False,4,semantic_rerank,True,4,A,train
1,semantic_rerank,False,5,typesense,True,5,B,train
2,semantic_rerank,True,5,typesense,False,4,A,test
3,typesense,True,5,typesense,True,2,A,test
4,typesense,False,3,semantic_rerank,False,1,B,train
...,...,...,...,...,...,...,...,...
1235,typesense,True,2,typesense,False,4,B,test
1236,semantic,True,3,typesense,False,1,A,train
1237,semantic,False,5,semantic,True,4,A,test
1238,typesense,False,4,typesense,False,3,A,test


In [24]:
no_na_res.apply(lambda row: map_best_to_classifier_type(row), axis=1).mean()

0.47096774193548385

In [25]:
no_na_res.groupby("type").apply(lambda x: x.apply(lambda row: map_best_to_classifier_type(row), axis=1).mean())

type
test     0.475207
train    0.464981
dtype: float64

In [26]:
cnts_ab = no_na_res['type'].value_counts()
cnts_ab['test']/cnts_ab.sum()

0.5854838709677419

In [27]:
cnts_none = res[res['best'] == "None"]['type'].value_counts()
cnts_none['test']/cnts_none.sum()

0.5601809954751131

In [28]:
100-round(len(no_na_res)/len(df)*100,2)

47.12

In [29]:
res[(res['A_use_classifier'] == True) & (res['B_use_classifier'] == False)]['best'].value_counts()

best
None    271
B       200
A       153
Name: count, dtype: int64

In [30]:
res[(res['A_use_classifier'] == False) & (res['B_use_classifier'] == False)]['best'].value_counts()["None"]/res[(res['A_use_classifier'] == False) & (res['B_use_classifier'] == False)]['best'].value_counts().sum()

0.5102389078498294

In [31]:
res[(res['A_use_classifier'] == True) & (res['B_use_classifier'] == False)]['best'].value_counts()["None"]/res[(res['A_use_classifier'] == True) & (res['B_use_classifier'] == False)]['best'].value_counts().sum()

0.4342948717948718

In [32]:
res[(res['A_use_classifier'] == False) & (res['B_use_classifier'] == True)]['best'].value_counts()["None"]/res[(res['A_use_classifier'] == False) & (res['B_use_classifier'] == True)]['best'].value_counts().sum()

0.4506065857885615

In [33]:
res[(res['A_use_classifier'] == True) & (res['B_use_classifier'] == True)]['best'].value_counts()["None"]/res[(res['A_use_classifier'] == True) & (res['B_use_classifier'] == True)]['best'].value_counts().sum()

0.492831541218638

In [34]:
def get_classifier_prop_none_eval(A_classifier: bool, B_classifier: bool):
    cnts = res[(res['A_use_classifier'] == A_classifier) & (res['B_use_classifier'] == B_classifier)]['best'].value_counts()
    return cnts.sum(), round(cnts["None"]/cnts.sum()*100,2)

In [35]:
options = [[False, False], [False, True], [True, False], [True, True]]

for option in options:
    print(option)
    print(get_classifier_prop_none_eval(option[0], option[1]))
    print()

[False, False]
(586, 51.02)

[False, True]
(577, 45.06)

[True, False]
(624, 43.43)

[True, True]
(558, 49.28)



In [36]:
res['best'].apply(lambda x: True if x == "None" else False).describe()

count      2345
unique        2
top       False
freq       1240
Name: best, dtype: object

In [37]:
res['best'].apply(lambda x: True if x == "None" else False).mean()

0.47121535181236673

# Making some graphs

In [38]:
best_config(df.iloc[0], "search_type")

In [39]:
no_na_res

Unnamed: 0,A_search_type,A_use_classifier,A_n_results,B_search_type,B_use_classifier,B_n_results,best,type
0,typesense,False,4,semantic_rerank,True,4,A,train
1,semantic_rerank,False,5,typesense,True,5,B,train
2,semantic_rerank,True,5,typesense,False,4,A,test
3,typesense,True,5,typesense,True,2,A,test
4,typesense,False,3,semantic_rerank,False,1,B,train
...,...,...,...,...,...,...,...,...
1235,typesense,True,2,typesense,False,4,B,test
1236,semantic,True,3,typesense,False,1,A,train
1237,semantic,False,5,semantic,True,4,A,test
1238,typesense,False,4,typesense,False,3,A,test


In [40]:
df_compare['best'].value_counts()

best
typesense          364
semantic           261
semantic_rerank    214
Name: count, dtype: int64

In [41]:
df.head(2)

Unnamed: 0,question,answer,type,player_A_config,player_B_config,player_A_response,player_B_response,prompt,best_response
0,What is the deadline for transfer students to ...,The deadline for transfer students to apply fo...,test,"{'search_type': 'typesense', 'use_classifier':...","{'search_type': 'semantic', 'use_classifier': ...",The application deadline for transfer students...,"I'm sorry, I don't know.","[{'role': 'system', 'content': 'You are a help...",
1,Who is authorized to request verification of i...,University Police Officers have the authority ...,train,"{'search_type': 'typesense', 'use_classifier':...","{'search_type': 'semantic_rerank', 'use_classi...","Any faculty member, employee of the University...","I'm sorry, I don't know.","[{'role': 'system', 'content': 'You are a help...",A


In [42]:
df['best'] = df.apply(lambda row: best_config(row, "search_type"), axis=1).value_counts()

In [52]:
def get_overall_wr_with_respect_to_classifier(option_A, option_B, winner, winner_classifier: bool):
    
    assert winner in [option_A, option_B]

    if option_A == option_B:
        return -1
    
    tmp_df = no_na_res[
        ((no_na_res['A_search_type'] == option_A) & (no_na_res['B_search_type'] == option_B) |
        (no_na_res['A_search_type'] == option_B) & (no_na_res['B_search_type'] == option_A))
    ]

    if winner_classifier:
        if winner == option_A:
            tmp_df = tmp_df[tmp_df['A_use_classifier'] == True]
        elif winner == option_B:
            tmp_df = tmp_df[tmp_df['B_use_classifier'] == True]

    def map_best_to_search_type(row):
        if row['best'] == "A":
            return row['A_search_type']
        return row['B_search_type']

    tmp_df['best_search_type'] = tmp_df.apply(lambda row: map_best_to_search_type(row), axis=1)
    wr = (tmp_df['best_search_type'] == winner).mean()
    return round(wr*100,2)

In [53]:
eval_df.apply(lambda row: get_overall_wr_with_respect_to_classifier(row['Winner'], row['Loser'], row['Winner'], True), axis=1)

0    -1.00
1    54.70
2    45.10
3    45.30
4    -1.00
5    30.67
6    54.90
7    69.33
8    -1.00
dtype: float64

In [54]:
eval_df.apply(lambda row: get_overall_wr_with_respect_to_classifier(row['Winner'], row['Loser'], row['Winner'], False), axis=1)

0    -1.00
1    55.19
2    43.54
3    44.81
4    -1.00
5    34.87
6    56.46
7    65.13
8    -1.00
dtype: float64

In [58]:
no_na_res

Unnamed: 0,A_search_type,A_use_classifier,A_n_results,B_search_type,B_use_classifier,B_n_results,best,type
0,typesense,False,4,semantic_rerank,True,4,A,train
1,semantic_rerank,False,5,typesense,True,5,B,train
2,semantic_rerank,True,5,typesense,False,4,A,test
3,typesense,True,5,typesense,True,2,A,test
4,typesense,False,3,semantic_rerank,False,1,B,train
...,...,...,...,...,...,...,...,...
1235,typesense,True,2,typesense,False,4,B,test
1236,semantic,True,3,typesense,False,1,A,train
1237,semantic,False,5,semantic,True,4,A,test
1238,typesense,False,4,typesense,False,3,A,test


In [63]:
# Determine wins for each search type and classifier usage
wins = no_na_res[['A_search_type', 'A_use_classifier', 'B_search_type', 'B_use_classifier', 'best']]
wins_A = wins[wins['best'] == 'A'].groupby(['A_search_type', 'A_use_classifier']).size().reset_index(name='wins')
wins_B = wins[wins['best'] == 'B'].groupby(['B_search_type', 'B_use_classifier']).size().reset_index(name='wins')

# Rename columns for a unified view
wins_A.rename(columns={'A_search_type': 'search_type', 'A_use_classifier': 'use_classifier'}, inplace=True)
wins_B.rename(columns={'B_search_type': 'search_type', 'B_use_classifier': 'use_classifier'}, inplace=True)

# Concatenate and aggregate wins
total_wins = pd.concat([wins_A, wins_B]).groupby(['search_type', 'use_classifier']).sum().reset_index()

# Calculate total occurrences of each search type and classifier usage
occurrences_A = no_na_res.groupby(['A_search_type', 'A_use_classifier']).size().reset_index(name='occurrences')
occurrences_B = no_na_res.groupby(['B_search_type', 'B_use_classifier']).size().reset_index(name='occurrences')
occurrences_A.rename(columns={'A_search_type': 'search_type', 'A_use_classifier': 'use_classifier'}, inplace=True)
occurrences_B.rename(columns={'B_search_type': 'search_type', 'B_use_classifier': 'use_classifier'}, inplace=True)
total_occurrences = pd.concat([occurrences_A, occurrences_B]).groupby(['search_type', 'use_classifier']).sum().reset_index()

# Merge wins with occurrences
winrate_data = pd.merge(total_wins, total_occurrences, on=['search_type', 'use_classifier'])

# Calculate win rate
winrate_data['win_rate'] = round(100*winrate_data['wins'] / winrate_data['occurrences'], 2)

# Add total occurrences to the winrate_data for visibility
winrate_data['total_occurrences'] = winrate_data['occurrences']

winrate_data[['search_type', 'use_classifier', 'win_rate', 'total_occurrences']].to_csv("")

Unnamed: 0,search_type,use_classifier,win_rate,total_occurrences
0,semantic,False,53.0,400
1,semantic,True,45.01,371
2,semantic_rerank,False,48.68,380
3,semantic_rerank,True,37.23,419
4,typesense,False,55.82,464
5,typesense,True,58.52,446


In [62]:
eval_df

Unnamed: 0,Winner,Loser,overall_WR,player_A_WR,player_B_WR,n_times_played
0,semantic,semantic,-1.0,56.78,43.22,236
1,semantic,semantic_rerank,55.19,51.38,58.33,241
2,semantic,typesense,43.54,36.49,50.68,294
3,semantic_rerank,semantic,44.81,41.67,48.62,241
4,semantic_rerank,semantic_rerank,-1.0,51.97,48.03,254
5,semantic_rerank,typesense,34.87,31.85,38.1,304
6,typesense,semantic,56.46,49.32,63.51,294
7,typesense,semantic_rerank,65.13,61.9,68.15,304
8,typesense,typesense,-1.0,51.28,48.72,312
