In [1]:
## Add this directory to the path and load our functions
import sys
sys.path.append("../src/")

import polars as pl

import paware

import os

recompute_all_scores = False

# Evaluating Results

We evaluated the performance of 160 different configurations, made up of combinations of the following:

* Embedding Parameters (5 combinations):
  * Chunk sizes: 512, 256, 128
  * With or without meta-data attached before embedding (except 128, to which we never attached the meta-data)
* Query Parameters (4 combinations):
  * Pre-filtering (before querying the database):
    * Filtering out submissions (as opposed to comments)
    * Filtering out "short questions", which were any document shorter than 100 characters and ending in `?`
  * Re-ranking (8 combinations):
    * Moving results with replies that had positive sentiments up, and results with replies that had negative sentiments down
    * Moving results with replies that were close to "agree statements" up
    * Moving results with replies that were close to "disgree statements" down

## Computing Scores

We used modified versions of three metrics to evaluate our configurations. Using a set of 

### Reciprocal Rank Scores

In [2]:
if recompute_all_scores:
    files = [f for f in os.listdir("../full_query_results/") if f[0]!='.']

    score_tool = paware.PawScores("../full_query_results/"+files[0])
    score_tool.compute_rr_scores()

    query_scores = score_tool.get_rr_scores()
    rr_df = pl.DataFrame({"query":query_scores.keys(), "rr_score_"+files[0].split(".")[0]:query_scores.values()})

    for f in files[1:]:
        score_tool = paware.PawScores("../full_query_results/"+f)
        score_tool.compute_rr_scores()
        query_scores = score_tool.get_rr_scores()
        current =  pl.DataFrame({"query":query_scores.keys(),
                                "rr_score_"+f.split(".")[0]:query_scores.values()})
        rr_df = rr_df.join(current, on="query", how="left")

### Modified Extened Reciprocal Rank Scores

In [3]:
if recompute_all_scores:
    files = [f for f in os.listdir("../full_query_results/") if f[0]!='.']

    score_tool = paware.PawScores("../full_query_results/"+files[0])
    score_tool.compute_mext_rr_scores()

    query_scores = score_tool.get_mext_rr_scores()
    mext_rr_df = pl.DataFrame({"query":query_scores.keys(), "mext_rr_score_"+files[0].split(".")[0]:query_scores.values()})

    for f in files[1:]:
        score_tool = paware.PawScores("../full_query_results/"+f)
        score_tool.compute_mext_rr_scores()
        query_scores = score_tool.get_mext_rr_scores()
        current =  pl.DataFrame({"query":query_scores.keys(),
                                "mext_rr_score_"+f.split(".")[0]:query_scores.values()})
        mext_rr_df = mext_rr_df.join(current, on="query", how="left")

### Discounted Cumulative Gain Scores

In [4]:
if recompute_all_scores:
    files = [f for f in os.listdir("../full_query_results/") if f[0]!='.']

    score_tool = paware.PawScores("../full_query_results/"+files[0])
    score_tool.compute_dcg_scores()

    query_scores = score_tool.get_dcg_scores()
    dcg_df = pl.DataFrame({"query":query_scores.keys(), "dcg_score_"+files[0].split(".")[0]:query_scores.values()})

    for f in files[1:]:
        score_tool = paware.PawScores("../full_query_results/"+f)
        score_tool.compute_dcg_scores()
        query_scores = score_tool.get_dcg_scores()
        current =  pl.DataFrame({"query":query_scores.keys(),
                                "dcg_score_"+f.split(".")[0]:query_scores.values()})
        dcg_df = dcg_df.join(current, on="query", how="left")

Saving the results...

In [5]:
if recompute_all_scores:
    rr_df.write_parquet("../full_query_scores/rr_scores.parquet")
    mext_rr_df.write_parquet("../full_query_scores/mext_rr_scores.parquet")
    dcg_df.write_parquet("../full_query_scores/dcg_scores.parquet")

Loading the results...

In [6]:
rr_df = pl.read_parquet("../full_query_scores/rr_scores.parquet")
mext_rr_df = pl.read_parquet("../full_query_scores/mext_rr_scores.parquet")
dcg_df = pl.read_parquet("../full_query_scores/dcg_scores.parquet")

Consolidating and organizing the results...

In [7]:
## Organize scores into columns
rr_scores = rr_df.mean().transpose(include_header=True)
mext_rr_scores = mext_rr_df.mean().transpose(include_header=True)
dcg_scores = dcg_df.mean().transpose(include_header=True)

## Rename columns
rr_scores = rr_scores.rename({"column":"query_configuration", 
                              "column_0":"mean_rr_score"})
mext_rr_scores = mext_rr_scores.rename({"column":"query_configuration", 
                                        "column_0":"mean_mext_rr_score"})
dcg_scores = dcg_scores.rename({"column":"query_configuration", 
                                "column_0":"mean_dcg_score"})

## Break down query configuration into its components
rr_scores  = rr_scores.with_columns(
        pl.col("query_configuration").str.slice(-9).alias("config_code"),
        pl.col("query_configuration").str.slice(-9,2).alias("emb"),
        pl.col("query_configuration").str.slice(-5,1).alias("filter_short_qs"),
        pl.col("query_configuration").str.slice(-4,1).alias("filter_submissions"),
        pl.col("query_configuration").str.slice(-3,1).alias("rerank_sentiment"),
        pl.col("query_configuration").str.slice(-2,1).alias("rerank_agree"),
        pl.col("query_configuration").str.slice(-1,1).alias("rerank_disagree")
        ).filter(pl.col("query_configuration")!="query").clone()

rr_scores = rr_scores.sort(by="mean_rr_score", descending=True)\
        .with_row_index("rr_rank", offset=1).clone()

mext_rr_scores  = mext_rr_scores.with_columns(
        pl.col("query_configuration").str.slice(-9).alias("config_code"),
        pl.col("query_configuration").str.slice(-9,2).alias("emb"),
        pl.col("query_configuration").str.slice(-5,1).alias("filter_short_qs"),
        pl.col("query_configuration").str.slice(-4,1).alias("filter_submissions"),
        pl.col("query_configuration").str.slice(-3,1).alias("rerank_sentiment"),
        pl.col("query_configuration").str.slice(-2,1).alias("rerank_agree"),
        pl.col("query_configuration").str.slice(-1,1).alias("rerank_disagree")
        ).filter(pl.col("query_configuration")!="query").clone()

mext_rr_scores = mext_rr_scores.sort(by="mean_mext_rr_score", descending=True)\
        .with_row_index("mext_rr_rank", offset=1).clone()

dcg_scores  = dcg_scores.with_columns(
        pl.col("query_configuration").str.slice(-9).alias("config_code"),
        pl.col("query_configuration").str.slice(-9,2).alias("emb"),
        pl.col("query_configuration").str.slice(-5,1).alias("filter_short_qs"),
        pl.col("query_configuration").str.slice(-4,1).alias("filter_submissions"),
        pl.col("query_configuration").str.slice(-3,1).alias("rerank_sentiment"),
        pl.col("query_configuration").str.slice(-2,1).alias("rerank_agree"),
        pl.col("query_configuration").str.slice(-1,1).alias("rerank_disagree")
        ).filter(pl.col("query_configuration")!="query").clone()

dcg_scores = dcg_scores.sort(by="mean_dcg_score", descending=True)\
        .with_row_index("dcg_rank", offset=1).clone()

## Reorder columns
rr_col_order = ["config_code"]+[col for col in rr_scores.columns 
                                if (col != "config_code") 
                                and (col != "query_configuration")]

mext_rr_col_order = ["config_code"]+[col for col in mext_rr_scores.columns if 
                                     (col != "config_code") 
                                     and (col != "query_configuration")]

dcg_col_order = ["config_code"]+[col for col in dcg_scores.columns 
                                 if (col != "config_code") 
                                 and (col != "query_configuration")]

rr_scores = rr_scores[rr_col_order].clone()
mext_rr_scores = mext_rr_scores[mext_rr_col_order].clone()
dcg_scores = dcg_scores[dcg_col_order].clone()

## Join the scores into a single dataframe
score_summary = rr_scores.join(
    mext_rr_scores[["config_code", "mean_mext_rr_score", "mext_rr_rank"]], 
    on="config_code", how="left").join(
        dcg_scores[["config_code", "mean_dcg_score", "dcg_rank"]], 
        on="config_code", how="left")

score_summary = score_summary.with_columns(
        pl.col("filter_short_qs").cast(pl.Int32),
        pl.col("filter_submissions").cast(pl.Int32),
        pl.col("rerank_sentiment").cast(pl.Int32),
        pl.col("rerank_agree").cast(pl.Int32),
        pl.col("rerank_disagree").cast(pl.Int32))

score_summary = score_summary.with_columns(
    mean_rank = (pl.col("rr_rank")+pl.col("mext_rr_rank")+pl.col("dcg_rank"))/3)

score_summary = score_summary[[
        'config_code',
        'mean_mext_rr_score',
        'mean_dcg_score',
        'mean_rr_score',
        'rr_rank',
        'mext_rr_rank',
        'dcg_rank',
        'mean_rank',
        'emb',
        'filter_short_qs',
        'filter_submissions',
        'rerank_sentiment',
        'rerank_agree',
        'rerank_disagree']].clone()

## A Quick Look at the Scores

In [8]:
summary_cols = [
    'rr_rank',
    'mext_rr_rank',
    'dcg_rank',
    'mean_rank',
    'emb',
    'filter_short_qs',
    'filter_submissions',
    'rerank_sentiment',
    'rerank_agree',
    'rerank_disagree']

print("Scored configurations, sorted by mean rank across all three metrics:")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.sort(by="mean_rank")[summary_cols])

Scored configurations, sorted by mean rank across all three metrics:
shape: (160, 10)
┌─────────┬──────────────┬──────────┬────────────┬─────┬─────────────────┬────────────────────┬──────────────────┬──────────────┬─────────────────┐
│ rr_rank ┆ mext_rr_rank ┆ dcg_rank ┆ mean_rank  ┆ emb ┆ filter_short_qs ┆ filter_submissions ┆ rerank_sentiment ┆ rerank_agree ┆ rerank_disagree │
│ ---     ┆ ---          ┆ ---      ┆ ---        ┆ --- ┆ ---             ┆ ---                ┆ ---              ┆ ---          ┆ ---             │
│ u64     ┆ u64          ┆ u64      ┆ f64        ┆ str ┆ i32             ┆ i32                ┆ i32              ┆ i32          ┆ i32             │
╞═════════╪══════════════╪══════════╪════════════╪═════╪═════════════════╪════════════════════╪══════════════════╪══════════════╪═════════════════╡
│ 7       ┆ 9            ┆ 1        ┆ 5.666667   ┆ 01  ┆ 1               ┆ 0                  ┆ 1                ┆ 1            ┆ 0               │
│ 1       ┆ 11           ┆

In [9]:
summary_cols = [
    'rr_rank',
    'mext_rr_rank',
    'dcg_rank',
    'mean_rank',
    'emb',
    'filter_short_qs',
    'filter_submissions',
    'rerank_sentiment',
    'rerank_agree',
    'rerank_disagree']

print("Top 10 configurations by mean rank for chunk size of 512 and no metadata")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.filter(pl.col("emb")=="00").sort(by="mean_rank")[summary_cols].head(15))
print("\n\n")

print("Top 10 configurations by mean rank for chunk size of 512 with metadata")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.filter(pl.col("emb")=="01").sort(by="mean_rank")[summary_cols].head(15))
print("\n\n")

print("Top 10 configurations by mean rank for chunk size of 256 and no metadata")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.filter(pl.col("emb")=="02").sort(by="mean_rank")[summary_cols].head(15))
print("\n\n")

print("Top 10 configurations by mean rank for chunk size of 256 with metadata")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.filter(pl.col("emb")=="03").sort(by="mean_rank")[summary_cols].head(15))
print("\n\n")

print("Top 10 configurations by mean rank for chunk size of 128 and no metadata")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(score_summary.filter(pl.col("emb")=="04").sort(by="mean_rank")[summary_cols].head(15))
print("\n\n")

Top 10 configurations by mean rank for chunk size of 512 and no metadata
shape: (15, 10)
┌─────────┬──────────────┬──────────┬───────────┬─────┬─────────────────┬────────────────────┬──────────────────┬──────────────┬─────────────────┐
│ rr_rank ┆ mext_rr_rank ┆ dcg_rank ┆ mean_rank ┆ emb ┆ filter_short_qs ┆ filter_submissions ┆ rerank_sentiment ┆ rerank_agree ┆ rerank_disagree │
│ ---     ┆ ---          ┆ ---      ┆ ---       ┆ --- ┆ ---             ┆ ---                ┆ ---              ┆ ---          ┆ ---             │
│ u64     ┆ u64          ┆ u64      ┆ f64       ┆ str ┆ i32             ┆ i32                ┆ i32              ┆ i32          ┆ i32             │
╞═════════╪══════════════╪══════════╪═══════════╪═════╪═════════════════╪════════════════════╪══════════════════╪══════════════╪═════════════════╡
│ 9       ┆ 57           ┆ 17       ┆ 27.666667 ┆ 00  ┆ 1               ┆ 0                  ┆ 0                ┆ 0            ┆ 1               │
│ 10      ┆ 58           ┆ 18

In [10]:
print("Configurations by Mean Reciprocal Rank score:")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(rr_scores.sort("mean_rr_score", descending=True).with_row_index())

Configurations by Mean Reciprocal Rank score:
shape: (160, 10)
┌───────┬─────────────┬─────────┬─────────────────────┬─────┬─────────────────┬────────────────────┬──────────────────┬──────────────┬─────────────────┐
│ index ┆ config_code ┆ rr_rank ┆ mean_rr_score       ┆ emb ┆ filter_short_qs ┆ filter_submissions ┆ rerank_sentiment ┆ rerank_agree ┆ rerank_disagree │
│ ---   ┆ ---         ┆ ---     ┆ ---                 ┆ --- ┆ ---             ┆ ---                ┆ ---              ┆ ---          ┆ ---             │
│ u64   ┆ str         ┆ u64     ┆ str                 ┆ str ┆ str             ┆ str                ┆ str              ┆ str          ┆ str             │
╞═══════╪═════════════╪═════════╪═════════════════════╪═════╪═════════════════╪════════════════════╪══════════════════╪══════════════╪═════════════════╡
│ 0     ┆ 010010000   ┆ 1       ┆ 0.743040293040293   ┆ 01  ┆ 1               ┆ 0                  ┆ 0                ┆ 0            ┆ 0               │
│ 1     ┆ 010010010

In [11]:
print("Configurations by Mean Extended Reciprocal Rank score:")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(mext_rr_scores.sort("mean_mext_rr_score", descending=True).with_row_index())

Configurations by Mean Extended Reciprocal Rank score:
shape: (160, 10)
┌───────┬─────────────┬──────────────┬─────────────────────┬─────┬─────────────────┬────────────────────┬──────────────────┬──────────────┬─────────────────┐
│ index ┆ config_code ┆ mext_rr_rank ┆ mean_mext_rr_score  ┆ emb ┆ filter_short_qs ┆ filter_submissions ┆ rerank_sentiment ┆ rerank_agree ┆ rerank_disagree │
│ ---   ┆ ---         ┆ ---          ┆ ---                 ┆ --- ┆ ---             ┆ ---                ┆ ---              ┆ ---          ┆ ---             │
│ u64   ┆ str         ┆ u64          ┆ str                 ┆ str ┆ str             ┆ str                ┆ str              ┆ str          ┆ str             │
╞═══════╪═════════════╪══════════════╪═════════════════════╪═════╪═════════════════╪════════════════════╪══════════════════╪══════════════╪═════════════════╡
│ 0     ┆ 010010101   ┆ 1            ┆ 0.6067062047533772  ┆ 01  ┆ 1               ┆ 0                  ┆ 1                ┆ 0            

In [12]:
print("Configurations by Mean Discounted Cumulative Gain score:")
with pl.Config(tbl_rows=40, tbl_cols=10, tbl_width_chars=180, fmt_str_lengths=180):
    print(dcg_scores.sort("mean_dcg_score", descending=True).with_row_index())

Configurations by Mean Discounted Cumulative Gain score:
shape: (160, 10)
┌───────┬─────────────┬──────────┬────────────────────┬─────┬─────────────────┬────────────────────┬──────────────────┬──────────────┬─────────────────┐
│ index ┆ config_code ┆ dcg_rank ┆ mean_dcg_score     ┆ emb ┆ filter_short_qs ┆ filter_submissions ┆ rerank_sentiment ┆ rerank_agree ┆ rerank_disagree │
│ ---   ┆ ---         ┆ ---      ┆ ---                ┆ --- ┆ ---             ┆ ---                ┆ ---              ┆ ---          ┆ ---             │
│ u64   ┆ str         ┆ u64      ┆ str                ┆ str ┆ str             ┆ str                ┆ str              ┆ str          ┆ str             │
╞═══════╪═════════════╪══════════╪════════════════════╪═════╪═════════════════╪════════════════════╪══════════════════╪══════════════╪═════════════════╡
│ 0     ┆ 010010110   ┆ 1        ┆ 0.8064761372042809 ┆ 01  ┆ 1               ┆ 0                  ┆ 1                ┆ 1            ┆ 0               │
│ 1     