# Calculate Search Metrics with Dynamic Optimizer

This notebook loads the models that were built in the previous notebook and test how much the queries improve when using a pipeline with the predicted "neuralness" value.

1. Get the models
2. Get the queries
3. Get the ratings file
4. Get the predicted "neuralness" scores for each query from each model
5. Run the queries (two runs per query; one per model)
    1. Create the pipeline with the according "neuralness" and "keywordness" values
    2. Run the query once per model type
    3. Store the results
7. Calculate search metrics
8. Compare search metrics with the best metrics of the notebook that explored all hybrid search configurations 

In [155]:
import pickle
import pandas as pd
import requests
import json
import uuid
import numpy as np
import plotly.express as px
from tqdm.notebook import tqdm_notebook

## Load models from the previous notebook

In [156]:
# load the two models
with open('regression_model.pkl', 'rb') as fid:
    regr = pickle.load(fid)

with open('random_forest_model.pkl', 'rb') as fid:
    random_forest = pickle.load(fid)

In [157]:
regr

In [158]:
random_forest

In [159]:
df_test_data = pd.read_csv('dynamic_optimizer_test_data.csv')

In [160]:
df_test_data.head(5)

Unnamed: 0,query_string,ndcg,neuralness,f_1_num_of_terms,f_2_query_length,f_3_has_numbers,f_4_has_special_char,f_5_num_results,f_6_max_title_score,f_7_sum_title_scores,f_8_max_semantic_score,f_9_avg_semantic_score
0,- we are not such things,0.526941,0.7,6,24,0,1,613,3.378685,5.338386,0.511999,0.482098
1,02cool spray water bottle for drinking not fla...,0.171822,0.0,10,56,1,0,0,0.0,0.0,0.53525,0.528126
2,0pi nail polish i'm not really a waitress,0.136227,1.0,8,41,1,1,0,0.0,0.0,0.669153,0.605961
3,1 14 brown leather belt without buckle,0.061011,0.7,7,38,1,0,1,0.523058,0.523058,0.667885,0.651717
4,1 flexible 8x10 mirror not sheet,0.258539,1.0,6,32,1,0,0,0.0,0.0,0.581345,0.566256


In [161]:
# Get all ratings
DATA_DIR = '/Users/danielwrigley/work/Testing/git_repos/esci-data/shopping_queries_dataset/'

In [162]:
df_examples = pd.read_parquet(DATA_DIR + '/shopping_queries_dataset_examples.parquet')

In [163]:
# Select judgments
# Map esci_label to score
# create judgments per day in range
# create noise in score

label_num = {"E": 0, "S": 1, "C": 2, "I": 3}
label_score = [3, 2, 1, 0]
#label_score = [1, 0.1, 0.01, 0]

def label_to_score(label):
    return label_score[label_num[label]]

df_ratings = df_examples[df_examples["query"].isin(set(df_test_data["query_string"].values))].copy()
df_ratings["judgment"] = df_ratings.esci_label.apply(lambda x: label_to_score(x))
df_ratings["document"] = df_ratings.product_id
df_ratings = df_ratings[["query", "document", "judgment"]].reset_index(drop=True)
df_ratings.head(3)

Unnamed: 0,query,document,judgment
0,- we are not such things,1544341156,0
1,- we are not such things,1736413902,0
2,- we are not such things,1543971679,0


In [164]:
df_test_data.shape[0]

1000

## Predict the "neuralness" with both models

In [165]:
regr.feature_names_in_

array(['f_1_num_of_terms', 'f_3_has_numbers', 'f_5_num_results',
       'f_6_max_title_score', 'f_7_sum_title_scores',
       'f_8_max_semantic_score', 'f_9_avg_semantic_score'], dtype=object)

In [166]:
random_forest.feature_names_in_

array(['f_1_num_of_terms', 'f_3_has_numbers', 'f_4_has_special_char',
       'f_5_num_results', 'f_6_max_title_score', 'f_7_sum_title_scores',
       'f_8_max_semantic_score', 'f_9_avg_semantic_score'], dtype=object)

In [170]:
#feature_columns = [ "f_1_num_of_terms", "f_2_query_length", "f_3_has_numbers", "f_4_has_special_char", "f_5_num_results",
#                   "f_6_max_title_score", "f_7_sum_title_scores", "f_8_max_semantic_score",  "f_9_avg_semantic_score"]
feature_columns_lr = regr.feature_names_in_
feature_columns_rf = random_forest.feature_names_in_
def get_linear_model_prediction(row):
    df_row = pd.DataFrame([row[feature_columns_lr]], columns=feature_columns_lr)
    return regr.predict(df_row)[0]

def get_random_forest_prediction(row):
    df_row = pd.DataFrame([row[feature_columns_lr]], columns=feature_columns_rf)
    return random_forest.predict(df_row)[0]

In [171]:
# make predictions with the two models ensuring that the minimum is not below 0 and the maximum not above 1

df_test_data['linear_model'] = df_test_data.apply(
    lambda row: min(np.max(get_linear_model_prediction(row), 0), 1), axis=1
)

df_test_data['random_forest'] = df_test_data.apply(
    lambda row: min(np.max(get_random_forest_prediction(row), 0), 1), axis=1
)

In [172]:
df_test_data.head(5)

Unnamed: 0,query_string,ndcg,neuralness,f_1_num_of_terms,f_2_query_length,f_3_has_numbers,f_4_has_special_char,f_5_num_results,f_6_max_title_score,f_7_sum_title_scores,f_8_max_semantic_score,f_9_avg_semantic_score,linear_model,random_forest
0,- we are not such things,0.526941,0.7,6,24,0,1,613,3.378685,5.338386,0.511999,0.482098,0.795221,0.725082
1,02cool spray water bottle for drinking not fla...,0.171822,0.0,10,56,1,0,0,0.0,0.0,0.53525,0.528126,0.85203,0.84808
2,0pi nail polish i'm not really a waitress,0.136227,1.0,8,41,1,1,0,0.0,0.0,0.669153,0.605961,0.798135,0.773145
3,1 14 brown leather belt without buckle,0.061011,0.7,7,38,1,0,1,0.523058,0.523058,0.667885,0.651717,0.800604,0.736573
4,1 flexible 8x10 mirror not sheet,0.258539,1.0,6,32,1,0,0,0.0,0.0,0.581345,0.566256,0.74996,0.702602


## Run the queries of the test set

* Retrieve the model id
* For each query term of the test set:
  * Create the pipeline with the predicted "neuralness" value and the calculated "keywordness" value (`1-"neuralness"`)
  * Run the query once per model type

In [173]:
# Get model_id
# We are assuming that the installation has only one model. Change this if you have more models 
# and need to pick a specific one

headers = {
    'Content-Type': 'application/json'
}

def get_model_id():
    url = "http://localhost:9200/_plugins/_ml/models/_search"
   
    payload = {
      "query": {
        "match_all": {}
      },
      "size": 1
    }
    
    response = requests.request("POST", url, headers=headers, data=json.dumps(payload))

    return response.json()['hits']['hits'][0]['_source']['model_id']

model_id = get_model_id()

In [174]:
models = ['linear_model', 'random_forest']
df_relevance = pd.DataFrame()

# iterate over all query strings, create a pipeline with the predicted "neuralness"
# and send a hybrid search query to OpenSearch with the set pipeline

for query in tqdm_notebook(df_test_data.itertuples()):
    for model in models:
        neuralness = df_test_data.loc[df_test_data['query_string'] == query[1], model].iloc[0]
        neuralness = round(neuralness, 2)
        keywordness = 1 - neuralness   
        # Set pipeline 
        url = "http://localhost:9200/ecommerce/_search"
        payload = {
          "_source": {
            "excludes": [
              "title_embedding"
            ]
          },
          "query": {
            "hybrid": {
              "queries": [
                {
                  "multi_match" : {
                      "type":       "best_fields",
                      "fields":     [
                        "product_id^100",
                        "product_bullet_point^3",
                        "product_color^2",
                        "product_brand^5",
                        "product_description",
                        "product_title^10"
                      ],
                      "operator":   "and",
                      "query":      query[1]
                    }
                },
                {
                  "neural": {
                    "title_embedding": {
                      "query_text": query[1],
                      "k": 100
                    }
                  }
                }
              ]
            }
          },
            "search_pipeline": {
              "request_processors": [
                {
                  "neural_query_enricher" : {
                    "description": "one of many search pipelines for experimentation",
                    "default_model_id": model_id,
                    "neural_field_default_id": {
                       "title_embeddings": model_id
                    }
                  }
                }
              ],
              "phase_results_processors": [
                {
                  "normalization-processor": {
                    "normalization": {
                      "technique": "l2"
                    },
                    "combination": {
                      "technique": "arithmetic_mean",
                      "parameters": {
                        "weights": [
                          keywordness,
                          neuralness
                        ]
                      }
                    }
                  }
                }
              ]    
            },
          "size": 100
        }
    
        response = requests.request("POST", url, headers=headers, data=json.dumps(payload)).json()
        #print(response)
        # store results per model and pipeline
        position = 0
        for hit in response['hits']['hits']:
            # create a new row for the DataFrame and append it
            row = { 'query_id' : str(query[0]), 'query_string': query[1], 'product_id' : hit["_id"], 'position' : str(position), 'relevance' : hit["_score"], 'run': model, 'neuralness': neuralness }
    
            new_row_df = pd.DataFrame([row])
            df_relevance = pd.concat([df_relevance, new_row_df], ignore_index=True)
            position += 1
    
    # work with two for loops:
    # 1) one to iterate over the list of queries and have a query id instead of a query
    # 2) another one to iterate over the result sets to have the position of the result in the result set 
    
    # DataFrame is created using some of TREC's namings for the evaluation tools:
    # query_id: the id of the query as the trec_eval tool needs a numeric id rather than a query string as an identifier
    # query_string: the user query
    # product_id: the id of the product in the hit list
    # position: the position of the product in the result set
    # relevance: relevance as given by the search engine
    # run: the name of the model used to predict the neuralness of the model
    # neuralness: the predicted neuralness of the query

0it [00:00, ?it/s]

In [175]:
df_relevance.head(3)

Unnamed: 0,query_id,query_string,product_id,position,relevance,run,neuralness
0,0,- we are not such things,0812994507,0,0.132738,linear_model,0.8
1,0,- we are not such things,B08VJM1568,1,0.090717,linear_model,0.8
2,0,- we are not such things,B07RWSH4BP,2,0.090635,linear_model,0.8


In [176]:
df_ratings.head(3)

Unnamed: 0,query,document,judgment
0,- we are not such things,1544341156,0
1,- we are not such things,1736413902,0
2,- we are not such things,1543971679,0


In [177]:
df_ratings.columns = ['query_string', 'product_id', 'rating']
df_ratings.head(3)

Unnamed: 0,query_string,product_id,rating
0,- we are not such things,1544341156,0
1,- we are not such things,1736413902,0
2,- we are not such things,1543971679,0


In [178]:
df_relevance.head(3)

Unnamed: 0,query_id,query_string,product_id,position,relevance,run,neuralness
0,0,- we are not such things,0812994507,0,0.132738,linear_model,0.8
1,0,- we are not such things,B08VJM1568,1,0.090717,linear_model,0.8
2,0,- we are not such things,B07RWSH4BP,2,0.090635,linear_model,0.8


In [179]:
# Make sure ids are strings, positions are integers - otherwise the merge operation or the metrics calculation might cause an error
df_relevance['query_id'] = df_relevance['query_id'].astype(str)
df_relevance['position'] = df_relevance['position'].astype(int)
# Remove duplicates from the ratings DataFrame
df_unique_ratings = df_ratings.drop_duplicates(subset=['product_id', 'query_string'])

In [180]:
# Merge results on query_string and product_id so that the resulting DataFrame has the ratings together with the search results
# Merge on query_string instead of query_id because the df_relevance query_ids are not the ones originating from the ratings.
# Validations helps us make sure that we have only one rating for each query-doc pair. We have identical query-doc pairs per
# search pipeline but we can only have one rating for these.

df_merged = df_relevance.merge(df_unique_ratings, on=['query_string', 'product_id'], how='left', validate='many_to_one')
# remove unnecessary information and rename columns

df_merged.head(3)

Unnamed: 0,query_id,query_string,product_id,position,relevance,run,neuralness,rating
0,0,- we are not such things,0812994507,0,0.132738,linear_model,0.8,3.0
1,0,- we are not such things,B08VJM1568,1,0.090717,linear_model,0.8,
2,0,- we are not such things,B07RWSH4BP,2,0.090635,linear_model,0.8,


In [181]:
# Count the rows without ratings - the higher the count is the less reliable the results will be
nan_count_rating = df_merged['rating'].isna().sum()
print(f"There are {df_merged.shape[0]} rows and {nan_count_rating} do not contain a rating among the 100 returned results per query")
nan_count_rating_top_10 = df_merged[df_merged['position'] < 10]['rating'].isna().sum()
print(f"{nan_count_rating_top_10} do not contain a rating among the top 10 returned results per query")

There are 200000 rows and 184147 do not contain a rating among the 100 returned results per query
13495 do not contain a rating among the top 10 returned results per query


In [182]:
df_merged[df_merged['position'] < 10].head(10)

Unnamed: 0,query_id,query_string,product_id,position,relevance,run,neuralness,rating
0,0,- we are not such things,0812994507,0,0.132738,linear_model,0.8,3.0
1,0,- we are not such things,B08VJM1568,1,0.090717,linear_model,0.8,
2,0,- we are not such things,B07RWSH4BP,2,0.090635,linear_model,0.8,
3,0,- we are not such things,B08WRDMYTH,3,0.087807,linear_model,0.8,2.0
4,0,- we are not such things,1937578313,4,0.085002,linear_model,0.8,
5,0,- we are not such things,B078JNBVFY,5,0.085002,linear_model,0.8,
6,0,- we are not such things,035813143X,6,0.083874,linear_model,0.8,
7,0,- we are not such things,B07S18K2XX,7,0.083334,linear_model,0.8,
8,0,- we are not such things,1542044286,8,0.082844,linear_model,0.8,0.0
9,0,- we are not such things,B01N76REDM,9,0.082761,linear_model,0.8,


In [183]:
# import from shared utils file metrics.py
from utils import metrics

metrics = [
    ("dcg", metrics.dcg_at_10, None),
    ("ndcg", metrics.ndcg_at_10, None),
    ("prec@10", metrics.precision_at_k, None),
    ("ratio_of_ratings", metrics.ratio_of_ratings, None)
]

## Calculate Metrics per Query and Model

In [184]:
reference = {query: df for query, df in df_ratings.groupby("query_string")}

df_metrics = []
for m_name, m_function, ref_search in metrics:
    for (query_string, run), df_gr in df_merged.groupby(["query_string", "run"]):
        metric = m_function(df_gr, reference=reference[query_string])
        df_metrics.append(pd.DataFrame({
            "query": [query_string],
            "model": [run],
            "metric": [m_name],
            "value": [metric],
        }))
df_metrics = pd.concat(df_metrics)

In [185]:
df_metrics.head(3)

Unnamed: 0,query,model,metric,value
0,- we are not such things,linear_model,dcg,5.011868
0,- we are not such things,random_forest,dcg,5.011868
0,02cool spray water bottle for drinking not fla...,linear_model,dcg,5.892617


In [186]:
df_merged[(df_merged['query_string'] == '- we are not such things') & (df_merged['run'] == 'linear_model')].head(10)

Unnamed: 0,query_id,query_string,product_id,position,relevance,run,neuralness,rating
0,0,- we are not such things,0812994507,0,0.132738,linear_model,0.8,3.0
1,0,- we are not such things,B08VJM1568,1,0.090717,linear_model,0.8,
2,0,- we are not such things,B07RWSH4BP,2,0.090635,linear_model,0.8,
3,0,- we are not such things,B08WRDMYTH,3,0.087807,linear_model,0.8,2.0
4,0,- we are not such things,1937578313,4,0.085002,linear_model,0.8,
5,0,- we are not such things,B078JNBVFY,5,0.085002,linear_model,0.8,
6,0,- we are not such things,035813143X,6,0.083874,linear_model,0.8,
7,0,- we are not such things,B07S18K2XX,7,0.083334,linear_model,0.8,
8,0,- we are not such things,1542044286,8,0.082844,linear_model,0.8,0.0
9,0,- we are not such things,B01N76REDM,9,0.082761,linear_model,0.8,


## Calculate Metrics per Model by Averaging the Query Metrics

In [187]:
df_metrics_per_pipeline = df_metrics.pivot_table(index="model", columns="metric", values="value", aggfunc=lambda x: x.mean().round(2))
df_metrics_per_pipeline = df_metrics_per_pipeline.reset_index()
df_metrics_per_pipeline

metric,model,dcg,ndcg,prec@10,ratio_of_ratings
0,linear_model,6.03,0.27,0.31,0.33
1,random_forest,6.02,0.27,0.31,0.33


In [188]:
df_global_optimization_metrics = pd.read_csv('../data/metrics_5000_queries-2024-11-04.csv')
df_global_optimization_metrics.head(3)

Unnamed: 0,query,pipeline,metric,value
0,#8 tags without string,l2arithmetic_mean0.0,dcg,2.413179
1,#8 tags without string,l2arithmetic_mean0.1,dcg,2.413179
2,#8 tags without string,l2arithmetic_mean0.2,dcg,2.413179


In [189]:
df_metrics_pivot = df_metrics.pivot_table(index="query", columns=["model", "metric"], values="value").reset_index()
df_metrics_pivot.columns = [" ".join(c).strip() for c in df_metrics_pivot.columns.values]
df_metrics_pivot.head(3)

Unnamed: 0,query,linear_model dcg,linear_model ndcg,linear_model prec@10,linear_model ratio_of_ratings,random_forest dcg,random_forest ndcg,random_forest prec@10,random_forest ratio_of_ratings
0,- we are not such things,5.011868,0.526941,0.2,0.3,5.011868,0.526941,0.2,0.3
1,02cool spray water bottle for drinking not fla...,5.892617,0.159404,0.4,0.4,5.892617,0.159404,0.4,0.4
2,0pi nail polish i'm not really a waitress,4.0,0.136227,0.1,0.1,4.0,0.136227,0.1,0.1


## Get the max metric values to compare the per-query optimization approach to

In [190]:
metrics = ['dcg', 'ndcg', 'prec@10']
print("Max metrics for the query \'$30 roblox gift card not digital\'")
for metric in metrics:
    max_value = df_global_optimization_metrics[(df_global_optimization_metrics['query'] == '$30 roblox gift card not digital') & (df_global_optimization_metrics['metric'] == metric)]['value'].max()
    print(f"max for {metric}: {max_value}")

Max metrics for the query '$30 roblox gift card not digital'
max for dcg: 0.9175248681136912
max for ndcg: 0.0822926727947403
max for prec@10: 0.1


In [191]:
df_global_optimization_metrics_gr = df_global_optimization_metrics.groupby(by=['query', 'metric']).agg({
    'value': 'max'
}).reset_index()
df_global_optimization_metrics_gr[df_global_optimization_metrics_gr['query'] == '$30 roblox gift card not digital']

Unnamed: 0,query,metric,value
8,$30 roblox gift card not digital,dcg,0.917525
9,$30 roblox gift card not digital,ndcg,0.082293
10,$30 roblox gift card not digital,prec@10,0.1
11,$30 roblox gift card not digital,ratio_of_ratings,0.2


In [192]:
df_global_optimization_metrics_pivot = df_global_optimization_metrics_gr.pivot_table(index="query", columns=["metric"], values="value").reset_index()
df_global_optimization_metrics_pivot.columns = ['query', 'dcg', 'ndcg', 'prec@10', 'ratio_of_ratings']
df_global_optimization_metrics_pivot

Unnamed: 0,query,dcg,ndcg,prec@10,ratio_of_ratings
0,#8 tags without string,14.721187,0.481147,0.9,1.0
1,$1 dollar toys not fidgets,0.360093,0.021742,0.0,0.2
2,$30 roblox gift card not digital,0.917525,0.082293,0.1,0.2
3,$60 ps4 that’s not gonna be on amazon,0.000000,0.000000,0.0,0.0
4,'m team jesus i'm not religious shirt,18.174237,0.409661,1.0,1.0
...,...,...,...,...,...
4995,zoom eyepiece for telescope,15.130128,0.812674,0.8,0.8
4996,zumba shoes,8.523719,0.363722,0.3,0.3
4997,zwave front door lock kwikset,5.857052,0.247237,0.3,0.3
4998,zyrtec,9.822582,0.461173,0.6,0.6


In [193]:
df_metrics_merged = df_metrics_pivot.merge(df_global_optimization_metrics_pivot, on='query', how='left')

In [194]:
df_metrics_merged.head(3)

Unnamed: 0,query,linear_model dcg,linear_model ndcg,linear_model prec@10,linear_model ratio_of_ratings,random_forest dcg,random_forest ndcg,random_forest prec@10,random_forest ratio_of_ratings,dcg,ndcg,prec@10,ratio_of_ratings
0,- we are not such things,5.011868,0.526941,0.2,0.3,5.011868,0.526941,0.2,0.3,5.011868,0.526941,0.2,0.3
1,02cool spray water bottle for drinking not fla...,5.892617,0.159404,0.4,0.4,5.892617,0.159404,0.4,0.4,6.351655,0.171822,0.4,0.4
2,0pi nail polish i'm not really a waitress,4.0,0.136227,0.1,0.1,4.0,0.136227,0.1,0.1,4.0,0.136227,0.1,0.1


## Visiualization

In [195]:
px.scatter(
    df_metrics_merged,
    x="linear_model dcg",
    y="random_forest dcg",
    hover_data=df_metrics_merged.columns,
)

In [196]:
px.scatter(
    df_metrics_merged,
    x="linear_model ndcg",
    y="random_forest ndcg",
    hover_data=df_metrics_merged.columns,
)

In [197]:
px.scatter(
    df_metrics_merged,
    x="linear_model dcg",
    y="dcg",
    hover_data=df_metrics_merged.columns,
)

In [198]:
px.scatter(
    df_metrics_merged,
    x="linear_model ndcg",
    y="ndcg",
    hover_data=df_metrics_merged.columns,
)

In [199]:
df_metrics_merged['dcg_difference'] = df_metrics_merged['random_forest dcg'] - df_metrics_merged['linear_model dcg']

### Queries where the Linear model scores better:

In [200]:
df_metrics_merged[['query', 'dcg_difference','random_forest dcg','linear_model dcg']].sort_values(by='dcg_difference').head(25)

Unnamed: 0,query,dcg_difference,random_forest dcg,linear_model dcg
781,seanan mcguire,-16.451531,0.0,16.451531
437,how does a dinosaur,-7.147535,1.156259,8.303794
706,purity products,-6.380401,11.793836,18.174237
612,nc tshirts,-5.785579,7.914421,13.7
569,mens hoodies buffalo,-4.305573,0.0,4.305573
649,oval tablecloth,-3.908505,1.424829,5.333333
819,snacks,-2.770488,1.547411,4.317899
60,a. c. bhaktivedanta swami prabhupada,-2.758162,11.793836,14.551999
873,tinting wrap tools,-2.680401,2.523719,5.20412
179,boys queen size sheets,-2.581088,6.0,8.581088


### Queries where the Random Forest model scores better

In [201]:
df_metrics_merged[['query', 'dcg_difference','random_forest dcg','linear_model dcg']].sort_values(by='dcg_difference', ascending=False).head(25)

Unnamed: 0,query,dcg_difference,random_forest dcg,linear_model dcg
669,pentair pentek big blue,6.970117,12.970117,6.0
749,robert plant,4.113858,14.494259,10.380401
429,hiboy,3.751452,8.275171,4.523719
719,real rose petals,3.426608,12.829292,9.402685
930,wallniture,3.147535,3.147535,0.0
991,yamaha psr ew410,2.758162,13.259969,10.501807
701,ps4 open world games,2.523719,2.523719,0.0
673,petite dressy spandex pants,2.523719,2.523719,0.0
980,wooden suit hangers for men,2.482962,14.154216,11.671254
291,dessa,2.465979,15.684645,13.218665


In [202]:
px.bar(df_metrics_merged, x="dcg_difference", y="query", orientation='h',
             hover_data=df_metrics_merged.columns,
             height=1000,
             title='DCG Difference Linear Model and Random Forest Model')

## Metrics per Pipeline from Hybrid Optimizer for Test Queries

We calculated the metrics aggregated over the test queries (excluding training queries) and compared them with the metrics of all queries that ran through the hybrid search optimizer above.

Let's do the comparison for the results of the test queries only.

In [203]:
df_hso_results = pd.read_csv('../data/results_and_ratings_5000-2024-11-04.csv', index_col=[0])

In [204]:
df_hso_results.head(3)

Unnamed: 0_level_0,query_string,product_id,position,relevance,run,rating
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,runtz,B00K5EJHGO,0,1.0,min_maxarithmetic_mean0.0,
0,runtz,B00YHZVOCW,1,0.345967,min_maxarithmetic_mean0.0,
0,runtz,0316005282,2,0.337701,min_maxarithmetic_mean0.0,


In [205]:
df_test_data.head(3)

Unnamed: 0,query_string,ndcg,neuralness,f_1_num_of_terms,f_2_query_length,f_3_has_numbers,f_4_has_special_char,f_5_num_results,f_6_max_title_score,f_7_sum_title_scores,f_8_max_semantic_score,f_9_avg_semantic_score,linear_model,random_forest
0,- we are not such things,0.526941,0.7,6,24,0,1,613,3.378685,5.338386,0.511999,0.482098,0.795221,0.725082
1,02cool spray water bottle for drinking not fla...,0.171822,0.0,10,56,1,0,0,0.0,0.0,0.53525,0.528126,0.85203,0.84808
2,0pi nail polish i'm not really a waitress,0.136227,1.0,8,41,1,1,0,0.0,0.0,0.669153,0.605961,0.798135,0.773145


In [206]:
df_hso_results_fitlered = df_hso_results[df_hso_results['query_string'].isin(df_test_data['query_string'])]

In [207]:
df_hso_results_fitlered

Unnamed: 0_level_0,query_string,product_id,position,relevance,run,rating
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,trooper bandana shoe,B07CB21RQK,0,1.000000,min_maxarithmetic_mean0.0,3.0
1,trooper bandana shoe,B07B6HKMJ2,1,0.568941,min_maxarithmetic_mean0.0,3.0
1,trooper bandana shoe,B072C158SM,2,0.477308,min_maxarithmetic_mean0.0,3.0
1,trooper bandana shoe,B071P5JM1M,3,0.436193,min_maxarithmetic_mean0.0,3.0
1,trooper bandana shoe,B0711VWF3J,4,0.378785,min_maxarithmetic_mean0.0,3.0
...,...,...,...,...,...,...
4996,stanley thermos,B07L6LL95P,5,0.300744,l2geometric_mean1.0,3.0
4996,stanley thermos,B07B1LMVWN,6,0.300744,l2geometric_mean1.0,
4996,stanley thermos,B07L6N2641,7,0.300744,l2geometric_mean1.0,
4996,stanley thermos,B07L6C6Y21,8,0.295226,l2geometric_mean1.0,3.0


In [208]:
# import from shared utils file metrics.py
from utils import metrics

metrics = [
    ("dcg", metrics.dcg_at_10, None),
    ("ndcg", metrics.ndcg_at_10, None),
    ("prec@10", metrics.precision_at_k, None),
    ("ratio_of_ratings", metrics.ratio_of_ratings, None)
]

In [209]:
df_metrics_hso_results_filtered = []
for m_name, m_function, ref_search in metrics:
    for (query_string, run), df_gr in df_hso_results_fitlered.groupby(["query_string", "run"]):
        metric = m_function(df_gr, reference=reference[query_string])
        df_metrics_hso_results_filtered.append(pd.DataFrame({
            "query": [query_string],
            "pipeline": [run],
            "metric": [m_name],
            "value": [metric],
        }))
df_metrics_hso_results_filtered = pd.concat(df_metrics_hso_results_filtered)

In [210]:
df_metrics_hso_results_filtered.head(3)

Unnamed: 0,query,pipeline,metric,value
0,- we are not such things,l2arithmetic_mean0.0,dcg,1.302265
0,- we are not such things,l2arithmetic_mean0.1,dcg,1.302265
0,- we are not such things,l2arithmetic_mean0.2,dcg,1.302265


In [211]:
df_metrics_per_pipeline_hso = df_metrics_hso_results_filtered.pivot_table(index="pipeline", columns="metric", values="value", aggfunc=lambda x: x.mean().round(2))
df_metrics_per_pipeline_hso = df_metrics_per_pipeline_hso.reset_index()
df_metrics_per_pipeline_hso.head(4)

metric,pipeline,dcg,ndcg,prec@10,ratio_of_ratings
0,l2arithmetic_mean0.0,4.51,0.21,0.23,0.24
1,l2arithmetic_mean0.1,4.71,0.22,0.23,0.24
2,l2arithmetic_mean0.2,4.73,0.22,0.23,0.24
3,l2arithmetic_mean0.3,4.78,0.22,0.23,0.24


In [212]:
df_metrics_per_pipeline_hso.sort_values(by='ndcg', ascending=False).head(1)

metric,pipeline,dcg,ndcg,prec@10,ratio_of_ratings
6,l2arithmetic_mean0.6,5.96,0.27,0.3,0.32


In [213]:
df_metrics_per_pipeline_hso.sort_values(by='dcg', ascending=False).head(1)

metric,pipeline,dcg,ndcg,prec@10,ratio_of_ratings
6,l2arithmetic_mean0.6,5.96,0.27,0.3,0.32


In [214]:
df_metrics_per_pipeline_hso.sort_values(by='prec@10', ascending=False).head(1)

metric,pipeline,dcg,ndcg,prec@10,ratio_of_ratings
65,min_maxharmonic_mean1.0,5.79,0.26,0.3,0.32


In [215]:
df_metrics_per_pipeline

metric,model,dcg,ndcg,prec@10,ratio_of_ratings
0,linear_model,6.03,0.27,0.31,0.33
1,random_forest,6.02,0.27,0.31,0.33
