In [1]:
import pandas as pd

## Ingestion

In [2]:
df = pd.read_csv('../data/game-dataset.csv')

In [3]:
documents = df.to_dict(orient='records')

In [4]:
df.columns

Index(['gameId', 'gameName', 'alternateNames', 'subcategory', 'level',
       'description', 'playersMax', 'ageRange', 'duration', 'equipmentNeeded',
       'objective', 'skillsDeveloped', 'setupTime', 'place',
       'physicalIntensityLevel', 'educationalBenefits', 'category'],
      dtype='object')

In [5]:
import minsearch

# Initialize the Index
index = minsearch.Index(    
    text_fields=[
        'gameName', 'subcategory', 'level', 'description', 'ageRange',
        'duration', 'objective', 'skillsDeveloped', 
        'place', 'physicalIntensityLevel', 'educationalBenefits', 'category','playersMax',
    ],
    keyword_fields=['gameId', 'setupTime', 'equipmentNeeded', 'alternateNames']


)

documents = df.to_dict(orient='records')

# Fit the index with documents
index.fit(documents)

<minsearch.Index at 0x72d9300dcc90>

## Retrieval evaluation

In [6]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [7]:
df_question.head()

Unnamed: 0,q_id,question
0,1,What is the primary objective of playing socce...
1,1,How many players are allowed on the field from...
2,1,What age group is suitable for participating i...
3,1,What equipment do I need to play soccer safely...
4,1,How long does a typical soccer match last?


In [8]:
ground_truth = df_question.to_dict(orient='records')

In [9]:
ground_truth[0]

{'q_id': 1,
 'question': 'What is the primary objective of playing soccer during a match?'}

In [10]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [11]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [13]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['q_id']
        results = search_function(q)
        relevance = [d['gameId'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'MRR': mrr(relevance_total),
    }

In [14]:
from tqdm.auto import tqdm

In [15]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2950 [00:00<?, ?it/s]

{'hit_rate': 0.551864406779661, 'MRR': 0.28608797417272047}

## Finding the best parameters

In [16]:
#validation first 100 -tune parameters on the first 100, dont overfit 
df_validation = df_question[:100]
#test-evaluation  on the rest of data 
df_test = df_question[100:]
#since we already did the evalution on the entire data set , we can also use the entire data here.

In [17]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [18]:
gt_val = df_validation.to_dict(orient='records')

In [19]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [22]:
param_ranges = {

    'gameName': (0.0, 3.0),
    'alternateNames' : (0.0, 3.0),
    'subcategory': (0.0, 3.0),
    'level': (0.0, 3.0),
    'description': (0.0, 3.0),
    'playersMax': (0.0, 3.0),
    'ageRange': (0.0, 3.0),
    'duration': (0.0, 3.0),
    'equipmentNeeded': (0.0, 3.0),
    'objective':(0.0, 3.0),
    'skillsDeveloped': (0.0, 3.0),
    'setupTime': (0.0, 3.0),
    'place': (0.0, 3.0),
    'physicalIntensityLevel': (0.0, 3.0),
    'educationalBenefits': (0.0, 3.0),
    'category': (0.0, 3.0),

}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['MRR']

In [23]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'gameName': 2.7564605165743394,
  'alternateNames': 0.8418542945254401,
  'subcategory': 0.6900244685562115,
  'level': 2.832517101754472,
  'description': 1.4646856782009183,
  'playersMax': 0.3230938219561549,
  'ageRange': 0.7535562632275489,
  'duration': 1.2133087232612332,
  'equipmentNeeded': 1.1704314301848018,
  'objective': 1.322740696245261,
  'skillsDeveloped': 0.5529655154258565,
  'setupTime': 0.6747295458557873,
  'place': 0.7064377541005453,
  'physicalIntensityLevel': 1.495127093675108,
  'educationalBenefits': 1.0694526946055358,
  'category': 0.9127114822517018},
 0.8148333333333333)

In [24]:
def minsearch_improved(query):
    boost = {
        'gameName': 2.756,
        'alternateNames': 0.842,
        'subcategory': 0.69,
        'level': 2.833,
        'description': 1.465,
        'playersMax': 0.323,
        'ageRange': 0.734,
        'duration': 1.213,
        'equipmentNeeded': 1.17,
        'objective': 1.323,
        'skillsDeveloped': 0.553,
        'setupTime': 0.675,
        'place': 0.706,
        'physicalIntensityLevel': 1.495,
        'educationalBenefits': 1.069,
        'category': 0.913
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/2950 [00:00<?, ?it/s]

{'hit_rate': 0.8145762711864407, 'MRR': 0.5879607210115676}