In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split


In [2]:

# Load dataset
data  = pd.read_csv('query_data.csv')
data

Unnamed: 0,Query,Batt,Dist,Wpts,Emot,Comf,Traf,Score,Relev
0,1,0.36,0.6667,0.5500,0.500,0.0000,0.40,0.828808,2.0
1,1,0.41,0.5000,0.5500,0.750,0.0000,0.40,1.037661,1.0
2,1,0.44,1.0000,0.5000,0.875,0.5000,0.40,0.249895,3.0
3,1,0.65,1.0000,0.5167,0.750,0.4167,0.40,-0.338252,5.0
4,1,0.71,0.7143,0.6000,0.800,0.1000,0.40,0.111925,4.0
...,...,...,...,...,...,...,...,...,...
49995,10000,0.46,0.5000,0.7500,1.000,0.2500,0.42,1.357789,1.0
49996,10000,0.47,1.0000,0.1500,0.750,0.0000,0.42,-1.086373,3.0
49997,10000,0.20,0.5000,0.1000,1.000,1.0000,0.42,-2.175459,5.0
49998,10000,0.65,0.5000,0.3667,0.500,0.1667,0.42,-1.309166,4.0


In [3]:

# Assuming your dataset is in a pandas DataFrame named `data`
features = ['Batt', 'Dist', 'Wpts', 'Emot', 'Comf', 'Traf']
x = data[features]
y = data['Relev']
group_ids = data.groupby('Query').size().values  # Get the number of paths per query
# Assuming your data is in a pandas DataFrame named `data`
# Group the dataset by Query to get unique queries
queries = data['Query'].unique()

# Validate that group sizes sum to the total number of samples
assert group_ids.sum() == data.shape[0], (
    f"Mismatch in group sizes: sum(group_ids) = {group_ids.sum()}, "
    f"but total samples = {data.shape[0]}"
)

In [4]:

# Split the queries into training and validation sets
train_queries, val_queries = train_test_split(queries, test_size=0.2, random_state=42)

# Create masks for training and validation sets
train_mask = data['Query'].isin(train_queries)
val_mask = data['Query'].isin(val_queries)

# Split the data into training and validation sets
x_train = data[train_mask][features]
y_train = data[train_mask]['Relev']
x_val = data[val_mask][features]
y_val = data[val_mask]['Relev']

# Get the group sizes for training and validation
train_group_sizes = data[train_mask].groupby('Query').size().values
val_group_sizes = data[val_mask].groupby('Query').size().values

# Validate training set group sizes
assert train_group_sizes.sum() == x_train.shape[0], (
    f"Mismatch in train group sizes: sum(train_group_sizes) = {train_group_sizes.sum()}, "
    f"but total training samples = {x_train.shape[0]}"
)

# Validate validation set group sizes
assert val_group_sizes.sum() == x_val.shape[0], (
    f"Mismatch in val group sizes: sum(val_group_sizes) = {val_group_sizes.sum()}, "
    f"but total validation samples = {x_val.shape[0]}"
)


In [5]:

# Create LightGBM datasets for training and validation
train_data = lgb.Dataset(x_train, label=y_train, group=train_group_sizes)
val_data = lgb.Dataset(x_val, label=y_val, group=val_group_sizes)

params = {
    'objective': 'lambdarank',  # Objective for ranking
    'metric': 'ndcg',  # NDCG for ranking tasks
    'ndcg_eval_at': [1, 3, 5],  # Evaluate NDCG at different levels
}

# Train the LightGBM model
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, val_data],
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 434
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 6


In [6]:
y_pred = model.predict(x_val)  # Predicted relevance scores
len(y_pred)


10000