In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy.stats import rankdata
from sklearn.metrics import ndcg_score
from sklearn.metrics import mean_squared_error

In [2]:

# Load your dataset
data = pd.read_csv("query_data.csv")
data

Unnamed: 0,Query,Batt,Dist,Wpts,Emot,Comf,Traf,Score,Relev
0,1,0.45,0.5000,0.3000,0.500,0.00,0.65,-0.571159,3.0
1,1,0.61,0.5000,0.1667,1.000,0.50,0.65,-1.107488,5.0
2,1,0.91,0.5556,0.5200,0.600,0.20,0.65,-0.276850,2.0
3,1,0.83,0.5000,0.4000,0.875,0.25,0.65,0.043321,1.0
4,1,0.46,0.5000,0.0500,1.000,0.50,0.65,-1.050039,4.0
...,...,...,...,...,...,...,...,...,...
499995,100000,0.41,0.5000,1.0000,0.750,0.50,0.93,1.569033,2.0
499996,100000,0.52,0.4000,0.2500,1.000,0.25,0.93,-0.069465,3.0
499997,100000,0.39,0.6667,0.0500,1.000,0.25,0.93,-0.464240,4.0
499998,100000,0.34,0.6667,0.7500,1.000,0.25,0.93,1.788080,1.0


In [3]:

# Extract features and target
X = data[["Batt", "Dist", "Wpts", "Emot", "Comf", "Traf"]]
y = data["Relev"]
group = data.groupby("Query").size().to_list()  # Number of rows per query

In [4]:

# Get unique queries
unique_queries = data['Query'].unique()

# Split queries into train and test sets
train_queries, test_queries = train_test_split(unique_queries, test_size=0.2, random_state=42)

# Create training and testing data based on the query splits
train_mask = data['Query'].isin(train_queries)
test_mask = data['Query'].isin(test_queries)

X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]

# Compute groups for train and test sets
group_train = data[train_mask].groupby('Query').size().to_list()
group_test = data[test_mask].groupby('Query').size().to_list()

In [5]:
X_test[0:5]

Unnamed: 0,Batt,Dist,Wpts,Emot,Comf,Traf
40,0.87,0.5,0.625,1.0,0.125,0.58
41,0.43,0.75,0.2,0.8333,0.1667,0.58
42,0.41,0.5,0.5,0.75,0.25,0.58
43,0.25,0.5,0.5,1.0,0.0,0.58
44,0.25,0.5,0.5,0.5,0.5,0.58


In [6]:

# Prepare the training data
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)  # Set group sizes for training

# Prepare the test data
dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(group_test)  # Set group sizes for testing

In [7]:
params = {
    "objective": "rank:pairwise",  # Use pairwise ranking objective
    "learning_rate": 0.1,
    "max_depth": 6,
    "eval_metric": "ndcg",  # Evaluation metric
    "tree_method": "auto",
    "random_state": 42,
}

In [8]:
# Train the model
rank_model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-ndcg:0.96361
[1]	Test-ndcg:0.97205
[2]	Test-ndcg:0.97867
[3]	Test-ndcg:0.98210
[4]	Test-ndcg:0.98357
[5]	Test-ndcg:0.98398
[6]	Test-ndcg:0.98528
[7]	Test-ndcg:0.98634
[8]	Test-ndcg:0.98638
[9]	Test-ndcg:0.98720
[10]	Test-ndcg:0.98724
[11]	Test-ndcg:0.98748
[12]	Test-ndcg:0.98795
[13]	Test-ndcg:0.98828
[14]	Test-ndcg:0.98875
[15]	Test-ndcg:0.98875
[16]	Test-ndcg:0.98899
[17]	Test-ndcg:0.98937
[18]	Test-ndcg:0.98937
[19]	Test-ndcg:0.98973
[20]	Test-ndcg:0.99000
[21]	Test-ndcg:0.98995
[22]	Test-ndcg:0.99051
[23]	Test-ndcg:0.99054
[24]	Test-ndcg:0.99058
[25]	Test-ndcg:0.99062
[26]	Test-ndcg:0.99100
[27]	Test-ndcg:0.99131
[28]	Test-ndcg:0.99147
[29]	Test-ndcg:0.99156
[30]	Test-ndcg:0.99170
[31]	Test-ndcg:0.99187
[32]	Test-ndcg:0.99196
[33]	Test-ndcg:0.99209
[34]	Test-ndcg:0.99221
[35]	Test-ndcg:0.99236
[36]	Test-ndcg:0.99234
[37]	Test-ndcg:0.99243
[38]	Test-ndcg:0.99250
[39]	Test-ndcg:0.99257
[40]	Test-ndcg:0.99257
[41]	Test-ndcg:0.99247
[42]	Test-ndcg:0.99271
[43]	Test-ndcg:0.9928

In [9]:
# Predict scores
y_pred = rank_model.predict(dtest)
len(y_pred)

100000

In [10]:

# Assuming 5 items per query, rank them
query_results = y_pred[:5]  # Adjust as per group
ranked_positions = rankdata(-query_results, method="ordinal")  # Rank in descending order

print("Predicted Ranks:", ranked_positions)

Predicted Ranks: [2 1 4 5 3]


In [11]:
rank_model.save_model("xgboost_rank_model.json")

In [27]:

# Load the test dataset (assuming it's a CSV file)
test_data = pd.read_csv('test_data.csv')

# Prepare the features and labels for evaluation
X_test = test_data.drop(columns=['Query','Score','Relev'])
y_test = test_data['Relev']  # The 'Relev' column is the true relevance score

In [28]:
# Convert the test data into DMatrix format
dtest = xgb.DMatrix(X_test, label=y_test)

In [29]:
# Predict relevance scores for the test data
y_pred = rank_model.predict(dtest)

# Calculate NDCG (assuming relevance is given by 'y_test' and predicted relevance by 'y_pred')
ndcg_score_result = ndcg_score([y_test], [y_pred])
print(f'NDCG score: {ndcg_score_result}')

# Optionally, you can compute Mean Squared Error (MSE) as a baseline evaluation
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

NDCG score: 0.9941678989091123
Mean Squared Error: 10.218516117531914


In [30]:
actual_rank = y_test[:5]
print(actual_rank)


0    2.0
1    4.0
2    5.0
3    3.0
4    1.0
Name: Relev, dtype: float64


In [31]:
actual_rank = rankdata(y_test[:5], method="ordinal")
print(actual_rank)

[2 4 5 3 1]


In [None]:

# # Assuming y_pred and y_test are your predicted and true relevance values
# # Get the number of query groups
# num_groups = len(y_pred) // 5  # Total number of 5-item query groups

# # Loop through the groups and print the predicted ranks vs true ranks
# for i in range(num_groups):
#     start = i * 5
#     end = (i + 1) * 5
    
#     # Get predicted scores for the current query group
#     query_results = y_pred[start:end]  # Adjust as per group
#     ranked_positions = rankdata(query_results, method="ordinal")  # Rank in descending order
    
#     # Get true relevance scores for the current query group
#     actual_relevance = y_test[start:end]
#     actual_rank = rankdata(actual_relevance, method="ordinal")
    
#     print(f"Group {i + 1}:")
#     print("Predicted Ranks:", ranked_positions)
#     print("True Ranks:", actual_rank)
#     print()


In [32]:
group_num = 5

start = (group_num-1) * 5
end   = group_num * 5

# Get predicted scores for the current query group
query_results = y_pred[start:end]  # Adjust as per group
ranked_positions = rankdata(query_results, method="ordinal")  # Rank in descending order

# Get true relevance scores for the current query group
actual_relevance = y_test[start:end]
actual_rank = rankdata(actual_relevance, method="ordinal")

print(f"Group {group_num}:")
print("Predicted Ranks:", ranked_positions)
print("True Ranks:", actual_rank)
print()

Group 5:
Predicted Ranks: [5 4 3 1 2]
True Ranks: [5 4 3 1 2]



In [33]:

# Get the number of query groups
num_groups = len(y_pred) // 5  # Total number of 5-item query groups

correct_count = 0  # Counter to track correct rankings
total_count = 0  # Total number of groups

# Loop through the groups and compare the predicted ranks vs true ranks
for i in range(num_groups):
    start = i * 5
    end = (i + 1) * 5
    
    # Get predicted scores for the current query group
    query_results = y_pred[start:end]  # Adjust as per group
    ranked_positions = rankdata(query_results, method="ordinal")  # Rank in descending order
    
    # Get true relevance scores for the current query group
    actual_relevance = y_test[start:end]
    actual_rank = rankdata(actual_relevance, method="ordinal")
    
    # Compare predicted and actual ranks
    if (ranked_positions == actual_rank).all():  # Check if ranks are exactly the same
        correct_count += 1
    
    total_count += 1

# Calculate overall accuracy
accuracy = correct_count / total_count
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8925
