Load Datasets

In [1]:
import pandas as pd

# Loading data
query_product_df = pd.read_csv('/content/drive/MyDrive/digikala_ds_task_query_product.csv')
task_products_df = pd.read_csv('/content/drive/MyDrive/digikala_ds_task_products.csv')

Data Preprocessing


In [2]:
from sklearn.impute import SimpleImputer

# Handling missing values
print("Before Cleaning")
print(query_product_df.isnull().sum())
print(task_products_df.isnull().sum())

# Impute missing values for numerical features
imputer = SimpleImputer(strategy='mean')
query_product_df['price'] = imputer.fit_transform(query_product_df[['price']])
query_product_df['discount'] = imputer.fit_transform(query_product_df[['discount']])
query_product_df['relevancy_score_2'] = imputer.fit_transform(query_product_df[['relevancy_score_2']])


# Impute missing values for categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
query_product_df['category_id'] = categorical_imputer.fit_transform(query_product_df[['category_id']])
query_product_df['brand_id'] = categorical_imputer.fit_transform(query_product_df[['brand_id']])

task_products_df.fillna(0, inplace=True)

# Check for duplicates
print(query_product_df.duplicated(subset=['q_id', 'd_id']).sum())
# Remove duplicates if any
query_product_df.drop_duplicates(subset=['q_id', 'd_id'], inplace=True)

print("After Cleaning")
print(query_product_df.isnull().sum())
print(task_products_df.isnull().sum())

Before Cleaning
q_id                      0
d_id                      0
relevancy_score_1         0
relevancy_score_2     54770
category_id               6
brand_id                  6
price                167501
discount                  6
target_score              0
dtype: int64
date            0
d_id            0
search_view     0
search_click    0
search_sales    0
dtype: int64
0
After Cleaning
q_id                 0
d_id                 0
relevancy_score_1    0
relevancy_score_2    0
category_id          0
brand_id             0
price                0
discount             0
target_score         0
dtype: int64
date            0
d_id            0
search_view     0
search_click    0
search_sales    0
dtype: int64


Handling outliers

In [3]:
# Handling outliers using IQR method for price
import numpy as np

Q1 = query_product_df['price'].quantile(0.25)
Q3 = query_product_df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
query_product_df['price'] = np.clip(query_product_df['price'], lower_bound, upper_bound)


Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler

# Normalizing numerical features
scaler = StandardScaler()
query_product_df[['price', 'discount']] = scaler.fit_transform(query_product_df[['price', 'discount']])


Merging Datasets

In [5]:
# Merging datasets
df = pd.merge(query_product_df, task_products_df, on='d_id', how='left')
df.fillna(0, inplace=True)


Feature Engineering

In [6]:
# Feature Engineering
df['click_through_rate'] = df['search_click'] / df['search_view']
df['conversion_rate'] = df['search_sales'] / df['search_click']
df['effective_price'] = df['price'] * (1 - df['discount'])

# Handling division by zero
df.fillna(0, inplace=True)

# Date conversion
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['day_of_week'] = df['date'].dt.dayofweek
df['days_before_event'] = (pd.to_datetime('2023-12-23') - df['date']).dt.days
df = df.drop(columns=['date'])


In [12]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308201 sha256=9ac0d42270985200dcd47d77262aab322f90ae4c233633ee3802d178e9588506
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


Data Splitting

In [7]:
from sklearn.model_selection import GroupShuffleSplit

X = df.drop(columns=['target_score'])
y = df['target_score']
groups = df['q_id']

# Split the data into training and test sets
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train, groups_test = groups.iloc[train_idx], groups.iloc[test_idx]

Data Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

# Replace inf values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.replace([np.inf, -np.inf], np.nan, inplace=True)


Model Building

In [9]:
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score
import numpy as np

# Calculate the group size
groups_train_size = groups_train.value_counts().sort_index().values

# Define the model
model = XGBRanker(objective='rank:pairwise', random_state=42)

# Train the model
model.fit(X_train_scaled, y_train, group=groups_train_size)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate using sklearn's NDCG
sklearn_ndcg = ndcg_score([y_test], [y_pred])
print(f"Initial sklearn NDCG Score: {sklearn_ndcg:.4f}")


Initial sklearn NDCG Score: 0.9756


Handling the error of being negative in y

In [9]:
import numpy as np
from sklearn.preprocessing import LabelEncoder


# Check unique values
print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

# Convert to non-negative integers if necessary
if not np.all(np.floor(y_train) == y_train) or np.any(y_train < 0):
    y_train = np.floor(y_train).astype(int)

if not np.all(np.floor(y_test) == y_test) or np.any(y_test < 0):
    y_test = np.floor(y_test).astype(int)

# Re-check labels
print("Unique values in corrected y_train:", np.unique(y_train))
print("Unique values in corrected y_test:", np.unique(y_test))

# Ensure non-negative integers
def ensure_non_negative_integers(y):
    if not np.all(np.floor(y) == y) or np.any(y < 0):
        raise ValueError("Target labels must be non-negative integers.")
    return y

y_train = ensure_non_negative_integers(y_train)
y_test = ensure_non_negative_integers(y_test)


Unique values in y_train: [4.45000000e-07 6.73000000e-06 7.82000000e-06 ... 1.27272727e+00
 1.30000000e+00 1.40697674e+00]
Unique values in y_test: [1.28000000e-05 2.04000000e-05 4.35000000e-05 ... 1.23913044e+00
 1.26712329e+00 1.30851064e+00]
Unique values in corrected y_train: [0 1]
Unique values in corrected y_test: [0 1]


Model Optimization

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import numpy as np
from sklearn.model_selection._split import _BaseKFold


# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 1, 10],
    'reg_lambda': [0, 0.1, 1, 10]
}

# Custom scorer function
def group_ndcg_scorer(estimator, X, y, groups):
    group_counts = np.bincount(groups)
    y_pred = estimator.predict(X)
    scores = []
    start = 0
    for count in group_counts:
        end = start + count
        true_relevance = y[start:end]
        predicted_relevance = y_pred[start:end]
        scores.append(ndcg_score([true_relevance], [predicted_relevance]))
        start = end
    return np.mean(scores)


# Custom GroupKFold that returns group indices
class CustomGroupKFold(_BaseKFold):
    def __init__(self, n_splits=5):
        super().__init__(n_splits, shuffle=False, random_state=None)

    def _iter_test_indices(self, X, y, groups):
        unique_groups = np.unique(groups)
        n_groups = len(unique_groups)
        group_indices = np.arange(n_groups)

        for test_group_indices in np.array_split(group_indices, self.n_splits):
            test_groups = unique_groups[test_group_indices]
            test_mask = np.isin(groups, test_groups)
            yield np.where(test_mask)[0]

# Setup RandomizedSearchCV
xgb_model = XGBRanker(objective='rank:ndcg', tree_method='hist', random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring=make_scorer(group_ndcg_scorer, needs_proba=False),
    cv=CustomGroupKFold(n_splits=5),
    verbose=2,
    random_state=42,
    n_jobs=1
)

# Fit the model
random_search.fit(X_train_scaled, y_train, group=groups_train_size)


# Evaluate results
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model
best_model = random_search.best_estimator_

# Evaluate on test set
test_score = group_ndcg_scorer(best_model, X_test_scaled, y_test, groups_test)
print(f"\nNDCG Score on Test Set: {test_score}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits


  yield np.where(test_mask)[0]


[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.3, max_depth=7, min_child_weight=7, n_estimators=100, reg_alpha=10, reg_lambda=0, subsample=1.0; total time=   5.3s


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
TypeError: group_ndcg_scorer() missing 2 required positional arguments: 'y' and 'groups'



[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.3, max_depth=7, min_child_weight=7, n_estimators=100, reg_alpha=10, reg_lambda=0, subsample=1.0; total time=  26.9s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.3, max_depth=7, min_child_weight=7, n_estimators=100, reg_alpha=10, reg_lambda=0, subsample=1.0; total time=  19.0s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.3, max_depth=7, min_child_weight=7, n_estimators=100, reg_alpha=10, reg_lambda=0, subsample=1.0; total time=  17.1s
[CV] END colsample_bytree=1.0, gamma=0.1, learning_rate=0.3, max_depth=7, min_child_weight=7, n_estimators=100, reg_alpha=10, reg_lambda=0, subsample=1.0; total time=  17.0s
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.3, max_depth=9, min_child_weight=1, n_estimators=500, reg_alpha=1, reg_lambda=10, subsample=1.0; total time=   1.5s
[CV] END colsample_bytree=1.0, gamma=0.3, learning_rate=0.3, max_depth=9, min_child_weight=1, n_estimators=500, reg_alpha=1, reg_l

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 2021, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/training.py", line 181, 

Best parameters: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 10, 'n_estimators': 100, 'min_child_weight': 7, 'max_depth': 7, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best cross-validation score: nan


ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

Other Evaluation Metrics

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score
import pytrec_eval


# Create group labels based on q_id
group_train = X_train.groupby('q_id').size().to_frame('size')['size'].to_numpy()
group_test = X_test.groupby('q_id').size().to_frame('size')['size'].to_numpy()

# Store q_id and d_id before dropping them
q_id_test = X_test['q_id']
d_id_test = X_test['d_id']

# Train XGBRanker model
model = XGBRanker(objective='rank:pairwise', learning_rate=0.1, n_estimators=100, random_state=42)
model.fit(X_train, y_train, group=group_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate using sklearn's NDCG
sklearn_ndcg = ndcg_score([y_test], [y_pred])
print(f"sklearn NDCG Score: {sklearn_ndcg:.4f}")

# Prepare data for pytrec_eval
qrels = {}
run = {}

for q_id, d_id, true_score, pred_score in zip(q_id_test, d_id_test, y_test, y_pred):
    q_id_str = str(q_id)
    d_id_str = str(d_id)

    if q_id_str not in qrels:
        qrels[q_id_str] = {}
        run[q_id_str] = {}

    qrels[q_id_str][d_id_str] = int(true_score * 5)  # Scale to 0-5 range and convert to int
    run[q_id_str][d_id_str] = float(pred_score)

# Print sample data
print("\nSample qrels data:")
print(list(qrels.items())[:2])
print("\nSample run data:")
print(list(run.items())[:2])

# Create evaluator
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'recip_rank', 'ndcg', 'P_5'})

# Compute metrics
results = evaluator.evaluate(run)

# Calculate average scores
avg_map = np.mean([query_results['map'] for query_results in results.values()])
avg_mrr = np.mean([query_results['recip_rank'] for query_results in results.values()])
avg_ndcg = np.mean([query_results['ndcg'] for query_results in results.values()])
avg_precision_at_5 = np.mean([query_results['P_5'] for query_results in results.values()])

# Print results
print("\nEvaluation Metrics:")
print(f"Average MAP: {avg_map:.4f}")
print(f"Average MRR: {avg_mrr:.4f}")
print(f"Average NDCG (pytrec_eval): {avg_ndcg:.4f}")
print(f"Average Precision@5: {avg_precision_at_5:.4f}")


# Print feature importance
feature_importance = model.get_booster().get_score(importance_type='weight')
sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
print("\nTop 10 Feature Importance:")
for feature, importance in sorted_importance[:10]:
    print(f"{feature}: {importance}")

sklearn NDCG Score: 0.9759

Sample qrels data:
[('30', {'436443': 3, '467566': 3, '440251': 3, '482725': 3, '443321': 3, '481961': 1, '507261': 0, '503474': 0, '486341': 2, '485691': 0, '488426': 0, '499462': 0, '421741': 1, '494500': 2, '514278': 0, '440258': 0, '486335': 1, '498518': 1, '474779': 0, '522065': 0, '440068': 1, '421364': 0, '523715': 0, '482612': 0, '485668': 1, '462131': 2, '505692': 0, '485653': 0, '433378': 2, '523709': 1, '468916': 0, '495080': 2, '487930': 2, '522069': 1, '468914': 1}), ('42', {'396006': 3, '498146': 2, '485120': 2, '412432': 2, '503731': 2, '404735': 1, '298870': 2, '406762': 1, '406702': 1, '447049': 1, '32368': 1, '166241': 1, '18457': 1, '283': 0, '285': 0, '1145': 0, '11914': 1, '5809': 1, '4963': 0, '85193': 0, '444151': 2, '477268': 2, '477269': 0, '102851': 1, '177953': 2, '177957': 2, '416022': 1, '503707': 0, '412392': 0, '147820': 0, '262667': 0, '406339': 0, '421045': 1, '454809': 0, '86962': 1, '865': 0, '32373': 0, '68707': 0, '102759