In [6]:
#загрузка библиотек
import numpy as np
import pandas as pd

In [7]:
#чтение файла
df = pd.read_csv('intern_task.csv')
print('Shape of data: {}'.format(df.shape))
df.head()

Shape of data: (235258, 146)


Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


# Подготовка данных

In [8]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235258 entries, 0 to 235257
Columns: 146 entries, rank to feature_143
dtypes: float64(140), int64(6)
memory usage: 262.1 MB


None

In [9]:
print('Количество дубликатов: {}'.format(df[df.duplicated()].shape[0]))
print('Количество пропусков: {}'.format(df[df.isnull().tail()].shape[0]))

Количество дубликатов: 0
Количество пропусков: 235258


In [10]:
cols_null_persent = df.isnull().mean() * 100
cols_with_null = cols_null_persent[cols_null_persent>0].sort_values(ascending=False)
display(cols_with_null)

Series([], dtype: float64)

Дубликатов и пропусков нет

# Построение модели

In [11]:
#X - матрица наблюдений, y - столбец правильных ответов
X = df.drop('rank', axis=1)
y = df['rank']
from sklearn.model_selection import train_test_split
#создаем тренировочную и тестовую выборки 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42 
)
print('Train shape: {}'.format(X_train.shape))
print('Test shape: {}'.format(X_test.shape))

Train shape: (188206, 145)
Test shape: (47052, 145)


Будем использовать Gradient Boosting Machines (GBM) 

In [12]:
import xgboost as xgb

In [13]:
#создаем DMatrix для обучающего и тестового наборов
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [14]:
#определяем параметры модели
params = {
    'objective': 'rank:ndcg',  # используем ndcg в качестве целевой функции
    'eval_metric': 'ndcg',     # используем ndcg для оценки качества модели
    'eta': 0.1,                # скорость обучения
    'max_depth': 6,            # максимальная глубина деревьев
    'min_child_weight': 1,     # минимальный вес дочернего узла
    'subsample': 0.8,          # доля обучающих данных, используемая для обучения каждого дерева
    'colsample_bytree': 0.8,   # доля признаков, используемых для обучения каждого дерева
    'lambda': 1,               # параметр регуляризации L2
    'alpha': 0.1,              # параметр регуляризации L1
    'nthread': 4,              # количество потоков для обработки
    'random_state': 42         # задаем случайное состояние для воспроизводимости результатов
}


In [15]:
#обучаем модели
num_round = 100  # количество итераций обучения
bst = xgb.train(params, dtrain, num_round)

In [16]:
#строим предсказание на тестовом наборе
y_pred = bst.predict(dtest)
y_true = df['rank']

# Оценка качества модели

In [17]:
#расчитываем ndcg@5
from sklearn.metrics import ndcg_score
ndcg_5 = ndcg_score([y_test], [y_pred], k=5)
print("NDCG@5:", ndcg_5)

NDCG@5: 0.9229457959214169


In [18]:
#расчитываем mrr
def mean_reciprocal_rank(y_true, y_pred):
    reciprocal_ranks = []
    relevant_indices = [i for i, y in enumerate(y_true) if y == 1]
    if len(relevant_indices) == 0:
        return 0
    else:
        first_relevant_index = min(relevant_indices)
        return 1 / (first_relevant_index + 1) 

mrr = mean_reciprocal_rank(y_true, y_pred)
print("Mean Reciprocal Rank (MRR):", mrr)


Mean Reciprocal Rank (MRR): 0.5


In [19]:
#расчитываем precision@k
def precision_at_k(y_true, y_pred, k):
    sorted_indices = np.argsort(y_pred)[::-1]
    top_k_indices = sorted_indices[:k]
    relevant_documents = [1 if y_true[i] == 1 else 0 for i in top_k_indices]
    precision = np.sum(relevant_documents) / k
    
    return precision

k = 5

precision = precision_at_k(y_true, y_pred, k)
print("Precision at", k, ":", precision)

Precision at 5 : 0.4


Модель обладает потенциалом для улучшения, но уже достигла неплохих результатов в ранжировании документов по их релевантности для пользовательских запросов.