In [1]:
import numpy as np
import pandas as pd
import zipfile

# Загрузка и чтение данных

In [2]:
# Настраиваем доступ к kaggle.json
!mkdir -p ~/.kaggle/
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Скачиваем архив с данными
!kaggle competitions download -c hse-rec-sys-challenge-2024

# Распаковываем его
with zipfile.ZipFile("/content/hse-rec-sys-challenge-2024.zip","r") as zip_f:
    zip_f.extractall("/content/hse-rec-sys-challenge-2024")

hse-rec-sys-challenge-2024.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
dir = "/content/hse-rec-sys-challenge-2024"

# Читаем все данные
events = pd.read_csv(f"{dir}/events.csv")
item_features = pd.read_csv(f"{dir}/item_features.csv")
user_features = pd.read_csv(f"{dir}/user_features.csv")

# Собираем их в единый датасет
df = events\
.merge(user_features, on="user_id", how="left")\
.merge(item_features, on="item_id", how="left")

df.timestamp = pd.to_datetime(df.timestamp, unit='s')

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,gender,age,genre_0,genre_1,genre_2,genre_3,...,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,1505,4,1970-01-01 00:00:00,M,35,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3669,3,1970-01-01 00:00:01,M,35,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,584,4,1970-01-01 00:00:02,M,35,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,3390,3,1970-01-01 00:00:03,M,35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2885,4,1970-01-01 00:00:04,M,35,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# User фичи

In [4]:
# Распределение оценок пользователей
user_ratings_xtra = pd.concat(
    [
        df.groupby('user_id').rating.count(),
        df.groupby('user_id').rating.mean().round(4),
        df.groupby('user_id').rating.std().round(4),
        df.groupby('user_id').rating.quantile([0, 0.1, 0.25, 0.33, 0.5, 0.67, 0.75, 0.9, 1]).unstack(level=1)
    ],
    axis=1
)

user_ratings_xtra.columns = [
    'user_rating_count',
    'user_rating_mean',
    'user_rating_std',
    'user_rating_min',
    'user_rating_q10',
    'user_rating_q25',
    'user_rating_q33',
    'user_rating_q50',
    'user_rating_q67',
    'user_rating_q75',
    'user_rating_q90',
    'user_rating_max'
]

user_ratings_xtra.head()

Unnamed: 0_level_0,user_rating_count,user_rating_mean,user_rating_std,user_rating_min,user_rating_q10,user_rating_q25,user_rating_q33,user_rating_q50,user_rating_q67,user_rating_q75,user_rating_q90,user_rating_max
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,287,3.9791,0.8965,1.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0,5.0
1,261,3.6475,0.9719,1.0,2.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0
2,143,3.7972,0.7077,1.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0
3,231,3.3506,1.048,1.0,2.0,3.0,3.0,3.0,4.0,4.0,5.0,5.0
4,107,4.0467,0.8398,2.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0


In [5]:
# Выделяем столбцы с жанрами
genre_columns = [i for i in df.columns if i.startswith("genre_")]

# Жанровые предпочтения пользователей - доли жанров, среди всех их оценок
user_genre_xtra = df.groupby('user_id')[genre_columns].mean().round(4)

user_genre_xtra.columns = [f"user_genre{i}_share" for i in np.arange(len(genre_columns))]

user_genre_xtra.head()

Unnamed: 0_level_0,user_genre0_share,user_genre1_share,user_genre2_share,user_genre3_share,user_genre4_share,user_genre5_share,user_genre6_share,user_genre7_share,user_genre8_share,user_genre9_share,user_genre10_share,user_genre11_share,user_genre12_share,user_genre13_share,user_genre14_share,user_genre15_share,user_genre16_share,user_genre17_share
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.331,0.2125,0.0906,0.1498,0.3659,0.0697,0.0,0.2474,0.0662,0.0139,0.0767,0.0627,0.0453,0.1882,0.1638,0.2265,0.0592,0.0139
1,0.3257,0.1303,0.0268,0.0345,0.2605,0.1226,0.0115,0.2644,0.0268,0.0192,0.1226,0.0192,0.0345,0.092,0.2337,0.4023,0.0536,0.0192
2,0.5804,0.2867,0.0559,0.0559,0.2448,0.0769,0.007,0.2098,0.028,0.007,0.0769,0.021,0.028,0.1049,0.2937,0.3077,0.0629,0.014
3,0.5887,0.2684,0.039,0.0693,0.3636,0.0736,0.0,0.1299,0.039,0.0,0.0779,0.0173,0.026,0.0693,0.329,0.2857,0.0563,0.0173
4,0.4019,0.1121,0.0374,0.0374,0.3645,0.0935,0.0,0.3364,0.0093,0.0,0.0187,0.028,0.0187,0.1215,0.1776,0.2617,0.0561,0.028


In [6]:
# Распределение времени активности (выставления оценок) пользователей
user_timestamp_xtra = df.groupby('user_id').timestamp\
.apply(lambda x: (x - x.min()).dt.total_seconds().quantile([0, 0.1, 0.25, 0.33, 0.5, 0.67, 0.75, 0.9, 1]))\
.unstack(level=1)
user_timestamp_xtra['user_timetamp_range'] = user_timestamp_xtra[1.00] - user_timestamp_xtra[0.00]
user_timestamp_xtra['user_timestamp_iqr'] = user_timestamp_xtra[0.75] - user_timestamp_xtra[0.25]

user_timestamp_xtra.columns = [f"user_timestamp_q{i}" for i in [0, 10, 25, 33, 50, 67, 75, 90, 100]] + ['user_timestamp_range', 'user_timestamp_iqr']

user_timestamp_xtra.head()

Unnamed: 0_level_0,user_timestamp_q0,user_timestamp_q10,user_timestamp_q25,user_timestamp_q33,user_timestamp_q50,user_timestamp_q67,user_timestamp_q75,user_timestamp_q90,user_timestamp_q100,user_timestamp_range,user_timestamp_iqr
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.0,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0
1,0.0,31.0,72.0,93.8,146.0,194.2,217.0,259.0,286.0,286.0,145.0
2,0.0,17.2,41.5,54.86,81.0,107.14,118.5,140.8,159.0,159.0,77.0
3,0.0,25.0,66.5,85.9,129.0,169.1,188.5,227.0,253.0,253.0,122.0
4,0.0,11.6,30.5,38.98,59.0,77.02,86.5,105.4,117.0,117.0,56.0


In [7]:
# Смещения оценок относительно средних в разных разрезах
other_bias = df\
.merge(
    df.groupby('user_id').rating.mean().round(4).rename('user_bias').reset_index(),
    on='user_id',
    how='left'
)\
.merge(
    df.groupby('item_id').rating.mean().round(4).rename('item_bias').reset_index(),
    on='item_id',
    how='left'
)\
.merge(
    df.groupby('gender').rating.mean().round(4).rename('gender_bias').reset_index(),
    on='gender',
    how='left'
)\
.merge(
    df.groupby('age').rating.mean().round(4).rename('age_bias').reset_index(),
    on='age',
    how='left'
).iloc[:, -4:]

# С жанрами все чуть хитрее - мы будем считать смещение только если фильм относится к соответствующему жанру
genre_mean_ratings = pd.Series(index=genre_columns, name='avg_rating')
for g in genre_columns:
    genre_mean_ratings.loc[g] = df[df[g] == 1].rating.mean().round(4)
genre_bias = df[genre_columns].mask(df[genre_columns] == 1).fillna(genre_mean_ratings).where(df[genre_columns] == 1)
genre_bias.columns = [i+'_bias' for i in genre_columns]

# Соединяем
ratings_bias_xtra = pd.concat([other_bias, genre_bias], axis=1)

ratings_bias_xtra = ratings_bias_xtra.apply(lambda col: col - df.rating)

ratings_bias_xtra.head()

Unnamed: 0,user_bias,item_bias,gender_bias,age_bias,genre_0_bias,genre_1_bias,genre_2_bias,genre_3_bias,genre_4_bias,genre_5_bias,...,genre_8_bias,genre_9_bias,genre_10_bias,genre_11_bias,genre_12_bias,genre_13_bias,genre_14_bias,genre_15_bias,genre_16_bias,genre_17_bias
0,-0.0209,0.3601,-0.4319,-0.3817,-0.509,,,,,-0.2932,...,,,,,,,,,,
1,0.9791,0.3813,0.5681,0.6183,,,,,0.5207,,...,,,,,,0.607,,,,
2,-0.0209,-0.2216,-0.4319,-0.3817,-0.509,-0.5238,,,,,...,,,,,,,-0.5328,,,
3,0.9791,0.7969,0.5681,0.6183,,,,,0.5207,0.7068,...,,,,,,,,,,
4,-0.0209,-0.6288,-0.4319,-0.3817,-0.509,,,,,-0.2932,...,,,,,,,,,,


In [8]:
# Косинусное расстояние - используется для определения "схожести" двух векторов
def cosine_distance(m1, m2):
    # Нормируем строки матриц
    m1 = m1 / np.linalg.norm(m1, axis=1, keepdims=True)
    m2 = m2 / np.linalg.norm(m2, axis=1, keepdims=True)

    # Нормы строк матриц
    norm1 = np.linalg.norm(m1, axis=1)
    norm2 = np.linalg.norm(m2, axis=1)

    # Скалярное произведение каждой пары строк двух матриц
    dot_product = np.sum(m1 * m2, axis=1)

    # Избегаем деления на ноль
    norms_product = norm1 * norm2
    norms_product = np.where(norms_product == 0, 1e-10, norms_product)  # Для стабильности

    # Косинусное расстояние
    return dot_product / norms_product



# Для каждого пользователя рассчитаем, насколько предпочитаемые им жанры совпадают с жанрами конкретного фильма
# Предпочтения пользователей будем определять по хорошим оценкам (4+)
# Рассчитывать будем через косинусное расстояние между профилем пользователя (доли жанров, которые он предпочитает) и профилем фильма
user_preferences = df[df.rating >= 4].groupby('user_id')[genre_columns].mean().round(4)
user_preferences_xtra = pd.DataFrame(
    cosine_distance(df[genre_columns].values, user_preferences.reindex(df.index).fillna(-1).values).round(4),
    index=df.index,
    columns=['user_preference']
)

user_preferences_xtra.head()

Unnamed: 0,user_preference
0,0.5092
1,0.3635
2,0.7862
3,0.3793
4,0.706


In [9]:
# Собираем воедино user фичи
user_features = pd.concat(
    [user_ratings_xtra, user_genre_xtra, user_timestamp_xtra],
    axis=1
)
# Отдельно собираем смещения и пользовательские предпочтения, т.к. они собирались на уровне клиент-фильм, а не клиент
# Затем наконец собираем все-все вместе
user_features = pd.concat([df[['user_id', 'item_id']], ratings_bias_xtra, user_preferences_xtra], axis=1)\
.merge(user_features, on='user_id', how='left')

print(user_features.shape)
user_features.head()

(894149, 66)


Unnamed: 0,user_id,item_id,user_bias,item_bias,gender_bias,age_bias,genre_0_bias,genre_1_bias,genre_2_bias,genre_3_bias,...,user_timestamp_q10,user_timestamp_q25,user_timestamp_q33,user_timestamp_q50,user_timestamp_q67,user_timestamp_q75,user_timestamp_q90,user_timestamp_q100,user_timestamp_range,user_timestamp_iqr
0,0,1505,-0.0209,0.3601,-0.4319,-0.3817,-0.509,,,,...,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0
1,0,3669,0.9791,0.3813,0.5681,0.6183,,,,,...,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0
2,0,584,-0.0209,-0.2216,-0.4319,-0.3817,-0.509,-0.5238,,,...,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0
3,0,3390,0.9791,0.7969,0.5681,0.6183,,,,,...,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0
4,0,2885,-0.0209,-0.6288,-0.4319,-0.3817,-0.509,,,,...,31.6,78.5,102.38,162.0,217.62,241.5,293.4,326.0,326.0,163.0


# Item фичи

In [11]:
# Расчёт долей полов и возрастов по фильмам
# Доли полов по фильмам
gender_distribution = df.groupby('item_id')['gender'].value_counts(normalize=True).unstack().fillna(0)
gender_distribution.columns = ['female_ratio', 'male_ratio']

In [12]:
# Доли возрастных групп по фильмам
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 45, 100], labels=['young', 'adult', 'senior'])
age_distribution = df.groupby('item_id')['age_group'].value_counts(normalize=True).unstack().fillna(0)
age_distribution.columns = ['young_ratio', 'adult_ratio', 'senior_ratio']

In [13]:
# Столбцы жанров
genre_columns = [col for col in item_features.columns if col.startswith('genre')]

# Доли полов по жанрам
gender_by_genre = df.groupby(genre_columns)['gender'].value_counts(normalize=True).unstack().fillna(0)
gender_by_genre.columns = ['female_ratio_genre', 'male_ratio_genre']

In [14]:
# Доли возрастных групп по жанрам
age_by_genre = df.groupby(genre_columns)['age_group'].value_counts(normalize=True).unstack().fillna(0)
age_by_genre.columns = ['young_ratio_genre', 'adult_ratio_genre', 'senior_ratio_genre']

In [17]:
# Подсчёт количества оценок, среднего и медианного рейтингов, квантилей
item_stats = df[events.columns].groupby('item_id').agg(
    item_rating_count=('rating', 'count'),
    avg_item_rating=('rating', 'mean'),
    item_rating_std=('rating', 'std'),
    item_rating_quantile_0=('rating', lambda x: x.quantile(0)),
    item_rating_quantile_10=('rating', lambda x: x.quantile(0.1)),
    item_rating_quantile_25=('rating', lambda x: x.quantile(0.25)),
    item_rating_quantile_33=('rating', lambda x: x.quantile(0.33)),
    item_rating_quantile_50=('rating', lambda x: x.quantile(0.5)),
    item_rating_quantile_67=('rating', lambda x: x.quantile(0.67)),
    item_rating_quantile_75=('rating', lambda x: x.quantile(0.75)),
    item_rating_quantile_90=('rating', lambda x: x.quantile(0.9)),
    item_rating_quantile_100=('rating', lambda x: x.quantile(1)),
    avg_rating_time=('timestamp', lambda x: (x - x.min()).dt.total_seconds().mean()),
    rating_time_range=('timestamp', lambda x: (x.max() - x.min()).total_seconds())
).reset_index()

# Отклонения от средней и медианной оценок
item_stats = pd.merge(events.drop(columns=['timestamp']), item_stats, on='item_id', how='left')
item_stats['rating_deviation_from_mean'] = item_stats['rating'] - item_stats['avg_item_rating']
item_stats['rating_deviation_from_median'] = item_stats['rating'] - item_stats['item_rating_quantile_50']
item_stats = item_stats.drop(columns=['rating'])

# Время оценки фильма - среднее и размах
item_stats = item_stats\
.merge(
    df[events.columns].groupby('item_id')['timestamp'].apply(lambda x: (x - x.min()).dt.total_seconds().mean())\
    .rename('avg_rating_time').reset_index(),
    on='item_id',
    how='left'
)\
.merge(
    df[events.columns].groupby('item_id')['timestamp'].apply(lambda x: (x.max() - x.min()).total_seconds())\
    .rename('rating_time_range').reset_index(),
    on='item_id',
    how='left'
)

In [18]:
# Объединение всех фичей с основными данными о фильмах
item_features = df[['user_id', 'item_id']+genre_columns]\
.merge(gender_distribution, on='item_id', how='left')\
.merge(age_distribution, on='item_id', how='left')\
.merge(gender_by_genre, on=genre_columns, how='left')\
.merge(age_by_genre, on=genre_columns, how='left')\
.drop(genre_columns, axis=1)

item_features = pd.concat([item_features, item_stats], axis=1)

print(item_features.shape)
item_features.head()

(894149, 32)


Unnamed: 0,user_id,item_id,female_ratio,male_ratio,young_ratio,adult_ratio,senior_ratio,female_ratio_genre,male_ratio_genre,young_ratio_genre,...,item_rating_quantile_67,item_rating_quantile_75,item_rating_quantile_90,item_rating_quantile_100,avg_rating_time_x,rating_time_range_x,rating_deviation_from_mean,rating_deviation_from_median,avg_rating_time_y,rating_time_range_y
0,0,1505,0.205316,0.794684,0.165449,0.691694,0.142857,0.193299,0.806701,0.180325,...,5.0,5.0,5.0,5.0,94.038538,1198.0,-0.360133,-1.0,94.038538,1198.0
1,0,3669,0.38806,0.61194,0.24152,0.629579,0.128901,0.364833,0.635167,0.214445,...,4.0,4.0,5.0,5.0,207.8019,1524.0,-0.381275,0.0,207.8019,1524.0
2,0,584,0.213785,0.786215,0.235246,0.65291,0.111845,0.162308,0.837692,0.199397,...,4.0,4.0,5.0,5.0,81.919109,1432.0,0.221626,0.0,81.919109,1432.0
3,0,3390,0.271835,0.728165,0.244357,0.692836,0.062807,0.262777,0.737223,0.218908,...,4.0,4.0,5.0,5.0,152.402355,1154.0,-0.79686,-1.0,152.402355,1154.0
4,0,2885,0.189928,0.810072,0.161151,0.694964,0.143885,0.193299,0.806701,0.180325,...,4.0,4.0,5.0,5.0,153.188489,1793.0,0.628777,1.0,153.188489,1793.0


In [19]:
total_features = pd.concat(
    [
        df, # исходные фичи
        user_features.drop(columns=['user_id', 'item_id']), #user фичи
        item_features.drop(columns=['user_id', 'item_id']) #item фичи
    ],
    axis=1
)

total_features.timestamp = events.timestamp.values

print(total_features.shape)
total_features.head()

(894149, 117)


Unnamed: 0,user_id,item_id,rating,timestamp,gender,age,genre_0,genre_1,genre_2,genre_3,...,item_rating_quantile_67,item_rating_quantile_75,item_rating_quantile_90,item_rating_quantile_100,avg_rating_time_x,rating_time_range_x,rating_deviation_from_mean,rating_deviation_from_median,avg_rating_time_y,rating_time_range_y
0,0,1505,4,0,M,35,1,0,0,0,...,5.0,5.0,5.0,5.0,94.038538,1198.0,-0.360133,-1.0,94.038538,1198.0
1,0,3669,3,1,M,35,0,0,0,0,...,4.0,4.0,5.0,5.0,207.8019,1524.0,-0.381275,0.0,207.8019,1524.0
2,0,584,4,2,M,35,1,1,0,0,...,4.0,4.0,5.0,5.0,81.919109,1432.0,0.221626,0.0,81.919109,1432.0
3,0,3390,3,3,M,35,0,0,0,0,...,4.0,4.0,5.0,5.0,152.402355,1154.0,-0.79686,-1.0,152.402355,1154.0
4,0,2885,4,4,M,35,1,0,0,0,...,4.0,4.0,5.0,5.0,153.188489,1793.0,0.628777,1.0,153.188489,1793.0


In [20]:
total_features.to_csv("/content/total_features.csv", index=False)