In [1]:
import numpy as np
import pandas as pd
import zipfile

# Загрузка и чтение данных

In [3]:
# Настраиваем доступ к kaggle.json
!mkdir -p ~/.kaggle/
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Скачиваем архив с данными
!kaggle competitions download -c hse-rec-sys-challenge-2024

# Распаковываем его
with zipfile.ZipFile("/content/hse-rec-sys-challenge-2024.zip","r") as zip_f:
    zip_f.extractall("/content/hse-rec-sys-challenge-2024")

Downloading hse-rec-sys-challenge-2024.zip to /content
  0% 0.00/4.25M [00:00<?, ?B/s]
100% 4.25M/4.25M [00:00<00:00, 85.5MB/s]


In [3]:
# Folder holding the unpacked competition files.
# NOTE(review): `dir` shadows the Python builtin; the name is kept because
# other cells may reference it.
dir = "/content/hse-rec-sys-challenge-2024"

# Load the three source tables
events = pd.read_csv(f"{dir}/events.csv")
item_features = pd.read_csv(f"{dir}/item_features.csv")
user_features = pd.read_csv(f"{dir}/user_features.csv")

# Build a single dataset: every event row is kept (left joins) and is
# extended with the matching user columns, then the matching item columns.
df = (
    events
    .merge(user_features, how="left", on="user_id")
    .merge(item_features, how="left", on="item_id")
)

df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,gender,age,genre_0,genre_1,genre_2,genre_3,...,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,1505,4,0,M,35,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,3669,3,1,M,35,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,584,4,2,M,35,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,3390,3,3,M,35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2885,4,4,M,35,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# User-фичи

In [4]:
import seaborn as sns

In [38]:
# Per-user rating statistics: number of ratings, mean, standard deviation
# and a fixed set of quantiles (0 and 1 give the min and max).
# The result is indexed by user_id, one row per user.

# Group once and reuse the grouped ratings for every statistic.
ratings_by_user = df.groupby('user_id').rating

user_xtra_ratings = pd.concat(
    [
        ratings_by_user.count(),
        ratings_by_user.mean().round(4),
        ratings_by_user.std().round(4),
        # quantile() returns a (user_id, quantile) MultiIndex; unstack turns
        # the quantile level into columns, one per requested quantile.
        ratings_by_user.quantile([0, 0.1, 0.25, 0.33, 0.5, 0.67, 0.75, 0.9, 1]).unstack(level=1)
    ],
    axis=1
)

# Column names follow the order of the concatenated pieces above.
# The names are assigned to the frame built in this cell; the previous code
# assigned them to `user_xtra`, which this cell never defines, so a fresh
# kernel failed with NameError.
user_xtra_ratings.columns = [
    'user_rating_count',
    'user_rating_mean',
    'user_rating_std',
    'user_rating_min',
    'user_rating_q10',
    'user_rating_q25',
    'user_rating_q33',
    'user_rating_q50',
    'user_rating_q67',
    'user_rating_q75',
    'user_rating_q90',
    'user_rating_max'
]
user_xtra_ratings.head()

Unnamed: 0_level_0,user_rating_count,user_rating_mean,user_rating_std,user_rating_min,user_rating_q10,user_rating_q25,user_rating_q33,user_rating_q50,user_rating_q67,user_rating_q75,user_rating_q90,user_rating_max
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,287,3.9791,0.8965,1.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0,5.0
1,261,3.6475,0.9719,1.0,2.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0
2,143,3.7972,0.7077,1.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0
3,231,3.3506,1.048,1.0,2.0,3.0,3.0,3.0,4.0,4.0,5.0,5.0
4,107,4.0467,0.8398,2.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0


In [7]:
# Rating statistics per gender: mean, standard deviation and a fixed set
# of quantiles (0 and 1 give the min and max). Indexed by gender.
ratings_by_gender = df.groupby('gender')['rating']

gender_mean = ratings_by_gender.mean().round(4)
gender_std = ratings_by_gender.std().round(4)
# quantile() yields a (gender, quantile) MultiIndex; moving the quantile
# level into columns gives one column per requested quantile.
gender_quantiles = ratings_by_gender.quantile(
    [0, 0.1, 0.25, 0.33, 0.5, 0.67, 0.75, 0.9, 1]
).unstack(level=1)

gender_xtra_ratings = pd.concat([gender_mean, gender_std, gender_quantiles], axis=1)

# Names are listed in the same order as the concatenated pieces.
gender_stat_names = [
    'gender_rating_mean',
    'gender_rating_std',
    'gender_rating_min',
    'gender_rating_q10',
    'gender_rating_q25',
    'gender_rating_q33',
    'gender_rating_q50',
    'gender_rating_q67',
    'gender_rating_q75',
    'gender_rating_q90',
    'gender_rating_max',
]
gender_xtra_ratings.columns = gender_stat_names
gender_xtra_ratings.head()

Unnamed: 0_level_0,gender_rating_mean,gender_rating_std,gender_rating_min,gender_rating_q10,gender_rating_q25,gender_rating_q33,gender_rating_q50,gender_rating_q67,gender_rating_q75,gender_rating_q90,gender_rating_max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
F,3.6203,1.1102,1.0,2.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0
M,3.5681,1.1185,1.0,2.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0
