# === Двухстадийный подход: построение признаков

In [1]:
# Признаки объектов
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")
events_train = pd.read_parquet("events_train.par")
events_test = pd.read_parquet("events_test.par")

events_labels = pd.read_parquet("events_labels.parquet")
candidates_for_train = pd.read_parquet("candidates_for_train.parquet")
candidates_to_rank = pd.read_parquet("candidates_to_rank.parquet")

# user_features_for_train = pd.read_csv("user_features_for_train.csv")

In [2]:
# Задание 1 из 6

# Посчитаем новый признак — «возраст» книги на основе года публикации. Назовём его age. 
# Используя обновлённый справочник объектов items, добавьте признаки возраста age и средней популярности average_rating 
# к кандидатам для тренировки модели candidates_for_train и к кандидатам для ранжирования candidates_to_rank.
items["age"] = 2018-items["publication_year"]
invalid_age_idx = items["age"] < 0

items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

In [3]:
# candidates_for_train = candidates_for_train.merge(items, on="item_id", how="left")
# candidates_for_train.to_parquet("candidates_for_train.parquet")

candidates_to_rank = candidates_to_rank.merge(items, on="item_id", how="left")
# candidates_to_rank.to_parquet("candidates_to_rank.parquet")

In [4]:
# Признаки пользователей

# Задание 2 из 6

# Используя события в events_train и events_inference, посчитайте и добавьте признаки пользователей 
# к кандидатам в candidates_for_train и candidates_to_rank соответственно:

#     reading_years — длительность истории пользователя,
#     books_read — количество книг, прочитанных за всё время,
#     books_per_year — среднее количество прочитанных книг в год,
#     rating_avg — средняя оценка,
#     rating_std — дисперсия оценок.

def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=("is_read","count"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features
    
user_features_for_train = get_user_features(events_train)

In [5]:
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
candidates_for_train.head()

Unnamed: 0,user_id,item_id,als_score,cnt_score,target,author,title,description,genre_and_votes,num_pages,...,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,age,reading_years,books_read,rating_avg,rating_std,books_per_year
0,1000006,7445,0.230529,,1,Jeannette Walls,The Glass Castle,"A tender, moving tale of unconditional love in...","{'Nonfiction': 6451, 'Autobiography-Memoir': 5...",288,...,074324754X,9780743247542,"{'Academic': None, 'Academic-Academia': None, ...","Nonfiction 6451, Autobiography-Memoir 5734",12.0,1.820671,17.0,4.294118,0.685994,9.337218
1,1000006,18812405,0.178382,,1,Mary Kubica,The Good Girl,I've been following her for the past few days....,"{'Mystery': 1262, 'Fiction': 1197, 'Thriller':...",352,...,0778316556,9780778316558,"{'Academic': None, 'Academic-Academia': None, ...","Mystery 1262, Fiction 1197, Thriller 1051, Sus...",4.0,1.820671,17.0,4.294118,0.685994,9.337218
2,1000006,29868610,0.286715,,1,Anna Kendrick,Scrappy Little Nobody,A collection of humorous autobiographical essa...,"{'Nonfiction': 2027, 'Autobiography-Memoir': 1...",275,...,1501117203,9781501117206,"{'Academic': None, 'Academic-Academia': None, ...","Nonfiction 2027, Autobiography-Memoir 1100, Au...",,1.820671,17.0,4.294118,0.685994,9.337218
3,1000019,37415,0.043595,,1,Zora Neale Hurston,Their Eyes Were Watching God,"When Janie, at sixteen, is caught kissing shif...","{'Classics': 5441, 'Fiction': 4807, 'Historica...",237,...,0061120065,9780061120060,"{'Academic': None, 'Academic-Academia': None, ...","Classics 5441, Fiction 4807, Historical-Histor...",12.0,0.276523,6.0,4.166667,1.169045,21.69802
4,1000023,5094,0.082626,,1,Stephen King,The Drawing of the Three,"In 1978, Stephen King introduced the world to ...","{'Fantasy': 6250, 'Fiction': 2159, 'Horror': 1...",463,...,0451210859,9780451210852,"{'Academic': None, 'Academic-Academia': None, ...","Fantasy 6250, Fiction 2159, Horror 1871, Scien...",15.0,0.005476,2.0,3.5,0.707107,365.25


In [6]:
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

In [7]:
user_features_for_ranking = get_user_features(events_inference)
user_features_for_ranking.head()

Unnamed: 0_level_0,reading_years,books_read,rating_avg,rating_std,books_per_year
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000003,7.4141,94,3.287234,0.712746,12.678545
1000005,1.180014,16,4.375,0.957427,13.559165
1000006,1.998631,20,4.2,0.695852,10.006849
1000007,5.399042,59,3.983051,0.955772,10.927865
1000009,4.862423,92,4.086957,0.909681,18.920608


In [8]:
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")
candidates_to_rank.head()

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank,author,title,description,genre_and_votes,...,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,age,reading_years,books_read,rating_avg,rating_std,books_per_year
0,1000003,13538873,0.627915,,0.51743,1,Robin Sloan,Mr. Penumbra's 24-Hour Bookstore (Mr. Penumbra...,The Great Recession has shuffled Clay Jannon a...,"{'Fiction': 4694, 'Mystery': 1906, 'Fantasy': ...",...,0374214913,9780374214913,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 4694, Mystery 1906, Fantasy 1474, Writ...",6.0,7.4141,94.0,3.287234,0.712746,12.678545
1,1000003,5060378,0.770063,,0.486272,2,"Stieg Larsson, Reg Keeland","The Girl Who Played with Fire (Millennium, #2)",The Expose\nMillenniumpublisher Mikael Blomkvi...,"{'Fiction': 7738, 'Mystery': 6082, 'Thriller':...",...,0307269981,9780307269980,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 7738, Mystery 6082, Thriller 4229, Mys...",9.0,7.4141,94.0,3.287234,0.712746,12.678545
2,1000003,6892870,0.767938,,0.486272,3,"Stieg Larsson, Reg Keeland",The Girl Who Kicked the Hornet's Nest (Millenn...,The stunning third and final novel in Stieg La...,"{'Fiction': 6710, 'Mystery': 4943, 'Thriller':...",...,030726999X,9780307269997,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 6710, Mystery 4943, Thriller 3699, Mys...",8.0,7.4141,94.0,3.287234,0.712746,12.678545
3,1000003,6493208,0.680771,,0.484585,4,Rebecca Skloot,The Immortal Life of Henrietta Lacks,"Intimate in feeling, astonishing in scope, and...","{'Nonfiction': 10432, 'Science': 4513, 'Biogra...",...,1400052173,9781400052172,"{'Academic': None, 'Academic-Academia': None, ...","Nonfiction 10432, Science 4513, Biography 2854...",8.0,7.4141,94.0,3.287234,0.712746,12.678545
4,1000003,13526165,0.449987,,0.483984,5,Maria Semple,"Where'd You Go, Bernadette",A compulsively readable and touching novel abo...,"{'Fiction': 5873, 'Humor': 1442, 'Contemporary...",...,0316204277,9780316204279,"{'Academic': None, 'Academic-Academia': None, ...","Fiction 5873, Humor 1442, Contemporary 1391, M...",6.0,7.4141,94.0,3.287234,0.712746,12.678545


In [9]:
candidates_for_train["books_read"].median()

32.0

In [13]:
# Парные признаки

# Задание 3 из 6

# Используя истории events_train и events_inference, а также ранее полученные артефакты по жанрам книг — словарь жанров genres, 
# оценки книг по жанрам all_items_genres_csr — добавьте парные признаки, по одному на каждый жанр, которые совместно показывают, 
# какие жанры предпочитает пользователь. 

def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v["genre_and_votes"]
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += 1
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items)
genres.head()

Unnamed: 0_level_0,name,votes
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Womens Fiction-Chick Lit,1705
1,Fiction,22440
2,Politics,933
3,Humor,2101
4,Christian,1482


In [12]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)
items.head()

Unnamed: 0,item_id,author,title,description,genre_and_votes,num_pages,average_rating,ratings_count,text_reviews_count,publisher,publication_year,country_code,language_code,format,is_ebook,isbn,isbn13,genre_and_votes_dict,genre_and_votes_str,age
3,6066819,Jennifer Weiner,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,"{'Womens Fiction-Chick Lit': 739, 'Fiction': 442}",368.0,3.49,51184,3282,Atria Books,2009.0,US,eng,Hardcover,False,0743294297,9780743294294,"{'Academic': None, 'Academic-Academia': None, ...","Womens Fiction-Chick Lit 739, Fiction 442",9.0
6,378460,Michael Halberstam,The Wanting of Levine,,"{'Politics': 1, 'Humor': 1}",,4.38,12,4,Berkley Publishing Group,1979.0,US,,Paperback,False,0425040887,9780425040881,"{'Academic': None, 'Academic-Academia': None, ...","Politics 1user, Humor 1user",39.0
15,89375,"Don Piper, Cecil Murphey",90 Minutes in Heaven: A True Story of Death an...,As he is driving home from a minister's confer...,"{'Christian': 395, 'Nonfiction': 392, 'Religio...",,3.91,68157,2885,,,US,,,False,0800759494,9780800759490,"{'Academic': None, 'Academic-Academia': None, ...","Christian 395, Nonfiction 392, Religion 142, S...",
16,89376,Randy Alcorn,Heaven,What is Heaven really going to be like? What w...,"{'Christian': 225, 'Religion-Theology': 154, '...",533.0,4.26,7345,566,,,US,eng,,False,0842379428,9780842379427,"{'Academic': None, 'Academic-Academia': None, ...","Christian 225, Religion-Theology 154, Nonficti...",
17,89377,Jennifer L. Holm,Penny from Heaven,It's 1953 and 11-year-old Penny dreams of a su...,"{'Historical-Historical Fiction': 284, 'Childr...",288.0,3.98,6949,615,Random House Books for Young Readers,2006.0,US,,Hardcover,False,037583687X,9780375836879,"{'Academic': None, 'Academic-Academia': None, ...","Historical-Historical Fiction 284, Childrens-M...",12.0


In [18]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,22440,0.106816
34,Romance,11166,0.053151
25,Fantasy,11108,0.052875
18,Young Adult,8614,0.041003
5,Nonfiction,6822,0.032473
52,Contemporary,5518,0.026266
16,Historical-Historical Fiction,5497,0.026166
20,Mystery,5110,0.024324
33,Fantasy-Paranormal,4413,0.021006
38,Classics,4373,0.020816


In [23]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [25]:
import scipy
import sklearn
# Получим матрицу с весами по жанрам для каждой книги:
# items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

AttributeError: module 'sklearn' has no attribute 'preprocessing'

In [14]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"


In [17]:
genres

Unnamed: 0_level_0,name,votes
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Womens Fiction-Chick Lit,1705
1,Fiction,22440
2,Politics,933
3,Humor,2101
4,Christian,1482
...,...,...
810,German History-Nazi Party,0
811,Favorites,0
812,History-Latin American History,0
813,Cryptids-Bigfoot,0


In [None]:
genre_columns = # ваш код здесь #

In [None]:
# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        # ваш код здесь #,
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = # ваш код здесь #
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference, items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")

In [None]:
# Обучение и получение рекомендаций
# Вы добавили в candidates_for_train и candidates_to_rank различные признаки. Обучите новую ранжирующую модель, которая их будет учитывать.

# Задание 4 из 6
# Обучите модель, выполнив код ниже:

from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 
    'age', 'average_rating', 'reading_years', 'books_read', 
    'rating_avg', 'rating_std', 
    'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data)

In [None]:
# Получите топ-100 самых релевантных рекомендация для каждого пользователя, используя обученную модель.

inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = # ваш код здесь #

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = # ваш код здесь #

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [None]:
# Задание 5 из 6
# Используя отложенную тестовую выборку events_test_2, посчитайте метрики recall и precision для полученных рекомендаций.

# для экономии ресурсов оставим события только тех пользователей, 
# для которых следует оценить рекомендации
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test_2["user_id"].drop_duplicates())]

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    # ваш код здесь #,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    # ваш код здесь #)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

In [None]:
# Проверка важности признаков
# Любопытно понять, какие признаки вносят наибольший вклад в ранжирование. 
# Алгоритм CatBoost позволяет получить такую информацию (англ. feature importance), которая генерируется во время тренировки модели. Для этого используйте метод get_feature_importance(). 

# Задание 6
# Выполните код для получения информации о важности признаков. Выведите список признаков feature_importance в порядке убывания их важности.

feature_importance = pd.DataFrame(cb_model.get_feature_importance(), 
    index=features, 
    columns=["fi"])
feature_importance = # ваш код здесь #

print(feature_importance )