In [1]:
# Установка необходимых библиотек
!pip install pandas numpy scikit-learn catboost

# Импорт библиотек
import pandas as pd
import numpy as np
from datetime import datetime
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("Библиотеки успешно импортированы!")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Библиотеки успешно импортированы!


In [3]:
def load_data():
    print("Загрузка данных...")

    try:
        # Основные данные
        train = pd.read_csv('/content/data/train.csv', sep=',', quotechar='"')
        test = pd.read_csv('/content/data/test.csv', sep=',', quotechar='"')

        # Метаданные
        books = pd.read_csv('/content/data/books.csv', sep=',', quotechar='"')
        users = pd.read_csv('/content/data/users.csv', sep=',', quotechar='"')
        genres = pd.read_csv('/content/data/genres.csv', sep=',', quotechar='"')
        book_genres = pd.read_csv('/content/data/book_genres.csv', sep=',', quotechar='"')
        book_descriptions = pd.read_csv('/content/data/book_descriptions.csv', sep=',', quotechar='"')

        print("Данные успешно загружены!")
        print(f"Train: {train.shape}")
        print(f"Test: {test.shape}")
        print(f"Books: {books.shape}")
        print(f"Users: {users.shape}")

        return train, test, books, users, genres, book_genres, book_descriptions

    except FileNotFoundError as e:
        print(f"Ошибка загрузки файлов: {e}")
        print("Убедитесь, что все файлы загружены в папку /content/data/")
        return None

# Загружаем данные
data = load_data()
if data:
    train, test, books, users, genres, book_genres, book_descriptions = data

Загрузка данных...
Данные успешно загружены!
Train: (268581, 5)
Test: (2894, 2)
Books: (50490, 8)
Users: (7277, 3)


In [4]:
def preprocess_data(train, test, books, users, genres, book_genres, book_descriptions):
    print("Предобработка данных...")

    # Создаем копии данных
    train_processed = train.copy()
    test_processed = test.copy()
    books_processed = books.copy()
    users_processed = users.copy()

    # Оставляем только прочитанные книги для обучения
    train_processed = train_processed[train_processed['has_read'] == 1].copy()
    print(f"После фильтрации прочитанных книг: {train_processed.shape}")

    # Преобразование временных меток
    train_processed['timestamp'] = pd.to_datetime(train_processed['timestamp'])
    train_processed['year'] = train_processed['timestamp'].dt.year
    train_processed['month'] = train_processed['timestamp'].dt.month
    train_processed['day'] = train_processed['timestamp'].dt.day
    train_processed['dayofweek'] = train_processed['timestamp'].dt.dayofweek

    # Обработка пропущенных значений в books
    books_processed['publication_year'] = books_processed['publication_year'].fillna(books_processed['publication_year'].median())
    books_processed['language'] = books_processed['language'].fillna(books_processed['language'].mode()[0] if not books_processed['language'].mode().empty else 0)
    books_processed['publisher'] = books_processed['publisher'].fillna(books_processed['publisher'].mode()[0] if not books_processed['publisher'].mode().empty else 0)

    # Обработка users
    users_processed['age'] = users_processed['age'].fillna(users_processed['age'].median())
    users_processed['gender'] = users_processed['gender'].fillna(users_processed['gender'].mode()[0] if not users_processed['gender'].mode().empty else 0)

    print("Предобработка завершена!")
    return train_processed, test_processed, books_processed, users_processed

# Применяем предобработку
train_proc, test_proc, books_proc, users_proc = preprocess_data(
    train, test, books, users, genres, book_genres, book_descriptions
)

Предобработка данных...
После фильтрации прочитанных книг: (156179, 5)
Предобработка завершена!


In [5]:
def create_user_features(train, users):
    print("Создание признаков пользователей...")

    # Статистики по пользователям из train
    user_stats = train.groupby('user_id').agg({
        'rating': ['mean', 'std', 'min', 'max', 'count'],
        'book_id': 'nunique',
        'year': ['min', 'max', 'nunique'],
        'month': 'nunique',
        'dayofweek': 'nunique'
    }).reset_index()

    # Выравниваем multi-index columns
    user_stats.columns = ['user_id'] + [f'user_{col[0]}_{col[1]}' for col in user_stats.columns[1:]]

    # Объединяем с метаданными пользователей
    user_features = pd.merge(users, user_stats, on='user_id', how='left')

    # Заполняем пропуски для пользователей без истории
    numeric_cols = user_features.select_dtypes(include=[np.number]).columns
    user_features[numeric_cols] = user_features[numeric_cols].fillna(user_features[numeric_cols].median())

    print(f"Признаки пользователей созданы: {user_features.shape}")
    return user_features

# Создаем признаки пользователей
user_features = create_user_features(train_proc, users_proc)

Создание признаков пользователей...
Признаки пользователей созданы: (7277, 14)


In [6]:
def create_book_features(train, books, book_genres, book_descriptions):
    print("Создание признаков книг...")

    # Статистики по книгам из train
    book_stats = train.groupby('book_id').agg({
        'rating': ['mean', 'std', 'min', 'max', 'count'],
        'user_id': 'nunique',
        'year': ['min', 'max'],
        'month': 'nunique'
    }).reset_index()

    book_stats.columns = ['book_id'] + [f'book_{col[0]}_{col[1]}' for col in book_stats.columns[1:]]

    # Признаки жанров
    book_genre_counts = book_genres.groupby('book_id').size().reset_index(name='book_genre_count')
    book_main_genre = book_genres.groupby('book_id')['genre_id'].first().reset_index()
    book_main_genre.columns = ['book_id', 'book_main_genre']

    # Объединяем все признаки книг
    book_features = pd.merge(books, book_stats, on='book_id', how='left')
    book_features = pd.merge(book_features, book_genre_counts, on='book_id', how='left')
    book_features = pd.merge(book_features, book_main_genre, on='book_id', how='left')

    # Заполняем пропуски
    book_features['book_genre_count'] = book_features['book_genre_count'].fillna(0)
    book_main_genre_mode = book_features['book_main_genre'].mode()
    book_features['book_main_genre'] = book_features['book_main_genre'].fillna(book_main_genre_mode[0] if not book_main_genre_mode.empty else 0)

    numeric_cols = book_features.select_dtypes(include=[np.number]).columns
    book_features[numeric_cols] = book_features[numeric_cols].fillna(book_features[numeric_cols].median())

    print(f"Признаки книг созданы: {book_features.shape}")
    return book_features

# Создаем признаки книг
book_features = create_book_features(train_proc, books_proc, book_genres, book_descriptions)

Создание признаков книг...
Признаки книг созданы: (50490, 19)


In [7]:
def create_feature_matrix(train, test, user_features, book_features):
    print("Создание матрицы признаков...")

    # Обучающая выборка
    X_train = pd.merge(train[['user_id', 'book_id', 'rating']], user_features, on='user_id', how='left')
    X_train = pd.merge(X_train, book_features, on='book_id', how='left')

    # Тестовая выборка
    X_test = pd.merge(test[['user_id', 'book_id']], user_features, on='user_id', how='left')
    X_test = pd.merge(X_test, book_features, on='book_id', how='left')

    # Сохраняем идентификаторы для submission
    train_ids = X_train[['user_id', 'book_id']].copy()
    test_ids = X_test[['user_id', 'book_id']].copy()

    # Удаляем исходные колонки
    columns_to_drop = ['user_id', 'book_id', 'title', 'author_name']

    X_train = X_train.drop(columns=[col for col in columns_to_drop if col in X_train.columns])
    X_test = X_test.drop(columns=[col for col in columns_to_drop if col in X_test.columns])

    y_train = X_train['rating']
    X_train = X_train.drop('rating', axis=1)

    # Выравниваем колонки
    common_columns = X_train.columns.intersection(X_test.columns)
    X_train = X_train[common_columns]
    X_test = X_test[common_columns]

    print(f"Обучающая выборка: {X_train.shape}")
    print(f"Тестовая выборка: {X_test.shape}")

    return X_train, y_train, X_test, train_ids, test_ids

# Создаем матрицу признаков
X_train, y_train, X_test, train_ids, test_ids = create_feature_matrix(
    train_proc, test_proc, user_features, book_features
)

Создание матрицы признаков...
Обучающая выборка: (156189, 29)
Тестовая выборка: (2894, 29)


In [8]:
def validate_model(X_train, y_train):
    print("Валидация модели...")

    # Разделяем данные на train/validation
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=pd.cut(y_train, bins=5)
    )

    # Определяем категориальные признаки
    cat_features = list(X_tr.select_dtypes(include=['object', 'category']).columns)
    print(f"Категориальные признаки: {cat_features}")

    # Обучаем модель с валидацией
    model = CatBoostRegressor(
        iterations=500,
        learning_rate=0.1,
        depth=6,
        random_seed=42,
        verbose=100,
        early_stopping_rounds=50
    )

    model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=cat_features
    )

    # Предсказания на валидации
    val_pred = model.predict(X_val)
    val_pred = np.clip(val_pred, 0, 10)

    # Метрики
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    mae = mean_absolute_error(y_val, val_pred)

    # Расчет итогового балла
    normalized_rmse = rmse / 10
    normalized_mae = mae / 10
    score = 1 - (normalized_rmse + normalized_mae) / 2

    print("\n" + "="*50)
    print("РЕЗУЛЬТАТЫ ВАЛИДАЦИИ:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Score: {score:.4f}")
    print("="*50)

    return model, rmse, mae, score

# Проводим валидацию
model_val, rmse, mae, score = validate_model(X_train, y_train)

Валидация модели...
Категориальные признаки: []
0:	learn: 2.8346815	test: 2.8307879	best: 2.8307879 (0)	total: 78ms	remaining: 38.9s
100:	learn: 2.0529589	test: 2.0496815	best: 2.0496815 (100)	total: 2.75s	remaining: 10.9s
200:	learn: 2.0248983	test: 2.0422187	best: 2.0422187 (200)	total: 6.77s	remaining: 10.1s
300:	learn: 2.0028157	test: 2.0393089	best: 2.0393089 (300)	total: 9.39s	remaining: 6.21s
400:	learn: 1.9834207	test: 2.0384171	best: 2.0382444 (392)	total: 12s	remaining: 2.95s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 2.038244389
bestIteration = 392

Shrink model to first 393 iterations.

РЕЗУЛЬТАТЫ ВАЛИДАЦИИ:
RMSE: 2.0378
MAE: 1.3230
Score: 0.8320


In [9]:
def train_final_model(X_train, y_train):
    print("Обучение финальной модели на всех данных...")

    # Определяем категориальные признаки
    cat_features = list(X_train.select_dtypes(include=['object', 'category']).columns)

    # Финальная модель
    model_final = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        random_seed=42,
        verbose=100,
        early_stopping_rounds=50
    )

    model_final.fit(
        X_train, y_train,
        cat_features=cat_features
    )

    print("Финальная модель обучена!")
    return model_final

# Обучаем финальную модель
final_model = train_final_model(X_train, y_train)

Обучение финальной модели на всех данных...
0:	learn: 2.8340847	total: 36.1ms	remaining: 36.1s
100:	learn: 2.0504953	total: 3.23s	remaining: 28.8s
200:	learn: 2.0255488	total: 7.83s	remaining: 31.1s
300:	learn: 2.0073773	total: 12s	remaining: 27.9s
400:	learn: 1.9918279	total: 15.5s	remaining: 23.2s
500:	learn: 1.9779762	total: 20s	remaining: 19.9s
600:	learn: 1.9648786	total: 23.1s	remaining: 15.3s
700:	learn: 1.9519892	total: 26.3s	remaining: 11.2s
800:	learn: 1.9397532	total: 29.6s	remaining: 7.36s
900:	learn: 1.9292020	total: 34s	remaining: 3.73s
999:	learn: 1.9189836	total: 37.4s	remaining: 0us
Финальная модель обучена!


In [10]:
def make_predictions(model, X_test, test_ids):
    print("Создание предсказаний...")

    predictions = model.predict(X_test)

    # Ограничиваем предсказания диапазоном [0, 10]
    predictions = np.clip(predictions, 0, 10)

    # Создаем submission файл
    submission = test_ids.copy()
    submission['rating_predict'] = predictions

    # Статистика предсказаний
    print("\nСтатистика предсказаний:")
    print(f"Min: {submission['rating_predict'].min():.2f}")
    print(f"Max: {submission['rating_predict'].max():.2f}")
    print(f"Mean: {submission['rating_predict'].mean():.2f}")
    print(f"Std: {submission['rating_predict'].std():.2f}")

    return submission

# Создаем предсказания
submission = make_predictions(final_model, X_test, test_ids)

Создание предсказаний...

Статистика предсказаний:
Min: 0.00
Max: 10.00
Mean: 8.01
Std: 1.65


In [11]:
def save_results(submission, model, X_train):
    print("Сохранение результатов...")

    # Сохраняем submission файл
    submission.to_csv('submission.csv', index=False)
    print("Файл submission.csv сохранен!")

    # Сохраняем важность признаков
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.get_feature_importance()
    }).sort_values('importance', ascending=False)

    print("\nТоп-10 важных признаков:")
    print(feature_importance.head(10))

    # Сохраняем важность признаков в файл
    feature_importance.to_csv('feature_importance.csv', index=False)
    print("Файл feature_importance.csv сохранен!")

    return feature_importance

# Сохраняем результаты
feature_importance = save_results(submission, final_model, X_train)

Сохранение результатов...
Файл submission.csv сохранен!

Топ-10 важных признаков:
              feature  importance
2    user_rating_mean   30.814376
18   book_rating_mean   22.700650
19    book_rating_std   10.604049
3     user_rating_std    9.575801
20    book_rating_min    4.615054
13          author_id    2.137611
17         avg_rating    1.855637
22  book_rating_count    1.855049
1                 age    1.660254
21    book_rating_max    1.657314
Файл feature_importance.csv сохранен!
