In [11]:
import time
import numpy as np
import pandas as pd
import joblib
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import torch
from transformers import AutoModel, AutoTokenizer
from pathlib import Path
import zipfile

In [12]:
ROOT_DIR = Path(".")
DATA_DIR = ROOT_DIR / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
MODEL_DIR = ROOT_DIR / "models"
SUBMISSION_DIR = ROOT_DIR / "submissions"

TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"
USER_DATA_FILENAME = "users.csv"
BOOK_DATA_FILENAME = "books.csv"
BOOK_GENRES_FILENAME = "book_genres.csv"
GENRES_FILENAME = "genres.csv"
BOOK_DESCRIPTIONS_FILENAME = "book_descriptions.csv"
PROCESSED_DATA_FILENAME = "processed_features.parquet"
MODEL_FILENAME = "lgb_model.txt"

COL_USER_ID = "user_id"
COL_BOOK_ID = "book_id"
COL_TARGET = "rating"
COL_SOURCE = "source"
COL_PREDICTION = "rating_predict"
COL_HAS_READ = "has_read"
COL_TIMESTAMP = "timestamp"
COL_GENDER = "gender"
COL_AGE = "age"
COL_AUTHOR_ID = "author_id"
COL_PUBLICATION_YEAR = "publication_year"
COL_LANGUAGE = "language"
COL_PUBLISHER = "publisher"
COL_AVG_RATING = "avg_rating"
COL_GENRE_ID = "genre_id"
COL_DESCRIPTION = "description"
COL_RATING = "rating"

F_USER_MEAN_RATING = "user_mean_rating"
F_USER_RATINGS_COUNT = "user_ratings_count"
F_BOOK_MEAN_RATING = "book_mean_rating"
F_BOOK_RATINGS_COUNT = "book_ratings_count"
F_AUTHOR_MEAN_RATING = "author_mean_rating"
F_BOOK_GENRES_COUNT = "book_genres_count"

VAL_SOURCE_TRAIN = "train"
VAL_SOURCE_TEST = "test"
MISSING_CAT_VALUE = "-1"
MISSING_NUM_VALUE = -1
PREDICTION_MIN_VALUE = 0
PREDICTION_MAX_VALUE = 10

BERT_MODEL_NAME = "DeepPavlov/rubert-base-cased"
BERT_BATCH_SIZE = 8
BERT_MAX_LENGTH = 512
BERT_EMBEDDING_DIM = 768
BERT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LGB_PARAMS = {
    "objective": "rmse",
    "metric": "rmse",
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "num_leaves": 31,
    "verbose": -1,
    "n_jobs": -1,
    "seed": 42,
    "boosting_type": "gbdt",
}

CAT_FEATURES = [
    COL_USER_ID, COL_BOOK_ID, COL_GENDER, COL_AGE, COL_AUTHOR_ID,
    COL_PUBLICATION_YEAR, COL_LANGUAGE, COL_PUBLISHER
]

In [13]:
zip_path = "/content/stage1_individual_data.zip"
extract_dir = Path("data/raw")
extract_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_dir)

print("Данные разархивированы в:", extract_dir)

Данные разархивированы в: data/raw


In [14]:
def load_and_merge_data():
    dtype_spec = {
        COL_USER_ID: "int32",
        COL_BOOK_ID: "int32",
        COL_TARGET: "float32",
        COL_GENDER: "category",
        COL_AGE: "float32",
        COL_AUTHOR_ID: "int32",
        COL_PUBLICATION_YEAR: "float32",
        COL_LANGUAGE: "category",
        COL_PUBLISHER: "category",
        COL_AVG_RATING: "float32",
        COL_GENRE_ID: "int16",
    }

    train_df = pd.read_csv(RAW_DATA_DIR / TRAIN_FILENAME,
                           dtype={k:v for k,v in dtype_spec.items() if k in [COL_USER_ID, COL_BOOK_ID, COL_TARGET]},
                           parse_dates=[COL_TIMESTAMP])
    train_df = train_df[train_df[COL_HAS_READ]==1].copy()
    test_df = pd.read_csv(RAW_DATA_DIR / TEST_FILENAME,
                          dtype={k:v for k,v in dtype_spec.items() if k in [COL_USER_ID, COL_BOOK_ID]})
    user_data_df = pd.read_csv(RAW_DATA_DIR / USER_DATA_FILENAME,
                               dtype={k:v for k,v in dtype_spec.items() if k in [COL_USER_ID, COL_GENDER, COL_AGE]})
    book_data_df = pd.read_csv(RAW_DATA_DIR / BOOK_DATA_FILENAME,
                               dtype={k:v for k,v in dtype_spec.items() if k in [COL_BOOK_ID, COL_AUTHOR_ID, COL_PUBLICATION_YEAR, COL_LANGUAGE, COL_AVG_RATING, COL_PUBLISHER]})
    book_genres_df = pd.read_csv(RAW_DATA_DIR / BOOK_GENRES_FILENAME,
                                 dtype={k:v for k,v in dtype_spec.items() if k in [COL_BOOK_ID, COL_GENRE_ID]})
    genres_df = pd.read_csv(RAW_DATA_DIR / GENRES_FILENAME)
    book_descriptions_df = pd.read_csv(RAW_DATA_DIR / BOOK_DESCRIPTIONS_FILENAME,
                                       dtype={COL_BOOK_ID:"int32"})

    train_df[COL_SOURCE] = VAL_SOURCE_TRAIN
    test_df[COL_SOURCE] = VAL_SOURCE_TEST
    combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
    combined_df = combined_df.merge(user_data_df, on=COL_USER_ID, how="left")
    book_data_df = book_data_df.drop_duplicates(subset=[COL_BOOK_ID])
    combined_df = combined_df.merge(book_data_df, on=COL_BOOK_ID, how="left")

    return combined_df, book_genres_df, genres_df, book_descriptions_df

merged_df, book_genres_df, genres_df, descriptions_df = load_and_merge_data()
print("Данные загружены и объединены")

# === Временной сплит ===
def get_split_date_from_ratio(df, ratio, timestamp_col=COL_TIMESTAMP):
    sorted_ts = df[timestamp_col].sort_values()
    threshold_index = int(len(sorted_ts)*ratio)
    return sorted_ts.iloc[threshold_index]

def temporal_split_by_date(df, split_date, timestamp_col=COL_TIMESTAMP):
    train_mask = df[timestamp_col]<=split_date
    val_mask = df[timestamp_col]>split_date
    return train_mask, val_mask

split_date = get_split_date_from_ratio(merged_df[merged_df[COL_SOURCE]==VAL_SOURCE_TRAIN], 0.8)
print("Split date:", split_date)


Данные загружены и объединены
Split date: 2020-09-27 16:17:15


In [15]:
def add_bert_features(df, train_df, descriptions_df):
    BERT_EMBEDDINGS_CACHE_PATH = Path("/content/bert_embeddings.pkl")
    embeddings_path = BERT_EMBEDDINGS_CACHE_PATH

    if embeddings_path.exists():
        print("Загрузка BERT эмбеддингов из:", embeddings_path)
        embeddings_dict = joblib.load(embeddings_path)
    else:
        print("Файл BERT эмбеддингов не найден. Генерим. Может быть долго на cpu")

        device = BERT_DEVICE
        tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
        model = AutoModel.from_pretrained(BERT_MODEL_NAME)
        model.to(device)
        model.eval()

        all_descriptions = descriptions_df[[COL_BOOK_ID, COL_DESCRIPTION]].copy()
        all_descriptions[COL_DESCRIPTION] = all_descriptions[COL_DESCRIPTION].fillna("")
        unique_books = all_descriptions.drop_duplicates(subset=[COL_BOOK_ID])

        book_ids = unique_books[COL_BOOK_ID].to_numpy()
        descriptions = unique_books[COL_DESCRIPTION].to_numpy().tolist()

        embeddings_dict = {}
        batch_size = BERT_BATCH_SIZE if device == "cpu" else BERT_BATCH_SIZE * 4
        num_batches = (len(descriptions) + batch_size - 1) // batch_size

        for batch_idx in tqdm(range(num_batches), desc="BERT batches", unit="batch"):
            start = batch_idx * batch_size
            end = min(start + batch_size, len(descriptions))
            batch_desc = descriptions[start:end]
            batch_ids = book_ids[start:end]

            encoded = tokenizer(batch_desc,
                                padding=True,
                                truncation=True,
                                max_length=BERT_MAX_LENGTH,
                                return_tensors="pt")
            encoded = {k: v.to(device) for k, v in encoded.items()}

            with torch.no_grad():
                outputs = model(**encoded)
                mask = encoded["attention_mask"].unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
                sum_emb = torch.sum(outputs.last_hidden_state * mask, dim=1)
                sum_mask = torch.clamp(mask.sum(dim=1), min=1e-9)
                mean_emb = (sum_emb / sum_mask).cpu().numpy()

            for book_id, emb in zip(batch_ids, mean_emb):
                embeddings_dict[book_id] = emb

        joblib.dump(embeddings_dict, embeddings_path)
        print("BERT эмбеддинги сгенерированы и сохранены в:", embeddings_path)

    if not embeddings_dict:
        print("Словарь эмбеддингов пуст. BERT фичи не будут добавлены.")
        return df

    embeddings_list = []
    for book_id in df[COL_BOOK_ID].to_numpy():
        embeddings_list.append(embeddings_dict.get(book_id, np.zeros(BERT_EMBEDDING_DIM)))

    bert_df = pd.DataFrame(np.array(embeddings_list),
                           columns=[f"bert_{i}" for i in range(BERT_EMBEDDING_DIM)],
                           index=df.index)

    print(f"Добавлено {BERT_EMBEDDING_DIM} BERT признаков.")
    return pd.concat([df.reset_index(drop=True), bert_df.reset_index(drop=True)], axis=1)

featured_df = add_bert_features(merged_df, merged_df[merged_df[COL_SOURCE]==VAL_SOURCE_TRAIN], descriptions_df)
print("BERT фичи добавлены")


Загрузка BERT эмбеддингов из: /content/bert_embeddings.pkl
Добавлено 768 BERT признаков.
BERT фичи добавлены


In [16]:
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
processed_path = PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME
featured_df.to_parquet(processed_path, index=False, engine="pyarrow", compression="snappy")
print("Обработанные данные сохранены:", processed_path)

# === Функции для признаков ===
def add_book_features(df, ref_df):
    train_raw = pd.read_csv(RAW_DATA_DIR / TRAIN_FILENAME, usecols=[COL_BOOK_ID, COL_HAS_READ])

    book_popularity = train_raw[train_raw[COL_HAS_READ] == 1].groupby(COL_BOOK_ID).size().reset_index(name='book_popularity_count')
    book_interaction = train_raw.groupby(COL_BOOK_ID).size().reset_index(name='book_interaction_count')

    book_features = book_popularity.merge(book_interaction, on=COL_BOOK_ID, how='outer').fillna(0)
    book_features['book_read_ratio'] = book_features['book_popularity_count'] / book_features['book_interaction_count'].replace(0, np.nan)
    book_features['book_read_ratio'] = book_features['book_read_ratio'].fillna(0)

    df = df.merge(book_features, on=COL_BOOK_ID, how='left').fillna({
        'book_popularity_count': 0,
        'book_interaction_count': 0,
        'book_read_ratio': 0
    })

    return df

def add_author_features(df, ref_df):
    train_raw = pd.read_csv(RAW_DATA_DIR / TRAIN_FILENAME, usecols=[COL_BOOK_ID, COL_HAS_READ, COL_TARGET])
    train_raw = train_raw.merge(ref_df[[COL_BOOK_ID, COL_AUTHOR_ID]].drop_duplicates(), on=COL_BOOK_ID, how='left')

    author_avg = train_raw[(train_raw[COL_HAS_READ] == 1) & train_raw[COL_TARGET].notna()].groupby(COL_AUTHOR_ID)[COL_TARGET].mean().reset_index(name='author_avg_rating')
    author_books = train_raw.groupby(COL_AUTHOR_ID)[COL_BOOK_ID].nunique().reset_index(name='author_books_count')
    author_reads = train_raw[train_raw[COL_HAS_READ] == 1].groupby(COL_AUTHOR_ID).size().reset_index(name='author_reads_count')

    author_features = author_avg.merge(author_books, on=COL_AUTHOR_ID, how='outer')
    author_features = author_features.merge(author_reads, on=COL_AUTHOR_ID, how='outer').fillna(0)

    author_features['author_popularity_score'] = (author_features['author_reads_count'] * author_features['author_avg_rating']) / author_features['author_books_count'].replace(0, np.nan)
    author_features['author_popularity_score'] = author_features['author_popularity_score'].fillna(0)

    df = df.merge(author_features, on=COL_AUTHOR_ID, how='left').fillna({
        'author_avg_rating': 0,
        'author_books_count': 0,
        'author_reads_count': 0,
        'author_popularity_score': 0
    })

    return df

def add_user_bias_features(df, train_df):
    train_read = train_df[train_df[COL_HAS_READ] == 1].copy()

    user_std = train_read.groupby(COL_USER_ID)[COL_RATING].std().reset_index()
    user_std.columns = [COL_USER_ID, 'user_rating_std']
    user_std['user_rating_std'] = user_std['user_rating_std'].fillna(0)

    df = df.merge(user_std, on=COL_USER_ID, how='left')
    df['user_rating_std'] = df['user_rating_std'].fillna(0)

    global_mean_rating = train_read[COL_RATING].mean()

    if F_USER_MEAN_RATING in df.columns:
        df['user_global_bias'] = df[F_USER_MEAN_RATING].fillna(global_mean_rating) - global_mean_rating
    else:
        user_mean = train_read.groupby(COL_USER_ID)[COL_RATING].mean().reset_index()
        user_mean.columns = [COL_USER_ID, 'temp_user_mean']
        df = df.merge(user_mean, on=COL_USER_ID, how='left')
        df['user_global_bias'] = df['temp_user_mean'].fillna(global_mean_rating) - global_mean_rating
        df = df.drop(columns=['temp_user_mean'])

    if F_USER_MEAN_RATING in df.columns and F_BOOK_MEAN_RATING in df.columns:
        df['user_book_rating_diff'] = df[F_USER_MEAN_RATING].fillna(global_mean_rating) - df[F_BOOK_MEAN_RATING].fillna(global_mean_rating)
    else:
        df['user_book_rating_diff'] = 0

    return df

def add_aggregate_features(df, ref_df):
    user_agg = ref_df.groupby(COL_USER_ID)[COL_TARGET].agg(['mean', 'count']).reset_index()
    user_agg.columns = [COL_USER_ID, F_USER_MEAN_RATING, F_USER_RATINGS_COUNT]
    df = df.merge(user_agg, on=COL_USER_ID, how='left')

    book_agg = ref_df.groupby(COL_BOOK_ID)[COL_TARGET].agg(['mean', 'count']).reset_index()
    book_agg.columns = [COL_BOOK_ID, F_BOOK_MEAN_RATING, F_BOOK_RATINGS_COUNT]
    df = df.merge(book_agg, on=COL_BOOK_ID, how='left')

    author_agg = ref_df.groupby(COL_AUTHOR_ID)[COL_TARGET].agg('mean').reset_index()
    author_agg.columns = [COL_AUTHOR_ID, F_AUTHOR_MEAN_RATING]
    df = df.merge(author_agg, on=COL_AUTHOR_ID, how='left')

    genres_count = book_genres_df.groupby(COL_BOOK_ID)[COL_GENRE_ID].nunique().reset_index()
    genres_count.columns = [COL_BOOK_ID, F_BOOK_GENRES_COUNT]
    df = df.merge(genres_count, on=COL_BOOK_ID, how='left')

    return df

def handle_missing_values(df, ref_df):
    for col in CAT_FEATURES:
        if col in df.columns:
            if df[col].dtype.name == 'category':
                df[col] = df[col].astype(object)
            df[col] = df[col].fillna(MISSING_CAT_VALUE).astype(str)

    num_agg_features = [F_USER_MEAN_RATING, F_BOOK_MEAN_RATING, F_AUTHOR_MEAN_RATING,
                         F_USER_RATINGS_COUNT, F_BOOK_RATINGS_COUNT, F_BOOK_GENRES_COUNT,
                         'book_popularity_count', 'book_interaction_count', 'book_read_ratio',
                         'author_avg_rating', 'author_books_count', 'author_reads_count', 'author_popularity_score',
                         'user_rating_std', 'user_global_bias', 'user_book_rating_diff']
    for col in num_agg_features:
        if col in df.columns:
            mean_val = ref_df[col].mean(skipna=True) if col in ref_df.columns else MISSING_NUM_VALUE
            df[col] = df[col].fillna(mean_val)

    other_nums = df.select_dtypes(include=['float', 'int']).columns.difference(
        num_agg_features + [COL_TARGET, COL_PREDICTION, COL_USER_ID, COL_BOOK_ID]
    )
    for col in other_nums:
        df[col] = df[col].fillna(MISSING_NUM_VALUE)

    return df

Обработанные данные сохранены: data/processed/processed_features.parquet


In [17]:
def train_model():
    df = pd.read_parquet(PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME)
    train_set = df[df[COL_SOURCE] == VAL_SOURCE_TRAIN].copy()

    split_date = get_split_date_from_ratio(train_set, 0.8)
    train_mask, val_mask = temporal_split_by_date(train_set, split_date)

    train_split = train_set[train_mask].copy()
    val_split = train_set[val_mask].copy()

    train_split = add_aggregate_features(train_split, train_split)
    val_split = add_aggregate_features(val_split, train_split)

    train_split = add_book_features(train_split, train_split)
    train_split = add_author_features(train_split, train_split)
    train_split = add_user_bias_features(train_split, train_split)

    val_split = add_book_features(val_split, train_split)
    val_split = add_author_features(val_split, train_split)
    val_split = add_user_bias_features(val_split, train_split)

    train_split = handle_missing_values(train_split, train_split)
    val_split = handle_missing_values(val_split, train_split)

    exclude_cols = [COL_SOURCE, COL_TARGET, COL_PREDICTION, COL_TIMESTAMP]
    features = [c for c in train_split.columns if c not in exclude_cols]

    non_feature_object_cols = train_split[features].select_dtypes(include=["object"]).columns.tolist()
    features = [f for f in features if f not in non_feature_object_cols]

    X_train = train_split[features]
    y_train = train_split[COL_TARGET]
    X_val = val_split[features]
    y_val = val_split[COL_TARGET]

    print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}")
    print(f"Features: {len(features)}")

    MODEL_DIR.mkdir(parents=True, exist_ok=True)

    model = lgb.LGBMRegressor(**LGB_PARAMS)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        eval_metric="rmse"
    )

    model.booster_.save_model(str(MODEL_DIR / MODEL_FILENAME))
    joblib.dump(features, MODEL_DIR / "feature_names.pkl")

    print("Модель обучена и сохранена")

    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    score = 1 - 0.5 * (rmse / 10 + mae / 10)
    print(f"Validation RMSE: {rmse:.4f}, MAE: {mae:.4f}, Score: {score:.4f}")

train_model()

Train shape: (124944, 786), Val shape: (31235, 786)
Features: 786
Модель обучена и сохранена
Validation RMSE: 2.8987, MAE: 2.1122, Score: 0.7495


In [21]:
def predict_and_save_submission():
    df = pd.read_parquet(PROCESSED_DATA_DIR / PROCESSED_DATA_FILENAME)
    train_set = df[df[COL_SOURCE] == VAL_SOURCE_TRAIN].copy()
    test_set = df[df[COL_SOURCE] == VAL_SOURCE_TEST].copy()

    train_set = add_aggregate_features(train_set, train_set)
    test_set = add_aggregate_features(test_set, train_set)

    train_set = add_book_features(train_set, train_set)
    train_set = add_author_features(train_set, train_set)
    train_set = add_user_bias_features(train_set, train_set)

    test_set = add_book_features(test_set, train_set)
    test_set = add_author_features(test_set, train_set)
    test_set = add_user_bias_features(test_set, train_set)

    train_set = handle_missing_values(train_set, train_set)
    test_set = handle_missing_values(test_set, train_set)

    features = joblib.load(MODEL_DIR / "feature_names.pkl")

    missing_features = set(features) - set(test_set.columns)
    if missing_features:
        print(f"Отсутствующие признаки: {missing_features}")
        for feat in missing_features:
            test_set[feat] = 0

    X_test = test_set[features]

    model = lgb.Booster(model_file=str(MODEL_DIR / MODEL_FILENAME))

    test_preds = model.predict(X_test)
    clipped_preds = np.clip(test_preds, PREDICTION_MIN_VALUE, PREDICTION_MAX_VALUE)

    submission_df = test_set[[COL_USER_ID, COL_BOOK_ID]].copy()
    submission_df[COL_PREDICTION] = clipped_preds

    submission_path = "submission.csv"
    submission_df.to_csv(submission_path, index=False)
    print(f"Сабмит создан, победа")

predict_and_save_submission()

Сабмит создан, победа
