In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
# Load the feature table into `data`.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook cannot be
# re-run on another machine without editing it — consider a configurable data directory.
data = pd.read_csv(r"C:\Users\asus\Desktop\RecSys\total_features.csv")

In [None]:
# Reorder the rows by ascending `timestamp`; `data` is rebound to the sorted frame.
data = data.sort_values(by='timestamp')

In [None]:
def hybrid_filtration(interaction_matrix_scaled, user_similarity_df, item_similarity_df,  client, num_recommendations):
    """Build user-based and item-based collaborative-filtering candidates for one user.

    Parameters
    ----------
    interaction_matrix_scaled : pandas.DataFrame
        User x item matrix. Rows are looked up by user label, columns are item
        labels, and a cell value greater than 0 marks an item the user has
        already interacted with.
    user_similarity_df : pandas.DataFrame
        User-to-user similarity table; column ``client`` holds the similarity
        of every user to ``client``.
    item_similarity_df : pandas.DataFrame
        Item-to-item similarity table with the same layout, keyed by item label.
    client : hashable
        Label of the user to build recommendations for.
    num_recommendations : int
        Number of neighbours / candidates to consider (see Returns).

    Returns
    -------
    tuple
        ``(user_based, item_based)``, two lists of item labels, none of which
        the user has interacted with:

        * ``user_based`` - the ``num_recommendations`` items with the largest
          summed interaction values among the ``num_recommendations`` most
          similar users, minus already-seen items (so it may be shorter than
          ``num_recommendations``).
        * ``item_based`` - for every seen item its ``3 * num_recommendations``
          most similar items are collected; the ``num_recommendations`` unseen
          items that occur most often are returned.

    Raises
    ------
    KeyError
        If ``client`` (or one of the client's items) is missing from the
        corresponding table.
    """
    client_row = interaction_matrix_scaled.loc[client]
    interacted_items = client_row[client_row > 0].index.tolist()
    # Set for O(1) membership checks; the list is kept for its iteration order.
    seen_items = set(interacted_items)

    # --- User-based CF ---------------------------------------------------
    # The client is removed by label instead of assuming it sorts first:
    # with tied similarities or a zeroed diagonal the client is not at
    # position 0, and positional skipping would drop the best neighbour.
    similar_users = (
        user_similarity_df[client]
        .drop(client, errors="ignore")
        .sort_values(ascending=False)
        .index[:num_recommendations]
    )
    popular_among_neighbours = (
        interaction_matrix_scaled.loc[similar_users]
        .sum()
        .sort_values(ascending=False)
        .index[:num_recommendations]
    )
    # Seen items are filtered after the top-N cut, so fewer than N may remain.
    user_based = [item for item in popular_among_neighbours if item not in seen_items]

    # --- Item-based CF ---------------------------------------------------
    neighbours_per_item = num_recommendations * 3
    votes = Counter()
    for item_id in interacted_items:
        similar_items = (
            item_similarity_df[item_id]
            .drop(item_id, errors="ignore")
            .sort_values(ascending=False)
            .index[:neighbours_per_item]
        )
        # Each seen item "votes" for its unseen neighbours.
        votes.update(candidate for candidate in similar_items if candidate not in seen_items)
    item_based = [item for item, _count in votes.most_common(num_recommendations)]

    return user_based, item_based


In [None]:
from tqdm import tqdm
def get_filter_recs(data, num_recommendations=50):
    """Compute collaborative-filtering candidates for every user in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction log with ``user_id``, ``item_id`` and ``timestamp``
        columns. It is pivoted into a user x item matrix, so each
        (user_id, item_id) pair must occur at most once.
    num_recommendations : int, default 50
        Forwarded to ``hybrid_filtration`` for every user.

    Returns
    -------
    dict
        Maps each user id to the value returned by ``hybrid_filtration``
        (a tuple of the user-based and the item-based recommendation lists).
    """
    # Interaction matrix: the cell holds the interaction timestamp, 0 where the
    # user never interacted with the item. The raw values are used as-is
    # (no scaling is applied before computing similarities).
    interaction_matrix = data.pivot(index='user_id', columns='item_id', values='timestamp').fillna(0)

    # User-based collaborative filtering: cosine similarity between user rows.
    user_similarity = cosine_similarity(interaction_matrix)
    user_similarity_df = pd.DataFrame(user_similarity, index=interaction_matrix.index, columns=interaction_matrix.index)

    # Item-based collaborative filtering: cosine similarity between item columns.
    item_similarity = cosine_similarity(interaction_matrix.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=interaction_matrix.columns, columns=interaction_matrix.columns)

    recommendation_dict = {}
    for user in tqdm(data.user_id.unique()):
        recommendation_dict[user] = hybrid_filtration(
            interaction_matrix,
            user_similarity_df,
            item_similarity_df,
            user,
            num_recommendations=num_recommendations,
        )
    return recommendation_dict
    

Выполняем то же самое, но теперь рекомендации коллаборативной фильтрации строятся по матрице взаимодействий, рассчитанной на всём датасете.

In [None]:
# Build the collaborative-filtering recommendations for every user from the whole of `data`.
recommendation_dict = get_filter_recs(data)

In [None]:
import joblib
# Persist the per-user recommendation dict to 'dict_to_inference.joblib'; the path is relative,
# so the file is written to the current working directory.
joblib.dump(recommendation_dict, 'dict_to_inference.joblib')

То же самое, но теперь исключаем все пары взаимодействий, которые были на всём датасете.

In [None]:
# Distinct users and items, in order of first appearance in `data`.
unique_users, unique_items = (data[column].unique() for column in ('user_id', 'item_id'))

# Cartesian product: one row for every possible (user_id, item_id) combination.
all_pairs = pd.MultiIndex.from_product(
    [unique_users, unique_items],
    names=['user_id', 'item_id'],
).to_frame(index=False)

# Pairs that were actually observed in `data` (the whole dataset, not a test split).
train_pairs = data.loc[:, ['user_id', 'item_id']]

# Anti-join: keep only the candidate pairs that never occur in `data`.
# The merge indicator marks such rows as 'left_only'; the helper column is dropped afterwards.
val = (
    pd.merge(all_pairs, train_pairs, how='left', on=['user_id', 'item_id'], indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

In [None]:
def add_recommendation_columns(data, recommendation_dict):
    """Flag, per row, whether each algorithm recommended the row's item to the row's user.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``user_id`` and ``item_id`` columns.
    recommendation_dict : dict
        Maps a user id to a pair of item collections: position 0 holds the
        first algorithm's recommendations, position 1 the second's.

    Returns
    -------
    tuple[list[int], list[int]]
        ``(recommended_algo1, recommended_algo2)``: two lists aligned with the
        rows of ``data``; an entry is 1 if the item is in the corresponding
        recommendation collection of that user, otherwise 0. Users missing
        from ``recommendation_dict`` get 0 in both lists.
    """
    # Per-user membership sets, built once on first encounter, replace a list
    # scan on every row. Iterating the two columns directly avoids the
    # per-row Series construction of ``iterrows``, so no progress bar is needed.
    membership = {}
    no_recommendations = (frozenset(), frozenset())

    recommended_algo1 = []
    recommended_algo2 = []
    for user, item in zip(data['user_id'], data['item_id']):
        if user not in membership:
            user_recs = recommendation_dict.get(user)
            if user_recs is None:
                membership[user] = no_recommendations
            else:
                membership[user] = (set(user_recs[0]), set(user_recs[1]))
        algo1_items, algo2_items = membership[user]
        recommended_algo1.append(1 if item in algo1_items else 0)
        recommended_algo2.append(1 if item in algo2_items else 0)

    return recommended_algo1, recommended_algo2

In [None]:
# Attach the two 0/1 recommendation flags (one per algorithm) to the candidate pairs in `val`.
val['recommended_algo1'], val['recommended_algo2'] = add_recommendation_columns(val, recommendation_dict)

In [None]:
# Per-user feature columns. They are passed to the GroupBy selection as a *list*:
# selecting several columns with a bare tuple (groupby(...)['a', 'b']) is deprecated
# and raises in pandas 2.x.
user_feature_columns = (
    ['age_group', 'user_rating_count']
    + [f'user_genre{genre}_share' for genre in range(18)]
    + [
        'user_timestamp_q0', 'user_timestamp_q10',
        'user_timestamp_q25', 'user_timestamp_q33', 'user_timestamp_q50',
        'user_timestamp_q67', 'user_timestamp_q75', 'user_timestamp_q90',
        'user_timestamp_q100', 'user_timestamp_range', 'user_timestamp_iqr',
        'gender', 'age',
    ]
)

# One row per user: the mean of every user feature over that user's rows in `data`.
# NOTE(review): presumably these features are constant within a user, so the mean just
# collapses duplicates — confirm; all selected columns must be numeric for .mean().
client_features = data.groupby(by='user_id')[user_feature_columns].mean().reset_index()

# Per-item feature columns (same list-based selection as above).
item_feature_columns = (
    [
        'female_ratio', 'male_ratio',
        'young_ratio', 'adult_ratio', 'senior_ratio',
        'female_ratio_genre', 'male_ratio_genre', 'young_ratio_genre',
        'adult_ratio_genre', 'senior_ratio_genre', 'item_rating_count',
        'avg_rating_time_x', 'rating_time_range_x',
    ]
    + [f'genre_{genre}' for genre in range(18)]
)

# One row per item: the mean of every item feature over that item's rows in `data`.
item_features = data.groupby(by='item_id')[item_feature_columns].mean().reset_index()

In [None]:
# Left join the per-user features onto every candidate pair (matched on user_id).
val_with_client_features = pd.merge(val, client_features, how='left', on='user_id')

# Left join the per-item features onto the result (matched on item_id).
val_final = pd.merge(val_with_client_features, item_features, how='left', on='item_id')

In [None]:
# Store the model's prediction for every candidate pair in the `boosting_forecast` column.
# NOTE(review): neither `model` nor `before_range` is assigned in any cell visible above, so this
# cell raises NameError on a fresh kernel. Presumably `model` must be loaded first and the
# features should come from `val_final` — confirm which columns the model expects.
val_final['boosting_forecast'] = model.predict(before_range)

In [None]:
def get_submission_sample(before_range, top_k=10):
    """Collapse scored (user, item) pairs into one row of top items per user.

    Parameters
    ----------
    before_range : pandas.DataFrame
        Frame with ``user_id``, ``item_id`` and ``boosting_forecast`` columns.
    top_k : int, default 10
        Maximum number of items kept per user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``item_id``; one row per user, where
        ``item_id`` is a space-separated string of that user's item ids with
        the highest ``boosting_forecast``, best first.
    """
    # Rank all rows globally by score; head() then keeps each user's first
    # (i.e. best-scored) top_k rows without disturbing that order.
    ranked = before_range.sort_values(by='boosting_forecast', ascending=False)
    top_items = ranked.groupby('user_id').head(top_k)

    # One row per user: item ids joined into a single space-separated string.
    result = (top_items
            .groupby('user_id')['item_id']
            .apply(lambda item_ids: ' '.join(item_ids.astype(str)))
            .reset_index())

    # Make the output column names explicit.
    result.columns = ['user_id', 'item_id']
    return result

In [None]:
# Build the submission table: one row per user with that user's top-scored items.
res = get_submission_sample(val_final)

In [None]:
# Write the submission file. DataFrame.to_csv takes the field separator as `sep`;
# it has no `delimiter` keyword, so the previous call raised TypeError.
# NOTE(review): the row index is still written as the first column (pandas default);
# pass index=False if the submission format expects only user_id and item_id — confirm.
res.to_csv('submission_sample.csv', sep=',')