In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [2]:
# Work from the competition's input directory so that the relative file names
# used by the data-loading cells below resolve against it.
%cd /kaggle/input/vk-recsys

/kaggle/input/vk-recsys


In [3]:
# Load the interaction log, the user metadata and the item metadata (in that
# order) from parquet files in the current working directory.
train_interactions, users_meta, items_meta = map(
    pd.read_parquet,
    ('train_interactions.parquet', 'users_meta.parquet', 'items_meta.parquet'),
)

In [26]:
# Tabulate how many users carry each gender code, as a two-column frame
# ('gender', 'count').
gender_counts = (
    users_meta['gender']
    .value_counts()
    .rename_axis('gender')
    .reset_index(name='count')
)

print(gender_counts)

   gender   count
0       2  117200
1       1   66204


In [4]:
# The embeddings column is not part of the feature set selected further down,
# so drop it. errors='ignore' keeps the cell re-runnable: executing it again
# after the column is already gone is a no-op instead of a KeyError.
items_meta = items_meta.drop(columns=['embeddings'], errors='ignore')

In [5]:
def calculate_unique_sources(df, item_sources=None):
    """Count the distinct source_ids each user has interacted with.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions with at least ``user_id`` and ``item_id`` columns.
    item_sources : pd.DataFrame, optional
        Item lookup with ``item_id`` and ``source_id`` columns. When omitted,
        the notebook-level ``items_meta`` frame is used (the original
        behaviour).

    Returns
    -------
    pd.DataFrame
        One row per ``user_id`` with a ``unique_source_count`` column. Items
        missing from the lookup contribute nothing to the count.
    """
    if item_sources is None:
        item_sources = items_meta

    # Only the join keys are needed; carrying the other interaction columns
    # through the merge would just copy them.
    user_sources = df[['user_id', 'item_id']].merge(
        item_sources[['item_id', 'source_id']], on='item_id', how='left'
    )
    return (
        user_sources.groupby('user_id')['source_id']
        .nunique()
        .reset_index(name='unique_source_count')
    )

In [6]:
# Start with empty item-level and user-level feature tables; the metrics cell
# further down replaces them with the computed features.
item_metrics_final, user_metrics_final = pd.DataFrame(), pd.DataFrame()

# Define functions for metrics calculations
def calculate_item_metrics(df):
    """Aggregate the interaction log per item.

    Returns one row per ``item_id`` with the summed likes, dislikes, shares
    and bookmarks, plus the mean ``timespent``.
    """
    aggregations = {
        'total_likes': ('like', 'sum'),
        'total_dislikes': ('dislike', 'sum'),
        'total_shares': ('share', 'sum'),
        'total_bookmarks': ('bookmarks', 'sum'),
        'avg_timespent': ('timespent', 'mean'),
    }
    per_item = df.groupby('item_id').agg(**aggregations)
    return per_item.reset_index()

def calculate_gender_metrics(df, users_meta):
    """Count, per item, how many of its likes came from female users.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions with ``user_id``, ``item_id`` and ``like`` columns.
    users_meta : pd.DataFrame
        User lookup with ``user_id`` and ``gender`` columns; gender code 2 is
        treated as female (1 as male).

    Returns
    -------
    pd.DataFrame
        Columns ``item_id`` and ``female_likes``, one row per item that has at
        least one like from a user whose gender is known. Items liked only by
        non-female users are reported with 0.
    """
    female_code = 2

    # Attach the gender to liked interactions only; no other column is needed.
    liked = df.loc[df['like'] == 1, ['user_id', 'item_id']].merge(
        users_meta[['user_id', 'gender']], on='user_id', how='left'
    )
    # Likes from users without metadata carry no gender information.
    liked = liked.dropna(subset=['gender'])

    # Summing the boolean mask per item keeps every liked item in the result
    # (with 0 when no female liked it), even if gender code 2 never occurs.
    # The previous fallback returned a single bogus row for item_id 0 then.
    is_female = liked['gender'] == female_code
    female_likes = is_female.groupby(liked['item_id']).sum()
    return female_likes.rename('female_likes').rename_axis('item_id').reset_index()

def calculate_unique_sources(df, item_sources=None):
    """Count the distinct source_ids each user has interacted with.

    NOTE(review): a function with this name is also defined in an earlier
    cell; this later definition is the one in effect once both cells have
    run. Consider keeping a single copy.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions with at least ``user_id`` and ``item_id`` columns.
    item_sources : pd.DataFrame, optional
        Item lookup with ``item_id`` and ``source_id`` columns. When omitted,
        the notebook-level ``items_meta`` frame is used (the original
        behaviour).

    Returns
    -------
    pd.DataFrame
        One row per ``user_id`` with a ``unique_source_count`` column. Items
        missing from the lookup contribute nothing to the count.
    """
    if item_sources is None:
        item_sources = items_meta

    # Only the join keys are needed; carrying the other interaction columns
    # through the merge would just copy them.
    user_sources = df[['user_id', 'item_id']].merge(
        item_sources[['item_id', 'source_id']], on='item_id', how='left'
    )
    return (
        user_sources.groupby('user_id')['source_id']
        .nunique()
        .reset_index(name='unique_source_count')
    )


In [7]:
import pandas as pd

def create_source_matrix(train_interactions, users_meta, items_meta):
    """Build per-source engagement statistics over all of a source's items.

    Parameters
    ----------
    train_interactions : pd.DataFrame
        Interactions with ``user_id``, ``item_id``, ``like`` and ``dislike``.
    users_meta : pd.DataFrame
        User lookup with ``user_id`` and ``gender`` columns.
    items_meta : pd.DataFrame
        Item lookup with ``item_id`` and ``source_id`` columns.

    Returns
    -------
    pd.DataFrame
        One row per ``source_id`` with, in this order,
        ``total_likes_all_video``, ``total_dislikes_all_video``,
        ``total_encounters_all_video`` and ``gender_ratio``.
        ``gender_ratio`` is the mean gender code of the users who liked the
        source's items, or 0 when no user of known gender liked them.
        Interactions whose item has no known source are left out.
    """
    # Keep only the columns the statistics need before joining; the other
    # metadata would merely be copied through both merges.
    frame = (
        train_interactions[['user_id', 'item_id', 'like', 'dislike']]
        .merge(users_meta[['user_id', 'gender']], on='user_id', how='left')
        .merge(items_meta[['item_id', 'source_id']], on='item_id', how='left')
    )

    liked = frame['like'] == 1
    frame = frame.assign(
        _liked=liked.astype('int64'),
        _disliked=(frame['dislike'] == 1).astype('int64'),
        # Gender on liked rows only, NaN elsewhere, so the per-source mean
        # skips non-likes and users of unknown gender. Dividing the gender
        # sum by the raw like count (as before) biased the ratio whenever a
        # liker had no gender.
        _liker_gender=frame['gender'].where(liked),
    )

    # One pass over the data instead of four groupbys and three outer merges.
    result_matrix = (
        frame.groupby('source_id')
        .agg(
            total_likes_all_video=('_liked', 'sum'),
            total_dislikes_all_video=('_disliked', 'sum'),
            total_encounters_all_video=('item_id', 'count'),
            gender_ratio=('_liker_gender', 'mean'),
        )
        .reset_index()
    )
    # Sources without any like of known gender have no ratio; report 0.
    result_matrix['gender_ratio'] = result_matrix['gender_ratio'].fillna(0)
    return result_matrix



In [8]:
# Build the per-source statistics table. It is joined onto the per-item
# metrics by source_id in a later cell.
result = create_source_matrix(train_interactions, users_meta, items_meta)


# Assuming item_metrics, result, and items_meta are already defined DataFrames.

def merge_item_metrics_with_result(item_metrics, result, items_meta):
    """Attach per-source statistics to per-item metrics.

    Every item is first mapped to its ``source_id`` through ``items_meta``;
    the rows of ``result`` are then left-joined on that ``source_id``.
    Items without a matching source keep NaN in the joined columns.
    """
    item_to_source = items_meta[['item_id', 'source_id']]
    return (
        item_metrics
        .merge(item_to_source, on='item_id', how='left')
        .merge(result, on='source_id', how='left')
    )


In [9]:
# Per-item features: interaction totals, female like counts and the
# statistics of the item's source.
item_metrics = calculate_item_metrics(train_interactions)
female_likes = calculate_gender_metrics(train_interactions, users_meta)

item_metrics = (
    item_metrics
    .merge(female_likes, on='item_id', how='left')
    .merge(items_meta[['item_id', 'source_id']], on='item_id', how='left')
    .merge(result, on='source_id', how='left')
    .drop(columns=['source_id'])  # only needed as the join key
)

# Tag every feature column with the "_all" suffix; item_id is left untouched
# because it is the join key used later.
item_metrics.columns = [col if col == 'item_id' else f'{col}_all' for col in item_metrics.columns]

# Assign instead of "merge unless empty": the features are computed once, and
# merging into a non-empty previous value on a re-run would join the table
# onto itself and produce duplicated *_x / *_y columns.
item_metrics_final = item_metrics

# Per-user feature: number of distinct sources the user interacted with.
user_unique_sources = calculate_unique_sources(train_interactions)
user_unique_sources.columns = ['user_id', 'unique_source_count_all']

user_metrics_final = user_unique_sources


In [10]:
# Attach the engineered features to the metadata tables. fillna(0) is applied
# to the whole frame, so rows without a feature match — and any other missing
# value already present in the metadata — end up as 0.
items_meta = items_meta.merge(item_metrics_final, on='item_id', how='left').fillna(0)
users_meta = users_meta.merge(user_metrics_final, on='user_id', how='left').fillna(0)

In [13]:
# Inspect the item table after the engineered feature columns were attached.
items_meta

Unnamed: 0,item_id,source_id,duration,total_likes_all,total_dislikes_all,total_shares_all,total_bookmarks_all,avg_timespent_all,female_likes_all,total_likes_all_video_all,total_dislikes_all_video_all,total_encounters_all_video_all,gender_ratio_all
0,0,1869,13,3,0,5,0,7.019608,3.0,113.0,1.0,3157,1.938053
1,1,1869,8,3,0,3,0,5.302326,3.0,113.0,1.0,3157,1.938053
2,2,1869,10,5,0,4,0,5.968254,4.0,113.0,1.0,3157,1.938053
3,3,1869,49,7,0,8,2,19.670103,7.0,113.0,1.0,3157,1.938053
4,4,1869,8,16,0,19,0,7.062500,16.0,113.0,1.0,3157,1.938053
...,...,...,...,...,...,...,...,...,...,...,...,...,...
337722,337722,14899,28,0,0,0,0,14.600000,0.0,128.0,8.0,7446,1.984375
337723,337723,14899,33,0,0,0,0,13.888889,0.0,128.0,8.0,7446,1.984375
337724,337724,10206,18,1,0,0,0,13.069767,1.0,1.0,0.0,43,2.000000
337725,337725,19530,15,0,0,0,0,11.636364,0.0,0.0,0.0,22,0.000000


In [11]:
# Load the (user_id, item_id) pairs to be scored and collect the distinct ids
# appearing on each side.
test_pairs = pd.read_csv('test_pairs.csv')
users, items = (test_pairs[key].unique() for key in ('user_id', 'item_id'))

In [12]:
# Cast both reaction flags to a signed integer type before subtracting them.
for reaction in ('like', 'dislike'):
    train_interactions[reaction] = train_interactions[reaction].astype('int32')

# NOTE(review): abs() maps a dislike (0 - 1 = -1) to 1, so the label is 1 for
# a like OR a dislike and 0 otherwise — confirm that this is intended.
train_interactions['label'] = (train_interactions['like'] - train_interactions['dislike']).abs()

# Label-based slice: keep only rows whose index label is >= 10_000_000.
# NOTE(review): the item/user features above were computed on all rows,
# including the ones kept here for training — check for target leakage.
train_interactions = train_interactions.loc[10_000_000:]

In [13]:
# Join the user features, then the item features, onto every interaction row.
train_interactions = (
    train_interactions
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
)

In [14]:
# List the columns available after joining the user and item features.
train_interactions.columns

Index(['user_id', 'item_id', 'timespent', 'like', 'dislike', 'share',
       'bookmarks', 'label', 'gender', 'age', 'unique_source_count_all',
       'source_id', 'duration', 'total_likes_all', 'total_dislikes_all',
       'total_shares_all', 'total_bookmarks_all', 'avg_timespent_all',
       'female_likes_all', 'total_likes_all_video_all',
       'total_dislikes_all_video_all', 'total_encounters_all_video_all',
       'gender_ratio_all'],
      dtype='object')

In [15]:
# Keep only the model inputs, followed by the target column ('label').
train_interactions = train_interactions.loc[:, [
    'gender',
    'age',
    'unique_source_count_all',
    'source_id',
    'duration',
    'total_likes_all',
    'total_dislikes_all',
    'total_shares_all',
    'total_bookmarks_all',
    'avg_timespent_all',
    'female_likes_all',
    'total_likes_all_video_all',
    'total_dislikes_all_video_all',
    'total_encounters_all_video_all',
    'gender_ratio_all',
    'label',
]]

In [15]:
# Confirm the column selection.
# NOTE(review): the recorded output below lists columns (e.g.
# 'total_likes_x_all', 'male_likes_x_all') that the selection in the previous
# cell does not produce, so it appears to come from an earlier version of the
# notebook — re-execute to refresh it.
train_interactions.columns

Index(['gender', 'age', 'unique_source_count_all', 'source_id', 'duration',
       'total_likes_x_all', 'total_dislikes_all', 'total_shares_all',
       'total_bookmarks_all', 'avg_timespent_all', 'male_likes_x_all',
       'female_likes_x_all', 'male_dislikes_all', 'female_dislikes_all',
       'item_encounter_count_all', 'avg_likes_per_gender_all', 'label'],
      dtype='object')

In [16]:
# Number of rows in the training frame.
len(train_interactions)

140667282

In [None]:
params = {
    # NOTE(review): 'exact' is XGBoost's exhaustive greedy split search; with
    # a training frame of the size reported above it may be very slow —
    # consider 'hist'. Left unchanged here to keep the model definition as is.
    'tree_method': 'exact',
    'objective': 'binary:logistic'
}
num_boost_round = 20

# scale_pos_weight = (#negatives / #positives). Counting on the label column
# gives the same numbers as filtering the frame, without materialising two
# filtered copies of the whole training table just to take their length.
n_negative = int((train_interactions['label'] == 0).sum())
n_positive = int((train_interactions['label'] == 1).sum())
class_weight = n_negative / n_positive

clf = xgb.XGBClassifier(n_estimators=num_boost_round, scale_pos_weight=class_weight, **params)
clf.fit(train_interactions.drop(columns=['label']), train_interactions['label'])

In [21]:
# Re-read the pairs to score. The file sits directly in the working directory
# set by %cd at the top (an earlier cell already reads it as
# 'test_pairs.csv'); the former 'data/' prefix does not resolve from there.
test_pairs = pd.read_csv('test_pairs.csv')

# Same joins as for the training frame: user features, then item features.
test_pairs = (
    test_pairs
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
)

# Same feature columns, in the same order, as the training frame (minus the
# 'label' target).
test_pairs = test_pairs[['gender', 'age', 'unique_source_count_all',
       'source_id', 'duration', 'total_likes_all', 'total_dislikes_all',
       'total_shares_all', 'total_bookmarks_all', 'avg_timespent_all',
       'female_likes_all', 'total_likes_all_video_all',
       'total_dislikes_all_video_all', 'total_encounters_all_video_all',
       'gender_ratio_all']]

In [22]:
# Peek at the test feature frame.
# NOTE(review): the recorded output below shows columns (e.g.
# 'male_likes_x_all', 'item_encounter_count_all') that the selection in the
# previous cell does not produce, so it appears to be stale — re-execute.
test_pairs.head()

Unnamed: 0,gender,age,unique_source_count_all,source_id,duration,total_likes_x_all,total_dislikes_all,total_shares_all,total_bookmarks_all,avg_timespent_all,male_likes_x_all,female_likes_x_all,male_dislikes_all,female_dislikes_all,item_encounter_count_all,avg_likes_per_gender_all
0,2,35,145,5119,9,2308,1,614,35,7.494987,558.0,1750.0,0.0,1.0,15659.0,1154.0
1,2,35,145,2720,5,1110,1,144,53,5.802824,180.0,930.0,0.0,1.0,6091.0,555.0
2,2,35,145,11601,32,379,1,101,10,21.974728,2.0,377.0,0.0,1.0,3126.0,189.5
3,2,35,145,4296,10,1532,2,463,8,8.33773,139.0,1393.0,1.0,1.0,14556.0,766.0
4,2,35,145,18812,38,134,2,52,4,26.483355,2.0,132.0,0.0,2.0,5317.0,67.0


In [23]:
# Probability of the positive class: column 1 of predict_proba, taken with a
# single slice instead of a Python loop over every row.
result = clf.predict_proba(test_pairs)[:, 1]

# test_pairs was reduced to feature columns above, so reload the id columns
# for the submission. The file sits directly in the working directory set by
# %cd; the former 'data/' prefix does not resolve from there.
test_pairs = pd.read_csv('test_pairs.csv')
test_pairs['predict'] = result

# The working directory is inside /kaggle/input, which Kaggle mounts
# read-only, so write the submission to the writable output directory.
# NOTE(review): the file name says "lightgbm" but the model is XGBoost; the
# name is kept so that anything expecting it keeps working.
test_pairs.to_csv('/kaggle/working/sub_lightgbm.csv', index=False)