In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [3]:
# Make the competition data folder the working directory; the relative file
# names passed to pd.read_parquet / pd.read_csv in later cells resolve against it.
%cd /kaggle/input/vk-recsys

/kaggle/input/vk-recsys


In [4]:
# Interaction log, ordered by user_id and re-indexed 0..n-1 so that later
# positional-style slicing on the index is meaningful.
train_interactions = (
    pd.read_parquet('train_interactions.parquet')
    .sort_values('user_id')
    .reset_index(drop=True)
)

# Side tables keyed by item_id and user_id respectively.
items_meta = pd.read_parquet('items_meta.parquet')
users_meta = pd.read_parquet('users_meta.parquet')

In [5]:
# Discard the embedding column; none of the feature code in this notebook reads it.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
items_meta = items_meta.drop(columns=['embeddings'], errors='ignore')

In [6]:
def calculate_unique_sources(df):
    """Count the distinct source_id values each user_id interacted with.

    The item -> source mapping is taken from the module-level ``items_meta``
    frame. Returns a DataFrame with columns ``user_id`` and
    ``unique_source_count``.

    NOTE: a function with this same name is defined again in a later cell;
    the later definition is the one in effect when the notebook calls it.
    """
    item_to_source = items_meta[['item_id', 'source_id']]
    with_sources = df.merge(item_to_source, on='item_id', how='left')
    per_user = with_sources.groupby('user_id')['source_id'].nunique()
    return per_user.reset_index(name='unique_source_count')

In [7]:
import pandas as pd



# Empty placeholders for the aggregated item-level and user-level feature tables
item_metrics_final, user_metrics_final = pd.DataFrame(), pd.DataFrame()

# Define functions for metrics calculations
def calculate_item_metrics(df):
    """Aggregate interaction signals per item_id.

    Returns one row per item with the summed ``like``, ``dislike``, ``share``
    and ``bookmarks`` columns and the mean of ``timespent``; ``item_id`` is a
    regular column of the result.
    """
    aggregations = {
        'total_likes': pd.NamedAgg(column='like', aggfunc='sum'),
        'total_dislikes': pd.NamedAgg(column='dislike', aggfunc='sum'),
        'total_shares': pd.NamedAgg(column='share', aggfunc='sum'),
        'total_bookmarks': pd.NamedAgg(column='bookmarks', aggfunc='sum'),
        'avg_timespent': pd.NamedAgg(column='timespent', aggfunc='mean'),
    }
    per_item = df.groupby('item_id').agg(**aggregations)
    return per_item.reset_index()

def calculate_gender_metrics(df, users_meta):
    """Count likes and dislikes per item, split by user gender.

    Gender codes follow the convention used throughout this notebook:
    1 = male, 2 = female.

    Parameters
    ----------
    df : DataFrame
        Interactions with ``user_id``, ``item_id``, ``like`` and ``dislike``.
    users_meta : DataFrame
        User table with ``user_id`` and ``gender``.

    Returns
    -------
    tuple of four DataFrames
        ``(male_likes, female_likes, male_dislikes, female_dislikes)``, each
        with an ``item_id`` column and one count column named like the frame.
        The like frames contain every item that received at least one like
        from a user with a known gender (dislike frames analogously); a gender
        with no such events gets a count of 0 for those items.
    """
    # Only the gender column is needed from the user table
    extended_df = df.merge(users_meta[['user_id', 'gender']], on='user_id', how='left')

    def _counts_by_gender(flag_column):
        """Item x gender table of how many rows have ``flag_column == 1``."""
        flagged = extended_df[extended_df[flag_column] == 1]
        per_item = flagged.groupby(['item_id', 'gender']).size().unstack(fill_value=0)
        # Guarantee both gender columns exist. The previous fallback
        # (pd.Series(0)) produced a bogus single row for item_id == 0 whenever
        # one gender had no events at all.
        return per_item.reindex(columns=[1, 2], fill_value=0)

    gender_likes = _counts_by_gender('like')
    gender_dislikes = _counts_by_gender('dislike')

    male_likes = gender_likes[1].reset_index(name='male_likes')
    female_likes = gender_likes[2].reset_index(name='female_likes')
    male_dislikes = gender_dislikes[1].reset_index(name='male_dislikes')
    female_dislikes = gender_dislikes[2].reset_index(name='female_dislikes')

    return male_likes, female_likes, male_dislikes, female_dislikes

def calculate_unique_sources(df, items=None):
    """Count the distinct source_id values each user_id interacted with.

    Parameters
    ----------
    df : DataFrame
        Interactions with at least ``user_id`` and ``item_id``.
    items : DataFrame, optional
        Item table with ``item_id`` and ``source_id``. When omitted, the
        module-level ``items_meta`` frame is used, which is what existing
        one-argument calls rely on.

    Returns
    -------
    DataFrame
        Columns ``user_id`` and ``unique_source_count``. Items without a
        matching source do not contribute to the count (``nunique`` skips NaN).
    """
    if items is None:
        items = items_meta
    user_sources = df.merge(items[['item_id', 'source_id']], on='item_id', how='left')
    unique_source_count = (
        user_sources.groupby('user_id')['source_id']
        .nunique()
        .reset_index(name='unique_source_count')
    )
    return unique_source_count


In [8]:
import pandas as pd

def create_source_matrix(train_interactions, users_meta, items_meta):
    extended_df = train_interactions.merge(users_meta, on='user_id', how='left')

    # Step 2: Merge with items_meta to get source_id
    extended_df = extended_df.merge(items_meta[['item_id', 'source_id']], on='item_id', how='left')

    # Step 3: Calculate total likes and encounters by source_id
    likes_count = extended_df[extended_df['like'] == 1] \
        .groupby('source_id')['item_id'] \
        .count() \
        .reset_index(name='total_likes_all_video')
    dislikes_count = extended_df[extended_df['dislike'] == 1] \
        .groupby('source_id')['item_id'] \
        .count() \
        .reset_index(name='total_dislikes_all_video')

    encounters_count = extended_df \
        .groupby('source_id')['item_id'] \
        .count() \
        .reset_index(name='total_encounters_all_video')

    # Step 4: Calculate gender sums who liked each source's items
    gender_sum = extended_df[extended_df['like'] == 1] \
        .groupby('source_id')['gender'] \
        .sum() \
        .reset_index(name='liked_gender_sum_all_video')

    # Step 5: Merge all metrics into a final DataFrame
    result_matrix = likes_count \
        .merge(dislikes_count, on='source_id', how='outer') \
        .merge(encounters_count, on='source_id', how='outer') \
        .merge(gender_sum, on='source_id', how='outer')

    # Step 6: Calculate the gender ratio
    result_matrix['gender_ratio'] = result_matrix['liked_gender_sum_all_video'].fillna(0) / result_matrix['total_likes_all_video'].replace(0, pd.NA)

    # Fill NaN values for likes and encounters
    result_matrix.fillna({'total_likes_all_video': 0, 'total_encounters_all_video': 0, 'liked_gender_sum_all_video': 0}, inplace=True)
    result_matrix=result_matrix.drop(columns=['liked_gender_sum_all_video'])
    return result_matrix

# Example of calling the function (it expects already-loaded DataFrames, not file paths):
# result = create_source_matrix(train_interactions, users_meta, items_meta)


In [9]:
# Build the per-source statistics table; any value still missing afterwards
# (e.g. the ratio of a source without likes) is replaced by 0.
result = create_source_matrix(train_interactions, users_meta, items_meta).fillna(0)
# Show the resulting table
print(result)


       source_id  total_likes_all_video  total_dislikes_all_video  \
0              0                    9.0                       3.0   
1              1                    7.0                       0.0   
2              2                    1.0                       0.0   
3              3                   14.0                       0.0   
4              4                   21.0                       0.0   
...          ...                    ...                       ...   
19608      19608                    6.0                       0.0   
19609      19609                    7.0                       0.0   
19610      19610                  200.0                       6.0   
19611      19611                 1337.0                      18.0   
19612      19612                  217.0                       0.0   

       total_encounters_all_video  gender_ratio  
0                            2063      1.333333  
1                             178      1.000000  
2                    

In [9]:
import pandas as pd

# Assuming item_metrics, result, and items_meta are already defined DataFrames.

def merge_item_metrics_with_result(item_metrics, result, items_meta):
    """Attach per-source statistics to per-item metrics.

    ``item_metrics`` is first left-joined with the item -> source mapping from
    ``items_meta`` (on ``item_id``), then left-joined with ``result`` on
    ``source_id``. Rows of ``item_metrics`` are never dropped; unmatched items
    get missing values in the added columns.
    """
    item_sources = items_meta[['item_id', 'source_id']]
    return (
        item_metrics
        .merge(item_sources, on='item_id', how='left')
        .merge(result, on='source_id', how='left')
    )

# Example usage:
# item_metrics = pd.DataFrame(...)  # Your item metrics DataFrame
# result = pd.DataFrame(...)         # Your result DataFrame
# items_meta = pd.DataFrame(...)     # Your items_meta DataFrame

# final_merged_result = merge_item_metrics_with_result(item_metrics, result, items_meta)


In [10]:
# Item-level features computed over the full interaction log
item_metrics = calculate_item_metrics(train_interactions)
male_likes, female_likes, male_dislikes, female_dislikes = calculate_gender_metrics(train_interactions, users_meta)

# Attach the four per-gender count tables (items absent from a table get NaN)
for gender_frame in (male_likes, female_likes, male_dislikes, female_dislikes):
    item_metrics = item_metrics.merge(gender_frame, on='item_id', how='left')

# Attach the per-source statistics via the item's source_id; the key itself is
# dropped again because items_meta already carries a source_id column.
item_metrics = (
    item_metrics
    .merge(items_meta[['item_id', 'source_id']], on='item_id', how='left')
    .merge(result, on='source_id', how='left')
    .drop(columns=['source_id'])
)

# Suffix every metric column with "_all" (statistics over all interactions)
item_metrics.columns = [f'{col}_all' if col != 'item_id' else 'item_id' for col in item_metrics.columns]

# Publish the table directly. The former "merge into the existing table unless
# it is empty" logic merged the table with itself whenever this cell was run a
# second time, producing duplicated *_x / *_y columns.
item_metrics_final = item_metrics

# User-level feature: number of distinct sources each user interacted with
user_unique_sources = calculate_unique_sources(train_interactions)
user_unique_sources.columns = ['user_id', 'unique_source_count_all']

user_metrics_final = user_unique_sources


In [11]:
# Attach the aggregated features to the meta tables. fillna(0) is applied to
# the whole frame, so pre-existing gaps in the meta columns become 0 as well.
items_meta = items_meta.merge(item_metrics_final, on='item_id', how='left').fillna(0)
users_meta = users_meta.merge(user_metrics_final, on='user_id', how='left').fillna(0)

In [12]:
# (user_id, item_id) pairs to be scored, plus the distinct ids on each side
test_pairs = pd.read_csv('test_pairs.csv')
users, items = (test_pairs[key].unique() for key in ('user_id', 'item_id'))

In [13]:
# Cast both reaction flags to a signed integer type before subtracting them
train_interactions = train_interactions.astype({'like': 'int32', 'dislike': 'int32'})

# NOTE(review): abs() makes the label 1 for a like-only row AND for a
# dislike-only row, i.e. "any explicit reaction" - confirm this is the intended target.
reaction = train_interactions['like'] - train_interactions['dislike']
train_interactions['label'] = reaction.abs()

#train_interactions = train_interactions[train_interactions['user_id'].isin(users)][train_interactions['item_id'].isin(items)]
# Keep only the rows whose index label is >= 5_000_000 (label-based slice)
train_interactions = train_interactions.loc[5_000_000:]

In [None]:
# Enrich every interaction with its user's and its item's attributes
train_interactions = (
    train_interactions
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
)

In [None]:
# Display the column names of the training frame (used to pick the model features)
train_interactions.columns

In [16]:
# Model inputs. The previous list referenced columns that no cell of this
# notebook produces ('*_x_all' / '*_y_all' merge leftovers,
# 'item_encounter_count_all', 'avg_likes_per_gender_all') and therefore raised
# KeyError on a fresh run. It also did not match the feature set selected for
# test_pairs at prediction time. This list is exactly that test-time feature set.
FEATURE_COLUMNS = [
    'gender', 'age', 'unique_source_count_all', 'source_id', 'duration',
    'total_likes_all', 'total_dislikes_all', 'total_shares_all',
    'total_bookmarks_all', 'avg_timespent_all',
    'male_likes_all', 'female_likes_all', 'male_dislikes_all',
    'female_dislikes_all',
]
train_interactions = train_interactions[FEATURE_COLUMNS + ['label']]

In [None]:
# Number of boosted trees
num_boost_round = 5
params = {
    'tree_method': 'exact',
    'objective': 'binary:logistic',
}

# scale_pos_weight up-weights the positive class.
# NOTE(review): 11.839 is presumably the negative/positive ratio of the labels - confirm.
clf = xgb.XGBClassifier(
    n_estimators=num_boost_round,
    scale_pos_weight=11.839,
    **params,
)
# Every column except 'label' is a feature; 'label' is the target
clf.fit(
    train_interactions.drop(columns=['label']),
    train_interactions['label'],
    verbose=10,
)

In [None]:
# The working directory is already the dataset folder (see the %cd cell), and
# the first read of this file used the bare name. The former 'data/' prefix
# pointed at a sub-directory that the rest of the notebook never uses.
test_pairs = pd.read_csv('test_pairs.csv')

# Same enrichment as for the training rows
test_pairs = (
    test_pairs
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
)

# Feature columns handed to the model, in this order
test_pairs = test_pairs[['gender', 'age', 'unique_source_count_all', 'source_id', 'duration', 'total_likes_all', 'total_dislikes_all',
                            'total_shares_all', 'total_bookmarks_all', 'avg_timespent_all',
                            'male_likes_all', 'female_likes_all', 'male_dislikes_all',
                            'female_dislikes_all']]

In [None]:
# Probability of the positive class (second column of predict_proba) per pair.
# NOTE: this rebinds `result`, which earlier cells use for the per-source
# statistics table - re-run those cells before re-running the feature cells.
result = [row[1] for row in clf.predict_proba(test_pairs)]

# Reload the raw pairs (test_pairs currently holds the feature matrix). The
# path is relative to the dataset folder, matching the first read of this file.
test_pairs = pd.read_csv('test_pairs.csv')
test_pairs['predict'] = result

# The working directory is under /kaggle/input, which Kaggle mounts read-only,
# so the submission is written to the writable output directory instead.
test_pairs.to_csv('/kaggle/working/sub_lightgbm.csv', index=False)