In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [None]:
# Switch the kernel's working directory so that the relative 'data/...' paths
# used by the following cells resolve.
# NOTE(review): this absolute path is machine-specific; adjust it (or make it
# configurable) before running the notebook on another machine.
%cd /Users/vladimirkalajcidi/vk_recsys

In [None]:
# Read the three parquet inputs (interactions, user metadata, item metadata)
# in one pass; the files are read in the order listed.
train_interactions, users_meta, items_meta = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/train_interactions.parquet',
        'data/users_meta.parquet',
        'data/items_meta.parquet',
    )
)

In [None]:
def calculate_unique_sources(df):
    """Count the distinct sources each user has interacted with.

    Every row of ``df`` is matched to its item's ``source_id`` through the
    notebook-level ``items_meta`` table (left join on ``item_id``); the
    distinct ``source_id`` values are then counted per ``user_id``.

    Returns a DataFrame with the columns ``user_id`` and
    ``unique_source_count``.
    """
    item_to_source = items_meta[['item_id', 'source_id']]
    with_source = df.merge(item_to_source, on='item_id', how='left')
    sources_per_user = with_source.groupby('user_id')['source_id'].nunique()
    return sources_per_user.reset_index(name='unique_source_count')

In [None]:
import pandas as pd


# Window sizes, expressed as a number of trailing rows of train_interactions.
row_counts = [millions * 1_000_000 for millions in (5, 10, 50)]

# Empty accumulators; the per-window item and user metrics are merged into
# these two frames by the cells below.
item_metrics_final, user_metrics_final = pd.DataFrame(), pd.DataFrame()

# Define functions for metrics calculations
def calculate_item_metrics(df):
    """Aggregate interaction feedback per item.

    Groups ``df`` by ``item_id`` and returns one row per item with the summed
    ``like``, ``dislike``, ``share`` and ``bookmarks`` columns and the mean of
    ``timespent``. ``item_id`` is returned as a regular column.
    """
    aggregations = {
        'total_likes': ('like', 'sum'),
        'total_dislikes': ('dislike', 'sum'),
        'total_shares': ('share', 'sum'),
        'total_bookmarks': ('bookmarks', 'sum'),
        'avg_timespent': ('timespent', 'mean'),
    }
    per_item = df.groupby('item_id').agg(**aggregations)
    return per_item.reset_index()

def calculate_gender_metrics(df, users_meta):
    """Count likes and dislikes per item, split by the gender of the user.

    Args:
        df: interactions with ``user_id``, ``item_id``, ``like`` and
            ``dislike`` columns.
        users_meta: user table with ``user_id`` and ``gender`` columns.
            Gender code 1 is reported as male and code 2 as female.

    Returns:
        Tuple ``(male_likes, female_likes, male_dislikes, female_dislikes)``.
        Each element is a DataFrame with an ``item_id`` column and a single
        count column named like the element. The like frames list every item
        that received at least one like from a user with a known gender; the
        dislike frames do the same for dislikes.
    """
    # Attach each user's gender to their interactions.
    extended_df = df.merge(users_meta, on='user_id', how='left')

    def _counts_by_gender(flag_column):
        """Return an item_id x gender table counting rows whose flag equals 1."""
        flagged = extended_df[extended_df[flag_column] == 1]
        counts = flagged.groupby(['item_id', 'gender']).size().unstack(fill_value=0)
        # Make sure both gender columns exist. A gender that is absent from
        # the sample then yields zero counts for the real items instead of a
        # placeholder row carrying a made-up item_id of 0.
        counts = counts.reindex(columns=[1, 2], fill_value=0)
        return counts.rename_axis(index='item_id')

    def _as_frame(counts, gender_code, column_name):
        """Turn one gender column of the table into an (item_id, count) frame."""
        return counts[gender_code].rename(column_name).reset_index()

    gender_likes = _counts_by_gender('like')
    gender_dislikes = _counts_by_gender('dislike')

    male_likes = _as_frame(gender_likes, 1, 'male_likes')
    female_likes = _as_frame(gender_likes, 2, 'female_likes')
    male_dislikes = _as_frame(gender_dislikes, 1, 'male_dislikes')
    female_dislikes = _as_frame(gender_dislikes, 2, 'female_dislikes')

    return male_likes, female_likes, male_dislikes, female_dislikes

def calculate_unique_sources(df, item_sources=None):
    """Count the distinct ``source_id`` values each user interacted with.

    This definition replaces the same-named one-argument function from the
    earlier cell; calling it with a single argument behaves identically.

    Args:
        df: interactions with ``user_id`` and ``item_id`` columns.
        item_sources: optional table with ``item_id`` and ``source_id``
            columns. When omitted, the notebook-level ``items_meta`` is used.

    Returns:
        DataFrame with the columns ``user_id`` and ``unique_source_count``.
        Interactions whose item has no ``source_id`` in the lookup table do
        not add to the count (``nunique`` skips missing values).
    """
    if item_sources is None:
        # Backward-compatible default: fall back to the global item table.
        item_sources = items_meta
    user_sources = df.merge(item_sources[['item_id', 'source_id']], on='item_id', how='left')
    unique_source_count = (
        user_sources.groupby('user_id')['source_id']
        .nunique()
        .reset_index(name='unique_source_count')
    )
    return unique_source_count


In [None]:

# Build windowed features: for every size in `row_counts`, aggregate the last
# `count` rows of train_interactions into per-item and per-user metrics and
# collect them, suffixed with the window size, in the two *_final frames.

# Start from empty accumulators so that re-running this cell rebuilds the
# tables instead of merging a second copy of the same columns into frames left
# over from a previous run (pandas would rename the clashing columns with
# _x/_y suffixes).
item_metrics_final = pd.DataFrame()
user_metrics_final = pd.DataFrame()

for count in row_counts:
    # tail() returns every row once count reaches the frame length, so use the
    # frame itself in that case.
    if count >= len(train_interactions):
        df_sample = train_interactions
    else:
        df_sample = train_interactions.tail(count)

    # Per-item totals plus like/dislike counts split by user gender.
    item_metrics = calculate_item_metrics(df_sample)
    male_likes, female_likes, male_dislikes, female_dislikes = calculate_gender_metrics(df_sample, users_meta)
    for gender_metric in (male_likes, female_likes, male_dislikes, female_dislikes):
        item_metrics = item_metrics.merge(gender_metric, on='item_id', how='left')

    # Add suffix to distinguish metrics by row count
    suffix = f'_{count}'
    item_metrics.columns = [f'{col}{suffix}' if col != 'item_id' else 'item_id' for col in item_metrics.columns]

    # The empty accumulator has no 'item_id' column to merge on, so the first
    # window is assigned rather than merged.
    if item_metrics_final.empty:
        item_metrics_final = item_metrics
    else:
        item_metrics_final = item_metrics_final.merge(item_metrics, on='item_id', how='outer')

    # Number of distinct sources per user within the same window.
    user_unique_sources = calculate_unique_sources(df_sample)
    user_unique_sources.columns = ['user_id', f'unique_source_count{suffix}']

    if user_metrics_final.empty:
        user_metrics_final = user_unique_sources
    else:
        user_metrics_final = user_metrics_final.merge(user_unique_sources, on='user_id', how='outer')



In [None]:
# Compute the item and user metrics over the complete train_interactions frame
# and add them, suffixed with `_all`, to the accumulated metric tables.
item_metrics = calculate_item_metrics(train_interactions)
male_likes, female_likes, male_dislikes, female_dislikes = calculate_gender_metrics(train_interactions, users_meta)

# Attach the per-gender like/dislike counts to the per-item totals.
for gender_metric in (male_likes, female_likes, male_dislikes, female_dislikes):
    item_metrics = item_metrics.merge(gender_metric, on='item_id', how='left')

# Mark every metric column as computed over all rows.
item_metrics.columns = [f'{col}_all' if col != 'item_id' else 'item_id' for col in item_metrics.columns]

# Remove `_all` columns that an earlier run of this cell may have left in the
# accumulator; merging them again would make pandas rename both copies with
# _x/_y suffixes and the plain `*_all` names would disappear.
item_metrics_final = item_metrics_final.drop(
    columns=[col for col in item_metrics.columns if col != 'item_id'],
    errors='ignore',
)

# An accumulator that is still the empty placeholder has no 'item_id' column
# to merge on, so it is replaced instead.
if item_metrics_final.empty:
    item_metrics_final = item_metrics
else:
    item_metrics_final = item_metrics_final.merge(item_metrics, on='item_id', how='outer')

# Number of distinct sources per user over all rows.
user_unique_sources = calculate_unique_sources(train_interactions)
user_unique_sources.columns = ['user_id', 'unique_source_count_all']

# Same re-run protection for the user-level accumulator.
user_metrics_final = user_metrics_final.drop(columns=['unique_source_count_all'], errors='ignore')

if user_metrics_final.empty:
    user_metrics_final = user_unique_sources
else:
    user_metrics_final = user_metrics_final.merge(user_unique_sources, on='user_id', how='outer')


In [None]:
# Join the aggregated metrics onto the metadata tables (left join keeps every
# metadata row) and replace every missing value in the result with 0.
items_meta = items_meta.merge(item_metrics_final, on='item_id', how='left').fillna(0)
users_meta = users_meta.merge(user_metrics_final, on='user_id', how='left').fillna(0)

In [None]:
# Load the (user_id, item_id) pairs to score and collect the distinct ids
# appearing on each side.
test_pairs = pd.read_csv('data/test_pairs.csv')
users, items = (test_pairs[id_column].unique() for id_column in ('user_id', 'item_id'))

In [None]:
# Cast both feedback columns to a signed integer dtype before subtracting them.
# NOTE(review): presumably they are stored unsigned, where 0 - 1 would wrap
# around - confirm against the parquet schema.
train_interactions['like'] = train_interactions['like'].astype('int32')
train_interactions['dislike'] = train_interactions['dislike'].astype('int32')

# Create the 'label' column as like minus dislike (with 0/1 flags this gives
# 1 for a like, -1 for a dislike and 0 otherwise).
train_interactions['label'] = train_interactions['like'] - train_interactions['dislike']

# Keep only rows whose user AND item both occur in the test pairs. Both masks
# are built on the same frame and combined with `&`; indexing twice in a row
# would apply a mask computed on the full frame to the already filtered one,
# which makes pandas reindex the mask (with a UserWarning) and materialises an
# extra intermediate copy.
train_interactions = train_interactions[
    train_interactions['user_id'].isin(users) & train_interactions['item_id'].isin(items)
]

In [None]:
# Enrich every interaction with its user's and its item's metadata columns;
# left joins keep all interaction rows.
train_interactions = (
    train_interactions
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
)

In [None]:
# Collapse the three-valued label into a binary target:
# 1 stays 1, while 0 and -1 both become 0.
new_mapping = {1: 1, 0: 0, -1: 0}

# Rewrite the label column through the mapping.
train_interactions['label'] = train_interactions['label'].replace(to_replace=new_mapping)

In [None]:
# Reduce the frame to the model inputs followed by the target column.
train_interactions = train_interactions.loc[:, [
    # user features
    'gender',
    'age',
    'unique_source_count_all',
    # item features
    'source_id',
    'duration',
    'total_likes_all',
    'total_dislikes_all',
    'total_shares_all',
    'total_bookmarks_all',
    'avg_timespent_all',
    'male_likes_all',
    'female_likes_all',
    'male_dislikes_all',
    'female_dislikes_all',
    # target
    'label',
]]

In [None]:
# Settings forwarded to the XGBoost classifier.
params = {
    'tree_method': 'exact',
    'objective': 'binary:logistic'
}
num_boost_round = 50

# NOTE(review): scale_pos_weight=11.839 is presumably the negative/positive
# ratio of the training labels - confirm and recompute if the data changes.
clf = xgb.XGBClassifier(n_estimators=num_boost_round, scale_pos_weight=11.839, **params)

# Fit on train_interactions, the frame prepared by the preceding cells: every
# column except 'label' is a feature and 'label' is the target.
clf.fit(train_interactions.drop(columns=['label']), train_interactions['label'],
        verbose=10)

In [None]:
# Rebuild the feature frame for the pairs to score: reload them, attach the
# user and item metadata, and keep the same feature columns the model was
# fitted on (everything except 'label'), in the same order.
test_pairs = (
    pd.read_csv('data/test_pairs.csv')
    .merge(users_meta, on='user_id', how='left')
    .merge(items_meta, on='item_id', how='left')
    .loc[:, [
        'gender',
        'age',
        'unique_source_count_all',
        'source_id',
        'duration',
        'total_likes_all',
        'total_dislikes_all',
        'total_shares_all',
        'total_bookmarks_all',
        'avg_timespent_all',
        'male_likes_all',
        'female_likes_all',
        'male_dislikes_all',
        'female_dislikes_all',
    ]]
)

In [None]:
# Score every pair and keep the value in column 1 of each predict_proba row.
result = [class_probabilities[1] for class_probabilities in clf.predict_proba(test_pairs)]

# Reload the plain pairs, attach the scores as 'predict' and write the file.
test_pairs = pd.read_csv('data/test_pairs.csv').assign(predict=result)
test_pairs.to_csv('sub_lightgbm.csv', index=False)