In [1]:
import pandas as pd

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas Future/Deprecation warnings raised by
# the cells below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Folder the raw input files are read from; the trailing slash is required
# because file names are appended to it directly.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/data/'

# Item metadata, indexed by item_id so it can be used as a lookup table.
# Both id columns are stored as categoricals.
items_meta = (
    pd.read_parquet(data_folder + 'items_meta.parquet', engine='pyarrow')
    .astype({'item_id': 'category', 'source_id': 'category'})
    .set_index('item_id')
)

# User metadata, indexed by user_id so it can be used as a lookup table.
# user_id, age and gender are all stored as categoricals.
users_meta = (
    pd.read_parquet(data_folder + 'users_meta.parquet', engine='pyarrow')
    .astype({'user_id': 'category', 'age': 'category', 'gender': 'category'})
    .set_index('user_id')
)

# Interaction log. The separate like / dislike flags are collapsed into one
# signed int8 column named `target`: like - dislike.
# (presumably both flags are 0/1, giving +1 = like, -1 = dislike, 0 = neither
# or both — TODO confirm against the dataset description)
train = pd.read_parquet(f'{data_folder}train_interactions.parquet', engine='pyarrow')

# Cast to a signed type *before* combining the flags. The previous
# `dislike.replace({1: -1})` only worked because pandas silently upcasts an
# unsigned column when -1 is written into it.
train['like'] = train['like'].astype('int8') - train['dislike'].astype('int8')

# Rename in place of the old `like` column so the column order is unchanged.
train = train.drop(columns=['dislike']).rename(columns={'like': 'target'})

# (user_id, item_id) pairs to be scored.
test = pd.read_csv(data_folder + 'test_pairs.csv')

# Store both id columns as categoricals, in train first and then in test.
for frame in (train, test):
    for id_column in ('user_id', 'item_id'):
        frame[id_column] = frame[id_column].astype('category')

In [3]:
# Print the deep in-memory size of `train` in mebibytes (bytes / 1024**2),
# followed by its column dtypes.
memory_usage_mb = train.memory_usage(deep=True).sum() / 1024 ** 2
print(f"Размер train: {memory_usage_mb} мб")
print(train.dtypes)

Размер train: 1683.099609375 мб
user_id      category
item_id      category
timespent       uint8
target           int8
share           uint8
bookmarks       uint8
dtype: object


In [4]:
# Print the deep in-memory size of `test` in mebibytes (bytes / 1024**2),
# followed by its column dtypes.
memory_usage_mb = test.memory_usage(deep=True).sum() / 1024 ** 2
print(f"Размер test: {memory_usage_mb} мб")
print(test.dtypes)

Размер test: 20.448028564453125 мб
user_id    category
item_id    category
dtype: object


In [5]:
# Feature switches: one boolean per feature, read by the feature cells via
# CONFIG.get(<name>, False). Set an entry to False to skip that feature.
CONFIG = dict(
    # Always True (no cell in this notebook checks these four flags).
    user_info=True,
    item_info=True,
    source_info=True,
    item_duration=True,

    # U. user features
    age=True,
    gender=True,
    user_like_ratio=True,
    user_dislike_ratio=True,
    user_ignore_ratio=True,
    user_share_ratio=True,
    user_bookmark_ratio=True,
    user_avg_spent_time=True,
    user_view_ratio=True,
    user_full_view_ratio=True,

    # I. clip (item) features
    item_like_ratio=True,
    item_dislike_ratio=True,
    item_ignore_ratio=True,
    item_share_ratio=True,
    item_bookmark_ratio=True,
    item_avg_spent_time_ratio=True,
    item_view_ratio=True,
    item_full_view_ratio=True,

    # S. source features
    source_id=True,
    source_like_ratio=True,
    source_dislike_ratio=True,
    source_ignore_ratio=True,
    source_share_ratio=True,
    source_bookmark_ratio=True,
    source_avg_spent_time_ratio=True,
    source_view_ratio=True,
    source_full_view_ratio=True,
)

In [6]:
# Per-user interaction rates over all of `train`, indexed by user_id:
# the fraction of a user's rows whose column equals the given value.
#   user_like_ratio     target == 1
#   user_dislike_ratio  target == -1
#   user_ignore_ratio   target == 0
#   user_share_ratio    share == 1
#   user_bookmark_ratio bookmarks == 1
user_rate_specs = {
    'user_like_ratio': ('target', 1),
    'user_dislike_ratio': ('target', -1),
    'user_ignore_ratio': ('target', 0),
    'user_share_ratio': ('share', 1),
    'user_bookmark_ratio': ('bookmarks', 1),
}

# The group mean of a boolean mask equals "matching rows / all rows", but runs
# as one vectorized pass instead of calling a Python lambda once per user.
# Masks are built one at a time so only one extra column is alive at once.
user_info = pd.DataFrame({
    feature: train[column].eq(value).groupby(train['user_id']).mean()
    for feature, (column, value) in user_rate_specs.items()
})

In [7]:
# Per-item interaction rates over all of `train`, indexed by item_id:
# the fraction of an item's rows whose column equals the given value.
#   item_like_ratio     target == 1
#   item_dislike_ratio  target == -1
#   item_ignore_ratio   target == 0
#   item_share_ratio    share == 1
#   item_bookmark_ratio bookmarks == 1
item_rate_specs = {
    'item_like_ratio': ('target', 1),
    'item_dislike_ratio': ('target', -1),
    'item_ignore_ratio': ('target', 0),
    'item_share_ratio': ('share', 1),
    'item_bookmark_ratio': ('bookmarks', 1),
}

# The group mean of a boolean mask equals "matching rows / all rows", but runs
# as one vectorized pass instead of calling a Python lambda once per item, and
# no longer aggregates the grouping column itself to obtain the row count.
# Masks are built one at a time so only one extra column is alive at once.
item_info = pd.DataFrame({
    feature: train[column].eq(value).groupby(train['item_id']).mean()
    for feature, (column, value) in item_rate_specs.items()
})

In [8]:
# Look up each row's clip duration in the item metadata and store it as uint8.
# NOTE(review): uint8 presumably assumes every item is present in items_meta
# and that durations fit in 0..255 — TODO confirm.
duration_by_item = items_meta['duration']
train['item_duration'] = train['item_id'].map(duration_by_item).astype('uint8')
test['item_duration'] = test['item_id'].map(duration_by_item).astype('uint8')

In [9]:
# Look up each row's source_id in the item metadata and store it as a
# categorical column.
source_by_item = items_meta['source_id']
train['source_id'] = train['item_id'].map(source_by_item).astype('category')
test['source_id'] = test['item_id'].map(source_by_item).astype('category')

In [10]:
if CONFIG.get('age', False):
    def age(train_df, test_df, users_meta_df):
        """Add an `age` column to both frames, looked up by `user_id`.

        `users_meta_df` must be indexed by user id and have an `age` column.
        Both frames are modified in place and returned as
        `(train_df, test_df)`.
        """
        age_by_user = users_meta_df['age']
        for frame in (train_df, test_df):
            frame['age'] = frame['user_id'].map(age_by_user)
        return train_df, test_df

    train, test = age(train, test, users_meta)

In [11]:
if CONFIG.get('gender', False):
    def gender(train_df, test_df, users_meta_df):
        """Add a `gender` column to both frames, looked up by `user_id`.

        `users_meta_df` must be indexed by user id and have a `gender` column.
        Both frames are modified in place and returned as
        `(train_df, test_df)`.
        """
        gender_by_user = users_meta_df['gender']
        for frame in (train_df, test_df):
            frame['gender'] = frame['user_id'].map(gender_by_user)
        return train_df, test_df

    train, test = gender(train, test, users_meta)

In [12]:
if CONFIG.get('user_like_ratio', False):
    # Copy the per-user like rate from user_info onto every row (train first,
    # then test), stored as float16.
    for frame in (train, test):
        frame['user_like_ratio'] = frame['user_id'].map(user_info['user_like_ratio']).astype('float16')

In [13]:
if CONFIG.get('user_dislike_ratio', False):
    # Copy the per-user dislike rate from user_info onto every row (train
    # first, then test), stored as float16.
    for frame in (train, test):
        frame['user_dislike_ratio'] = frame['user_id'].map(user_info['user_dislike_ratio']).astype('float16')

In [14]:
if CONFIG.get('user_ignore_ratio', False):
    # Copy the per-user ignore rate (target == 0) from user_info onto every
    # row (train first, then test), stored as float16.
    for frame in (train, test):
        frame['user_ignore_ratio'] = frame['user_id'].map(user_info['user_ignore_ratio']).astype('float16')

In [15]:
if CONFIG.get('user_share_ratio', False):
    # Copy the per-user share rate from user_info onto every row (train first,
    # then test), stored as float16.
    for frame in (train, test):
        frame['user_share_ratio'] = frame['user_id'].map(user_info['user_share_ratio']).astype('float16')

In [16]:
if CONFIG.get('user_bookmark_ratio', False):
    # Copy the per-user bookmark rate from user_info onto every row (train
    # first, then test), stored as float16.
    for frame in (train, test):
        frame['user_bookmark_ratio'] = frame['user_id'].map(user_info['user_bookmark_ratio']).astype('float16')

In [17]:
if CONFIG.get('user_avg_spent_time', False):
    # Per-user mean of (timespent / item_duration) over the user's train rows,
    # written to both frames as float16 column `user_avg_spent_time_ratio`.
    # Note the CONFIG key has no `_ratio` suffix but the column does.
    # NOTE(review): a row with item_duration == 0 makes the quotient inf/NaN
    # and contaminates that user's mean — confirm no such items exist.
    #
    # The quotient is grouped directly instead of being written to a temporary
    # `view_percentage` column on `train`, and the result is attached with
    # .map like the other per-user features. The former merge required
    # test['user_id'] to be re-cast to category afterwards.
    user_avg_spent_time = (
        (train['timespent'] / train['item_duration'])
        .groupby(train['user_id'])
        .mean()
        .rename('user_avg_spent_time_ratio')
    )

    train['user_avg_spent_time_ratio'] = train['user_id'].map(user_avg_spent_time).astype('float16')
    test['user_avg_spent_time_ratio'] = test['user_id'].map(user_avg_spent_time).astype('float16')

In [18]:
if CONFIG.get('user_view_ratio', False):
    # Number of train rows per user, indexed by user_id.
    user_view_count = train.groupby('user_id').size()

    # Fraction of the catalogue (len(items_meta)) each user interacted with.
    # The count covers ALL of the user's train rows: the current row/video is
    # not excluded, contrary to what the previous comment here said.
    user_view_ratio = (user_view_count / len(items_meta)).rename('user_view_ratio')

    # Attached with .map like the other per-user features. The former merge
    # required test['user_id'] to be re-cast to category afterwards.
    train['user_view_ratio'] = train['user_id'].map(user_view_ratio).astype('float16')
    test['user_view_ratio'] = test['user_id'].map(user_view_ratio).astype('float16')

In [19]:
if CONFIG.get('user_full_view_ratio', False):
    # Fraction of each user's train rows that were watched in full, where
    # "in full" means timespent >= item_duration.
    #
    # The group mean of the boolean mask equals "full views / all views" and
    # replaces the per-user Python lambda. No temporary `full_view` column is
    # written to `train`.
    user_full_view_ratio = (
        train['timespent'].ge(train['item_duration'])
        .groupby(train['user_id'])
        .mean()
        .rename('user_full_view_ratio')
    )

    # Attached with .map like the other per-user features. The former merge
    # required test['user_id'] to be re-cast to category afterwards.
    train['user_full_view_ratio'] = train['user_id'].map(user_full_view_ratio).astype('float16')
    test['user_full_view_ratio'] = test['user_id'].map(user_full_view_ratio).astype('float16')

In [20]:
if CONFIG.get('item_like_ratio', False):
    # Copy the per-item like rate from item_info onto every row (train first,
    # then test), stored as float16.
    for frame in (train, test):
        frame['item_like_ratio'] = frame['item_id'].map(item_info['item_like_ratio']).astype('float16')

In [21]:
if CONFIG.get('item_dislike_ratio', False):
    # Copy the per-item dislike rate from item_info onto every row (train
    # first, then test), stored as float16.
    for frame in (train, test):
        frame['item_dislike_ratio'] = frame['item_id'].map(item_info['item_dislike_ratio']).astype('float16')

In [22]:
if CONFIG.get('item_ignore_ratio', False):
    # Copy the per-item ignore rate (target == 0) from item_info onto every
    # row (train first, then test), stored as float16.
    for frame in (train, test):
        frame['item_ignore_ratio'] = frame['item_id'].map(item_info['item_ignore_ratio']).astype('float16')

In [23]:
if CONFIG.get('item_share_ratio', False):
    # Copy the per-item share rate from item_info onto every row (train first,
    # then test), stored as float16.
    for frame in (train, test):
        frame['item_share_ratio'] = frame['item_id'].map(item_info['item_share_ratio']).astype('float16')

In [24]:
if CONFIG.get('item_bookmark_ratio', False):
    # Copy the per-item bookmark rate from item_info onto every row (train
    # first, then test), stored as float16.
    for frame in (train, test):
        frame['item_bookmark_ratio'] = frame['item_id'].map(item_info['item_bookmark_ratio']).astype('float16')

In [25]:
if CONFIG.get('item_avg_spent_time_ratio', False):
    # Per-item mean of (timespent / item_duration) over the item's train rows,
    # written to both frames as float16 column `item_avg_spent_time_ratio`.
    # NOTE(review): a row with item_duration == 0 makes the quotient inf/NaN
    # and contaminates that item's mean — confirm no such items exist.
    #
    # The aggregate gets its own item-named variable; it used to be stored in
    # `user_avg_spent_time`, overwriting the per-user Series. The quotient is
    # grouped directly instead of going through a temporary `view_percentage`
    # column on `train`.
    item_avg_spent_time = (
        (train['timespent'] / train['item_duration'])
        .groupby(train['item_id'])
        .mean()
        .rename('item_avg_spent_time_ratio')
    )

    # Attached with .map like the other per-item features. The former merge
    # required test['item_id'] to be re-cast to category afterwards.
    train['item_avg_spent_time_ratio'] = train['item_id'].map(item_avg_spent_time).astype('float16')
    test['item_avg_spent_time_ratio'] = test['item_id'].map(item_avg_spent_time).astype('float16')

In [26]:
if CONFIG.get('item_view_ratio', False):
    # Number of train rows per item, indexed by item_id.
    item_view_count = train.groupby('item_id').size()

    # Fraction of the user base (len(users_meta)) that interacted with each
    # item. The count covers all of the item's train rows, including the row
    # the feature is attached to.
    item_view_ratio = (item_view_count / len(users_meta)).rename('item_view_ratio')

    # Attached with .map like the other per-item features. The former merge
    # required test['item_id'] to be re-cast to category afterwards.
    train['item_view_ratio'] = train['item_id'].map(item_view_ratio).astype('float16')
    test['item_view_ratio'] = test['item_id'].map(item_view_ratio).astype('float16')

In [27]:
if CONFIG.get('item_full_view_ratio', False):
    # Fraction of each item's train rows that were watched in full, where
    # "in full" means timespent >= item_duration.
    #
    # The group mean of the boolean mask equals "full views / all views" and
    # replaces the per-item Python lambda. No temporary `full_view` column is
    # written to `train`.
    item_full_view_ratio = (
        train['timespent'].ge(train['item_duration'])
        .groupby(train['item_id'])
        .mean()
        .rename('item_full_view_ratio')
    )

    # Attached with .map like the other per-item features. The former merge
    # required test['item_id'] to be re-cast to category afterwards.
    train['item_full_view_ratio'] = train['item_id'].map(item_full_view_ratio).astype('float16')
    test['item_full_view_ratio'] = test['item_id'].map(item_full_view_ratio).astype('float16')

In [28]:
if CONFIG.get('source_like_ratio', False):
    # Mean of train['item_like_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_like_ratio`
    # column, so that feature must already have been built.
    per_source_like = (
        train.groupby('source_id')['item_like_ratio']
        .mean()
        .rename('source_like_ratio')
    )

    train['source_like_ratio'] = train['source_id'].map(per_source_like).astype('float16')
    test['source_like_ratio'] = test['source_id'].map(per_source_like).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_like

In [29]:
if CONFIG.get('source_dislike_ratio', False):
    # Mean of train['item_dislike_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_dislike_ratio`
    # column, so that feature must already have been built.
    per_source_dislike = (
        train.groupby('source_id')['item_dislike_ratio']
        .mean()
        .rename('source_dislike_ratio')
    )

    train['source_dislike_ratio'] = train['source_id'].map(per_source_dislike).astype('float16')
    test['source_dislike_ratio'] = test['source_id'].map(per_source_dislike).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_dislike

In [30]:
if CONFIG.get('source_ignore_ratio', False):
    # Mean of train['item_ignore_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_ignore_ratio`
    # column, so that feature must already have been built.
    per_source_ignore = (
        train.groupby('source_id')['item_ignore_ratio']
        .mean()
        .rename('source_ignore_ratio')
    )

    train['source_ignore_ratio'] = train['source_id'].map(per_source_ignore).astype('float16')
    test['source_ignore_ratio'] = test['source_id'].map(per_source_ignore).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_ignore

In [31]:
if CONFIG.get('source_share_ratio', False):
    # Mean of train['item_share_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_share_ratio`
    # column, so that feature must already have been built.
    per_source_share = (
        train.groupby('source_id')['item_share_ratio']
        .mean()
        .rename('source_share_ratio')
    )

    train['source_share_ratio'] = train['source_id'].map(per_source_share).astype('float16')
    test['source_share_ratio'] = test['source_id'].map(per_source_share).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_share

In [32]:
if CONFIG.get('source_bookmark_ratio', False):
    # Mean of train['item_bookmark_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_bookmark_ratio`
    # column, so that feature must already have been built.
    per_source_bookmark = (
        train.groupby('source_id')['item_bookmark_ratio']
        .mean()
        .rename('source_bookmark_ratio')
    )

    train['source_bookmark_ratio'] = train['source_id'].map(per_source_bookmark).astype('float16')
    test['source_bookmark_ratio'] = test['source_id'].map(per_source_bookmark).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_bookmark

In [33]:
if CONFIG.get('source_avg_spent_time_ratio', False):
    # Mean of train['item_avg_spent_time_ratio'] over all train rows of each
    # source, mapped back onto both frames as float16. Reads the
    # `item_avg_spent_time_ratio` column, so that feature must already have
    # been built.
    per_source_spent_time = (
        train.groupby('source_id')['item_avg_spent_time_ratio']
        .mean()
        .rename('source_avg_spent_time_ratio')
    )

    train['source_avg_spent_time_ratio'] = train['source_id'].map(per_source_spent_time).astype('float16')
    test['source_avg_spent_time_ratio'] = test['source_id'].map(per_source_spent_time).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_spent_time

In [34]:
if CONFIG.get('source_view_ratio', False):
    # Mean of train['item_view_ratio'] over all train rows of each source,
    # mapped back onto both frames as float16. Reads the `item_view_ratio`
    # column, so that feature must already have been built.
    per_source_view = (
        train.groupby('source_id')['item_view_ratio']
        .mean()
        .rename('source_view_ratio')
    )

    train['source_view_ratio'] = train['source_id'].map(per_source_view).astype('float16')
    test['source_view_ratio'] = test['source_id'].map(per_source_view).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_view

In [35]:
if CONFIG.get('source_full_view_ratio', False):
    # Mean of train['item_full_view_ratio'] over all train rows of each
    # source, mapped back onto both frames as float16. Reads the
    # `item_full_view_ratio` column, so that feature must already have been
    # built.
    per_source_full_view = (
        train.groupby('source_id')['item_full_view_ratio']
        .mean()
        .rename('source_full_view_ratio')
    )

    train['source_full_view_ratio'] = train['source_id'].map(per_source_full_view).astype('float16')
    test['source_full_view_ratio'] = test['source_id'].map(per_source_full_view).astype('float16')

    # Drop the lookup table once it has been applied.
    del per_source_full_view

In [36]:
# Number of columns in train after feature engineering.
train.shape[1]

34

In [37]:
# Number of columns in test after feature engineering.
test.shape[1]

30

In [38]:
# Remove the raw interaction columns from train before saving; the aggregate
# features above were computed from them.
train = train.drop(columns=['timespent', 'share', 'bookmarks'])

In [39]:
# Folder the engineered train/test frames are written to; the trailing slash
# is required because file names are appended to it directly.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
custom_data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/'

In [41]:
# Persist the engineered train frame as parquet, without the DataFrame index.
train.to_parquet(custom_data_folder + 'av1_train.parquet', index=False)

In [42]:
# Persist the engineered test frame as parquet, without the DataFrame index.
test.to_parquet(custom_data_folder + 'av1_test.parquet', index=False)