In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the raw input files. The trailing slash matters: file
# names are appended to this string directly.
data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/data/'

# Item metadata: id-like columns become categoricals, then item_id becomes
# the index so the frame can serve as a lookup table keyed by item.
items_meta = pd.read_parquet(f'{data_folder}items_meta.parquet', engine='pyarrow')
for column in ('item_id', 'source_id'):
    items_meta[column] = items_meta[column].astype('category')
items_meta = items_meta.set_index('item_id')

# User metadata: same treatment, keyed by user_id.
users_meta = pd.read_parquet(f'{data_folder}users_meta.parquet', engine='pyarrow')
for column in ('user_id', 'age', 'gender'):
    users_meta[column] = users_meta[column].astype('category')
users_meta = users_meta.set_index('user_id')

# Training interactions. `like` and `dislike` are folded into one signed
# int8 column: every dislike value of 1 is turned into -1 and added to
# `like`; the `dislike` column is removed in the same step (via pop) and the
# combined column is renamed to `target`.
train = pd.read_parquet(f'{data_folder}train_interactions.parquet', engine='pyarrow')
train['like'] = (train['like'] + train.pop('dislike').replace({1: -1})).astype('int8')
train.rename(columns={'like': 'target'}, inplace=True)

# (user_id, item_id) pairs to score.
test = pd.read_csv(f'{data_folder}test_pairs.csv')

# Store the id columns of both frames as categoricals.
for frame in (train, test):
    for key in ('user_id', 'item_id'):
        frame[key] = frame[key].astype('category')

In [3]:
# Report the deep (object contents included) memory footprint of `train`
# in mebibytes, followed by its column dtypes.
bytes_per_column = train.memory_usage(deep=True)
memory_usage = bytes_per_column.sum()
memory_usage_mb = memory_usage / 1024 ** 2
print(f"Размер train: {memory_usage_mb} мб", train.dtypes, sep="\n")

Размер train: 1683.099609375 мб
user_id      category
item_id      category
timespent       uint8
target           int8
share           uint8
bookmarks       uint8
dtype: object


In [4]:
# Report the deep (object contents included) memory footprint of `test`
# in mebibytes, followed by its column dtypes.
bytes_per_column = test.memory_usage(deep=True)
memory_usage = bytes_per_column.sum()
memory_usage_mb = memory_usage / 1024 ** 2
print(f"Размер test: {memory_usage_mb} мб", test.dtypes, sep="\n")

Размер test: 20.448028564453125 мб
user_id    category
item_id    category
dtype: object


In [5]:
# Feature switches: each key names a column that the cells below attach to
# `train` and `test` when its flag is truthy.
CONFIG = dict(
    # U. user features
    age=True,
    gender=True,

    # I. clip (item) features
    duration=True,
    embeddings=True,

    # S. source features
    source_id=True,
)

In [6]:
# Optional user feature: look up each row's user_id in users_meta (indexed by
# user_id) and store the matching `age` value as a new column.
if CONFIG.get('age', False):
    age_by_user = users_meta['age']
    for frame in (train, test):
        frame['age'] = frame['user_id'].map(age_by_user)

In [7]:
# Optional user feature: look up each row's user_id in users_meta (indexed by
# user_id) and store the matching `gender` value as a new column.
if CONFIG.get('gender', False):
    gender_by_user = users_meta['gender']
    for frame in (train, test):
        frame['gender'] = frame['user_id'].map(gender_by_user)

In [8]:
# Optional item feature: look up each row's item_id in items_meta (indexed by
# item_id) and store the matching `duration`, cast to uint8.
# NOTE(review): the uint8 cast presumes every item_id has a match in
# items_meta and that durations fit in 0..255 - confirm against the data.
if CONFIG.get('duration', False):
    duration_by_item = items_meta['duration']
    for frame in (train, test):
        frame['duration'] = frame['item_id'].map(duration_by_item).astype('uint8')

In [9]:
# Optional item feature: left-join the `embeddings` column of items_meta
# (indexed by item_id) onto both frames by their item_id column.
#
# Each merge is guarded so the cell is safe to re-run: merging a frame that
# already has an `embeddings` column would make pandas disambiguate the
# overlapping names with suffixes (`embeddings_x` / `embeddings_y`) and the
# duplicated columns would end up in the saved feature files.
if CONFIG.get('embeddings', False):
    item_embeddings = items_meta[['embeddings']]
    if 'embeddings' not in train.columns:
        train = train.merge(item_embeddings, left_on='item_id', right_index=True, how='left')
    if 'embeddings' not in test.columns:
        test = test.merge(item_embeddings, left_on='item_id', right_index=True, how='left')

In [10]:
# Optional source feature: look up each row's item_id in items_meta (indexed
# by item_id) and store the matching `source_id` as a categorical column.
if CONFIG.get('source_id', False):
    source_by_item = items_meta['source_id']
    for frame in (train, test):
        frame['source_id'] = frame['item_id'].map(source_by_item).astype('category')

In [11]:
# Sanity check: number of columns in `train` after the feature cells above.
train.shape[1]

11

In [12]:
# Sanity check: number of columns in `test` after the feature cells above.
test.shape[1]

7

In [13]:
# Remove the interaction columns that are not written to the saved training
# set. `errors='ignore'` keeps the cell idempotent: re-running it after the
# columns are already gone is a no-op instead of a KeyError.
train = train.drop(columns=['timespent', 'share', 'bookmarks'], errors='ignore')

In [14]:
# Output directory for the prepared feature files. The trailing slash matters:
# file names are appended to this string directly.
# NOTE(review): absolute, machine-specific path - the notebook will not run
# elsewhere without editing it; consider making it configurable.
custom_data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/'

In [None]:
# Persist the prepared training frame as parquet, without the row index.
train_output_path = f'{custom_data_folder}fv3_train.parquet'
train.to_parquet(train_output_path, index=False)

In [None]:
# Persist the prepared test frame as parquet, without the row index.
test_output_path = f'{custom_data_folder}fv3_test.parquet'
test.to_parquet(test_output_path, index=False)