In [None]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [None]:
# Load the label-encoded train / test frames.
# Caveat: features should preferably NOT be built from data that includes the
# validation split (presumably to avoid leaking it into the features).
df_train, df_test = (
    pd.read_pickle(path)
    for path in ('../temp_data/train_all_le.pickle', '../temp_data/test_all_le.pickle')
)

# Rich-render both frames (train first, then test), then report their shapes.
display(df_train, df_test)

print(df_train.shape, df_test.shape)

In [None]:
# Occurrence count of every userid / videoid / tag value in the training frame.
# Each result is a two-column frame [<col>, <col>_cnt] written to its own pickle.
for col in tqdm(['userid', 'videoid', 'tag']):
    tmp = (
        df_train[col]
        .value_counts()
        .rename_axis(col)                  # label the index so it becomes the key column
        .reset_index(name=f'{col}_cnt')    # counts become the `<col>_cnt` column
    )
    tmp.to_pickle(f'../temp_data/{col}_cnt.pickle')

In [None]:
# Per-user sums of the four metric columns.
# Each result is a two-column frame [userid, userid_<metric>_sum] written to its own pickle.
for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
    tmp = (
        df_train
        .groupby('userid')[col]
        .sum()
        .reset_index(name=f'userid_{col}_sum')
    )
    tmp.to_pickle(f'../temp_data/userid_{col}_sum.pickle')

In [None]:
# Per-video sums of the four metric columns.
# Each result is a two-column frame [videoid, videoid_<metric>_sum] written to its own pickle.
for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
    tmp = (
        df_train
        .groupby('videoid')[col]
        .sum()
        .reset_index(name=f'videoid_{col}_sum')
    )
    tmp.to_pickle(f'../temp_data/videoid_{col}_sum.pickle')

In [None]:
# Per-tag sums of the four metric columns.
# Each result is a two-column frame [tag, tag_<metric>_sum] written to its own pickle.
for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
    tmp = (
        df_train
        .groupby('tag')[col]
        .sum()
        .reset_index(name=f'tag_{col}_sum')
    )
    tmp.to_pickle(f'../temp_data/tag_{col}_sum.pickle')

In [None]:
# Standard deviation of the four metric columns at user, video and tag level
# (in that order). One pickle per (grouping key, metric) pair, holding the
# columns [<key>, <key>_<metric>_std].
# pandas' groupby std is the sample std (ddof=1), so single-row groups yield NaN.
for group_key in ['userid', 'videoid', 'tag']:
    for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
        tmp = (
            df_train
            .groupby(group_key)[col]
            .std()
            .reset_index(name=f'{group_key}_{col}_std')
        )
        tmp.to_pickle(f'../temp_data/{group_key}_{col}_std.pickle')

In [None]:
# Disabled: (userid, videoid)-level sums of the four metric columns -- too large to aggregate in one pass; needs batched aggregation.
# for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
#     tmp = df_train.groupby(['userid','videoid'])[col].sum().to_frame().reset_index(level=[0,1])
#     tmp.columns = ['userid','videoid', f'tag_{col}_sum']
#     tmp.to_pickle(f'/home/workspace/output/feats/user_video_{col}_sum.pickle')

In [None]:
# (userid, tag)-level sums of the four metric columns.
# Note: on the full dataset this aggregation is too large for a single pass and
# has to be done in batches.
# `tqdm` is the one bound in the notebook's import cell (tqdm.auto); it is
# deliberately not re-imported here, so the progress-bar implementation is not
# silently swapped for this and all later cells.
for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
    tmp = (
        df_train
        .groupby(['userid', 'tag'])[col]
        .sum()
        .reset_index(name=f'user_tag_{col}_sum')   # columns: userid, tag, user_tag_<metric>_sum
    )
    tmp.to_pickle(f'../temp_data/user_tag_{col}_sum.pickle')
# Preview the last frame produced by the loop (the is_finish sums).
tmp.head()

In [None]:
# Mean of the four metric columns at user, video and tag level (in that order).
# The mean is a meaningful feature: assuming the metric columns are 0/1 flags,
# it is the fraction of a group's rows on which the action was performed.
# One pickle per (grouping key, metric) pair, holding [<key>, <key>_<metric>_mean].
for group_key in ['userid', 'videoid', 'tag']:
    for col in tqdm(['is_like', 'is_favourite', 'is_share', 'is_finish']):
        tmp = (
            df_train
            .groupby(group_key)[col]
            .mean()
            .reset_index(name=f'{group_key}_{col}_mean')
        )
        tmp.to_pickle(f'../temp_data/{group_key}_{col}_mean.pickle')

In [None]:
# (userid, tag)-level count: number of non-null `videoid` entries for each pair.
# Result columns: userid, tag, user_tag_cnt.
# (No tqdm import here: nothing in this cell uses it, and re-importing it would
# shadow the tqdm.auto binding from the notebook's import cell.)
tmp = (
    df_train
    .groupby(['userid', 'tag'])['videoid']
    .count()
    .reset_index(name='user_tag_cnt')
)
tmp.to_pickle('../temp_data/user_tag_cnt.pickle')
tmp.head()

In [None]:
# (videoid, tag)-level count: number of non-null values of `count_col` per pair.
#
# The counted column is pinned explicitly. Previously this cell read `col`, a
# loop variable leaked from an earlier cell, so it only worked (and only meant
# 'is_finish') when the notebook was run strictly top to bottom. Pinning
# 'is_finish' keeps the output path identical to what such a run produced.
count_col = 'is_finish'

# The key column is named `videoid` (it was mislabelled `userid`, although it
# holds video ids) and the value column `video_tag_<col>_cnt` (was `user_tag_...`).
# NOTE(review): any consumer of this pickle must merge on ['videoid', 'tag'].
tmp = (
    df_train
    .groupby(['videoid', 'tag'])[count_col]
    .count()
    .reset_index(name=f'video_tag_{count_col}_cnt')
)
tmp.to_pickle(f'../temp_data/video_tag_{count_col}_cnt.pickle')
tmp.head()