In [None]:
import warnings
warnings.simplefilter('ignore')

import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [None]:
# Split the label-encoded interaction log into train / validation sets.
#
# `seq_no_rank` counts each user's rows backwards in the row order of the
# pickle: 0 is the user's last row, 1 the one before it, and so on
# (presumably the rows are in chronological order -- TODO confirm upstream).
#   * rows with seq_no_rank >= MAX_ROWS_PER_USER are discarded,
#   * rows with seq_no_rank <  VALID_ROWS_PER_USER become the validation set,
#   * everything in between becomes the training set.
MAX_ROWS_PER_USER = 1600
VALID_ROWS_PER_USER = 100
ITEM_AND_LABEL_COLS = ['videoid', 'tag', 'is_like', 'is_favourite', 'is_share', 'is_finish']

# NOTE(review): read_pickle executes arbitrary code on load; only use on files
# produced by this pipeline.
df_all = pd.read_pickle('../temp_data/train_all_le.pickle')

# Keep the counter at full width until AFTER the cut-off filter: casting to
# int16 first would silently wrap counters above 32767 to negative values,
# which would then pass both the `< MAX_ROWS_PER_USER` and the
# `< VALID_ROWS_PER_USER` tests for users with very long histories.
seq_no_rank = df_all.groupby('userid').cumcount(ascending=False).to_numpy()
keep_mask = seq_no_rank < MAX_ROWS_PER_USER
df_all = df_all[keep_mask].reset_index(drop=True)
# Every surviving value is < MAX_ROWS_PER_USER, so int16 is now safe.
df_all['seq_no_rank'] = seq_no_rank[keep_mask].astype(np.int16)
del seq_no_rank, keep_mask
gc.collect()

# Stable column layout for the saved frames: key columns first, then
# seq_no_rank, then the item/label columns in their original relative order.
leading_cols = [c for c in df_all.columns if c not in ITEM_AND_LABEL_COLS]
trailing_cols = [c for c in df_all.columns if c in ITEM_AND_LABEL_COLS]
df_all = df_all[leading_cols + trailing_cols]

is_valid = (df_all['seq_no_rank'] < VALID_ROWS_PER_USER).to_numpy()
df_valid = df_all[is_valid].reset_index(drop=True)
df_train = df_all[~is_valid].reset_index(drop=True)
del df_all, is_valid
gc.collect()

print(df_valid.shape[0], df_valid['userid'].nunique())
df_valid.to_pickle('../temp_data/valid.pickle')
df_train.to_pickle('../temp_data/train.pickle')

# The frames are reloaded from disk by the feature-assembly step; free them now.
del df_train
del df_valid
gc.collect()

In [None]:
# Dataset splits to process, in this order.
datatype_list = ['train', 'valid', 'test']

# Feature names (e.g. 'userid_is_like_mean') that code consulting this list
# should skip; empty means nothing is skipped.
drop_list = list()

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified IN PLACE (columns are reassigned) and also returned.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched; every other column (int8, unsigned, bool, object, ...) is
    left as is.

    * Integer columns get the first of int8/int16/int32/int64 whose
      [min, max] range contains the column's min and max (bounds inclusive).
      Integer downcasting is lossless.
    * Float columns get the first of float16/float32 whose finite range
      contains the column's min and max, otherwise float64. Only the RANGE is
      checked, not the precision: float16 keeps about 3 significant decimal
      digits and cannot represent integers above 2048 exactly, so count-like
      or finely graded values may be rounded. Pass ``allow_float16=False`` to
      start at float32 instead.
    * Columns whose min/max are NaN (all-NaN floats, empty columns) fail every
      range test: such float columns become float64, integer ones keep their
      dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    allow_float16 : bool, default True
        Whether float16 may be chosen for float columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in tqdm(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value exactly at the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64  # fallback when nothing narrower fits (or min/max is NaN/inf)
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    return df

In [None]:
# Assemble the v6 feature table for every split: load the base rows, left-join
# the pre-computed per-key statistics, shrink dtypes and save the result.
#
# Earlier experiments that are currently disabled: user x tag counts and label
# sums, per-key standard deviations, and mean/std ratio features.
label_cols = ['is_like', 'is_favourite', 'is_share', 'is_finish']

# (join key, statistic, whether drop_list may veto the feature), in merge order.
# The tag-level means are always merged; all other label features honour drop_list.
label_feature_plan = [
    ('tag', 'mean', False),
    ('videoid', 'mean', True),
    ('userid', 'mean', True),
    ('tag', 'sum', True),
    ('videoid', 'sum', True),
    ('userid', 'sum', True),
]

for datatype in datatype_list:
    # The test split is read straight from the label-encoded file; train and
    # valid come from the per-split pickles.
    filename = (
        '../temp_data/test_all_le.pickle'
        if datatype == 'test'
        else f'../temp_data/{datatype}.pickle'
    )
    df_data = pd.read_pickle(filename)

    # seq_no_rank is removed from train/valid before any feature is joined.
    if datatype in ('train', 'valid'):
        del df_data['seq_no_rank']
    display(df_data)

    # Total counts per user, per video id and per tag.
    for key in tqdm(['userid', 'videoid', 'tag']):
        counts = pd.read_pickle(f'../temp_data/{key}_cnt.pickle')
        df_data = df_data.merge(counts, how='left', on=[key])

    # Per-key mean and sum of each label column.
    for key, stat, droppable in label_feature_plan:
        for label in tqdm(label_cols):
            feat_name = f'{key}_{label}_{stat}'
            if droppable and feat_name in drop_list:
                continue
            stats = pd.read_pickle(f'../temp_data/{feat_name}.pickle')
            df_data = df_data.merge(stats, how='left', on=[key])

    df_data = reduce_mem_usage(df_data)
    print(datatype, df_data.shape)
    df_data.to_pickle(f'../temp_data/df_{datatype}_v6.pickle')