In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import glob
import gc
import warnings
import pickle
from sklearn.preprocessing import LabelEncoder


# Silence all warnings for the rest of the session.
warnings.simplefilter('ignore')

# Integer code of every known tag. A tag's code is its position in the
# tuple below, so the order of the entries is significant.
tags_map = {
    tag: code
    for code, tag in enumerate((
        '拍摄',
        '演绎',
        '新闻',
        '生活',
        '美食',
        '文化教育',
        '影视',
        '动植物',
        '时尚',
        '汽车',
        '亲子',
        '农村',
        '军政法警',
        '舞蹈',
        '创意',
        '旅行',
        '运动',
        '金融',
        '情感',
        '二次元',
        '音乐',
        '医疗',
        '游戏',
        '科技',
        '辅助标签',
    ))
}
def tag2int(s):
    """Return the integer code of tag ``s`` from ``tags_map``.

    Any value that is not a key of ``tags_map`` is mapped to 25.
    """
    return tags_map.get(s, 25)
def string2int(s):
    """Turn a string into an integer by weighting character code points.

    The character at position ``pos`` contributes ``ord(char) * 36**pos``;
    the contributions are summed. An empty string yields 0.
    """
    return sum(ord(ch) * 36 ** pos for pos, ch in enumerate(s))

In [None]:
# Read the training CSV in chunks and cache each converted chunk as a pickle.
chunksize = 20000000
df_train = pd.DataFrame()

# The four interaction flags are parsed as booleans.
data = pd.read_csv(
    '../init_data/toUser/train/train.csv',
    chunksize=chunksize,
    dtype=dict.fromkeys(['is_like', 'is_favourite', 'is_share', 'is_finish'], bool),
)

for i, chunk in tqdm(enumerate(data)):
    # Ids go through string2int, tags through tag2int; each result is then
    # cast to a fixed-width integer dtype.
    chunk['userid'] = chunk['userid'].apply(string2int).astype(np.int32)
    chunk['videoid'] = chunk['videoid'].apply(string2int).astype(np.int64)
    chunk['tag'] = chunk['tag'].apply(tag2int).astype(np.int8)
    # One pickle per chunk, named after the chunk's position in the stream.
    chunk.to_pickle(f'../temp_data/train_chunk{i}.pickle')
df_train = pd.DataFrame()

In [None]:
# Load the whole test CSV at once; the interaction flags are parsed as booleans.
test = pd.read_csv(
    '../init_data/toUser/test/test.csv',
    dtype=dict.fromkeys(['is_like', 'is_favourite', 'is_share', 'is_finish'], bool),
)

# Same conversion as for the training chunks: ids through string2int,
# tags through tag2int, each cast to a fixed-width integer dtype.
test['userid'] = test['userid'].apply(string2int).astype(np.int32)
test['videoid'] = test['videoid'].apply(string2int).astype(np.int64)
test['tag'] = test['tag'].apply(tag2int).astype(np.int8)

In [None]:
# Build `train` from the cached chunks, keeping only rows whose videoid also
# occurs in `test`.
#
# glob.glob returns matches in arbitrary order, so the paths are sorted to make
# the row order of `train` reproducible between runs. The sort is
# lexicographic, i.e. 'train_chunk10' comes before 'train_chunk2'.
train_chunks = sorted(glob.glob('../temp_data/train_chunk*.pickle'))

# Loop-invariant: compute the distinct test video ids once instead of once
# per chunk.
test_videoids = test['videoid'].unique()

kept_chunks = []
for f in tqdm(train_chunks):
    chunk = pd.read_pickle(f)
    chunk = chunk[chunk['videoid'].isin(test_videoids)]
    kept_chunks.append(chunk)

# Concatenate once at the end; growing the frame with pd.concat inside the
# loop re-copies all previously accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# chunk file was found.
train = pd.concat(kept_chunks) if kept_chunks else pd.DataFrame()

train

In [None]:
# Cache both frames on disk so later cells can reload them.
train.to_pickle(path='../temp_data/train.pickle')
test.to_pickle(path='../temp_data/test.pickle')

In [None]:
# Reload the cached frames (train first, then test).
# NOTE: unpickling executes code from the file; only load pickles you wrote yourself.
train, test = map(
    pd.read_pickle,
    ('../temp_data/train.pickle', '../temp_data/test.pickle'),
)

In [None]:
# Drop the columns that are not needed while the ids are re-encoded.
for col in ['tag', 'is_like', 'is_favourite', 'is_share', 'is_finish']:
    del train[col]
    gc.collect()

for col in ['tag']:
    del test[col]
    gc.collect()


def _encode_id_column(col, preferred_dtype, encoder_path):
    """Label-encode ``train[col]`` and ``test[col]`` and pickle the encoder.

    Parameters
    ----------
    col : str
        Name of the id column present in both ``train`` and ``test``.
    preferred_dtype : numpy integer dtype
        Dtype used for the encoded column when every code fits into it.
    encoder_path : str
        File the fitted ``LabelEncoder`` is pickled to.

    Returns
    -------
    LabelEncoder
        The fitted encoder.

    Notes
    -----
    * The encoder is fitted on the union of the train and test ids.
      ``LabelEncoder.transform`` raises ``ValueError`` for labels it has not
      seen during ``fit``, so fitting on ``train`` alone fails as soon as
      ``test`` contains an id that is absent from ``train``. When every test
      id already occurs in ``train`` the resulting codes are the same as with
      a train-only fit.
    * ``astype`` to a narrow integer dtype wraps silently on overflow, so the
      dtype is widened when the largest code does not fit ``preferred_dtype``.
    """
    encoder = LabelEncoder()
    encoder.fit(np.concatenate([train[col].unique(), test[col].unique()]))

    # Codes run from 0 to len(classes_) - 1; pick the first dtype that holds them.
    max_code = len(encoder.classes_) - 1
    dtype = next(
        candidate
        for candidate in (preferred_dtype, np.int32, np.int64)
        if max_code <= np.iinfo(candidate).max
    )

    train[col] = encoder.transform(train[col]).astype(dtype)
    test[col] = encoder.transform(test[col]).astype(dtype)

    with open(encoder_path, 'wb') as fh:
        pickle.dump(encoder, fh)
    return encoder


# `lbe` ends up bound to the videoid encoder, as before.
lbe = _encode_id_column('userid', np.int16, '../temp_data/userid_le.pickle')
lbe = _encode_id_column('videoid', np.int32, '../temp_data/videoid_le.pickle')

In [None]:
# Re-attach the columns that were removed before label encoding: reload each
# cached frame, discard its id columns, and join the rest column-wise
# (pd.concat with axis=1 aligns rows on the index).
tmp = pd.read_pickle('../temp_data/train.pickle').drop(columns=['userid', 'videoid'])
train = pd.concat([train, tmp], axis=1)

# For the test frame the 'ID' column is discarded from the reloaded copy as well.
tmp = pd.read_pickle('../temp_data/test.pickle').drop(columns=['userid', 'videoid', 'ID'])
test = pd.concat([test, tmp], axis=1)
gc.collect()

# Persist the final, label-encoded frames.
train.to_pickle('../temp_data/train_all_le.pickle')
test.to_pickle('../temp_data/test_all_le.pickle')