In [1]:
import numpy as np
import pandas as pd
import gc
from tqdm import tqdm
# Show every column and every row when a DataFrame is displayed.
# The fully-qualified "display." keys are used on purpose: on pandas >= 1.4 the
# short patterns 'max_columns' / 'max_rows' also match the 'styler.render.*'
# options, and set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the first training file (JSON Lines: one record per line) and tag every
# row with a synthetic id of the form "V_<row position>".
df1 = pd.read_json('../Track1/user_video_act_train_1.json', lines=True)
df1['item_id'] = list(map('V_{}'.format, range(len(df1))))
df1.shape

(34048, 4)

In [3]:
# Load the second training file (JSON Lines: one record per line) and tag every
# row with a synthetic id of the form "U_<row position>".
df2 = pd.read_json('../Track1/user_video_act_train_2.json', lines=True)
df2['item_id'] = list(map('U_{}'.format, range(len(df2))))
df2.shape

(7296, 4)

In [4]:
# Stack the two training sets into a single frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Keeping each
# input's own index would leave duplicate row labels in the result, which makes
# any later label-based lookup (df.loc[...]) ambiguous.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# The inputs are no longer needed; release them before the heavy reshaping.
del df1, df2
gc.collect()

20

In [5]:
# Preview the first rows of the combined training frame.
df.head()

Unnamed: 0,activity,course_list,label_list,item_id
0,[{'course_id': 'C_course-v1:TsinghuaX+30240184...,"[C_course-v1:TsinghuaX+30240184+sp, C_course-v...","[0, 1, 0, 0]",V_0
1,[{'course_id': 'C_course-v1:TsinghuaX+30700313...,"[C_course-v1:TsinghuaX+30700313X+sp, C_course-...","[0, 0, 1, 0]",V_1
2,[{'course_id': 'C_course-v1:TsinghuaX+30640014...,"[C_course-v1:TsinghuaX+30640014X+sp, C_course-...","[0, 0, 0]",V_2
3,[{'course_id': 'C_course-v1:TsinghuaX+00740043...,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[0, 1, 1]",V_3
4,[{'course_id': 'C_course-v1:TsinghuaX+10800163...,"[C_course-v1:TsinghuaX+10800163X+sp, C_course-...","[0, 0, 1]",V_4


In [6]:
# Inspect the first activity record of the first row to see which fields an
# activity entry carries ('activity' is the frame's first column).
df['activity'].iloc[0][0]

{'course_id': 'C_course-v1:TsinghuaX+30240184+sp',
 'video_id': 'V_102d2536030d47b4b343c3848f6f50b6',
 'watching_count': 1,
 'video_duration': 233.0,
 'local_watching_time': 186,
 'video_progress_time': 233.0,
 'video_start_time': 0.0,
 'video_end_time': 233.0,
 'local_start_time': '2019-04-04 21:13:20',
 'local_end_time': '2019-04-04 21:16:26'}

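# label处理 (Label processing): expand each row's course_list / label_list into one (item_id, course_id, label) row per course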

In [7]:
def label_process(x):
    """Expand one input row into a per-course label table.

    Parameters
    ----------
    x : mapping-like row (e.g. a ``pd.Series`` or ``dict``)
        Must provide ``'item_id'`` (a scalar) plus ``'course_list'`` and
        ``'label_list'`` (sequences of the same length; pandas raises
        ``ValueError`` when the lengths differ).

    Returns
    -------
    pd.DataFrame
        Columns ``item_id``, ``course_id``, ``label``: one row per entry of
        ``course_list``, with ``item_id`` repeated on every row.
    """
    return pd.DataFrame({
        'item_id': x['item_id'],       # scalar, broadcast to every row
        'course_id': x['course_list'],
        'label': x['label_list'],
    })

# Build one small label table per input row, then stack them vertically.
per_row_labels = df.apply(label_process, axis=1)

# Row labels of the pieces are kept as-is, so the index restarts at 0 for
# every item in the stacked result.
label = pd.concat(list(per_row_labels), ignore_index=False)

del per_row_labels
gc.collect()

# Sanity check: overall size and the share of positive labels.
print(label.shape)
print(label['label'].mean())
label.head()

(227934, 3)
0.2872893030438636


Unnamed: 0,item_id,course_id,label
0,V_0,C_course-v1:TsinghuaX+30240184+sp,0
1,V_0,C_course-v1:TsinghuaX+00310222X+sp,1
2,V_0,C_course-v1:GXUST+2017041901X+2017_T1,0
3,V_0,C_course-v1:TsinghuaX+00740043X_2015_T2+sp,0
0,V_1,C_course-v1:TsinghuaX+30700313X+sp,0


In [8]:
# Persist the flattened label table, then free it.
# `key` is passed by keyword: pandas 2.1 deprecated positional arguments to
# to_hdf other than the path, and pandas 3.0 makes them keyword-only.
label.to_hdf('../train_label.h5', key='df', index=False)
del label
gc.collect()

22

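# activity处理 (Activity processing): flatten each row's activity records, then aggregate them per (item_id, course_id, video_id)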

In [9]:
def activity_process(x):
    """Flatten one input row's activity records into a table.

    Parameters
    ----------
    x : mapping-like row (e.g. a ``pd.Series`` or ``dict``)
        Must provide ``'item_id'`` (a scalar) and ``'activity'``, an iterable
        of dicts that each contain every field listed in ``fields`` below.

    Returns
    -------
    pd.DataFrame
        One row per activity record. Columns are ``item_id`` (repeated on
        every row) followed by the activity fields in the order listed.

    Raises
    ------
    KeyError
        If an activity record lacks one of the expected fields.
    """
    fields = (
        'course_id',
        'video_id',
        'watching_count',
        'video_duration',
        'local_watching_time',
        'video_progress_time',
        'video_start_time',
        'video_end_time',
        'local_start_time',
        'local_end_time',
    )
    records = x['activity']

    # item_id is a scalar and is broadcast by pandas to every row.
    columns = {'item_id': x['item_id']}
    for field in fields:
        columns[field] = [record[field] for record in records]
    return pd.DataFrame(columns)

# Build one activity table per input row, then stack them vertically.
activity_frames = df.apply(activity_process, axis=1)

# Row labels of the pieces are kept as-is, so the index restarts at 0 for
# every item in the stacked result.
activity = pd.concat(list(activity_frames), ignore_index=False)

# The raw frame is no longer needed once the activity records are flattened.
del activity_frames, df
gc.collect()

print(activity.shape)
activity.head()

(3288358, 11)


Unnamed: 0,item_id,course_id,video_id,watching_count,video_duration,local_watching_time,video_progress_time,video_start_time,video_end_time,local_start_time,local_end_time
0,V_0,C_course-v1:TsinghuaX+30240184+sp,V_102d2536030d47b4b343c3848f6f50b6,1,233.0,186,233.0,0.0,233.0,2019-04-04 21:13:20,2019-04-04 21:16:26
1,V_0,C_course-v1:TsinghuaX+30240184+sp,V_4be5b36da31d4d15bc4afdd5a5cc1658,1,288.0,236,287.28,0.72,288.0,2019-04-04 21:20:35,2019-04-04 21:24:31
2,V_0,C_course-v1:TsinghuaX+00310222X+sp,V_4c914a96924c4ba2b6ecaa4f58336892,1,767.0,774,767.0,0.0,767.0,2016-11-28 19:05:59,2016-11-28 19:18:53
3,V_0,C_course-v1:TsinghuaX+30240184+sp,V_4f184a3de72d418caccbd3fa8624d5b6,3,209.0,613,209.0,0.0,209.0,2019-04-04 20:58:06,2019-04-04 21:08:35
4,V_0,C_course-v1:TsinghuaX+00310222X+sp,V_75f9b0188a2648358a469aee137f07be,2,1132.0,105,104.230011,0.0,231.550003,2016-12-02 09:49:02,2016-12-02 09:50:47


In [10]:
# Parse both local timestamp columns from strings into datetimes.
for time_col in ('local_start_time', 'local_end_time'):
    activity[time_col] = pd.to_datetime(activity[time_col])
gc.collect()

20

In [11]:
# Grouping key used by the aggregations below: one group per
# (item_id, course_id, video_id) combination.
uid1 = ['item_id', 'course_id', 'video_id']

In [12]:
# video_cnt: number of non-null `watching_count` entries, i.e. activity
# records, in each (item_id, course_id, video_id) group.
# count() + rename replaces the dict-renaming form
# `.agg({'video_cnt': 'count'})`, which pandas >= 1.0 rejects with
# SpecificationError ("nested renamer is not supported").
df_video = (
    activity.groupby(uid1, as_index=False)['watching_count']
    .count()
    .rename(columns={'watching_count': 'video_cnt'})
)

In [13]:
# Preview the per-(item, course, video) record counts.
df_video.head()

Unnamed: 0,item_id,course_id,video_id,video_cnt
0,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_0d2ed7b0f77647388d3ad5420d1d6549,1
1,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_330acdbc14db49a88adbbc10c082155e,1
2,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_42b82d4372ed4350b661f68a825594be,1
3,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_6d701687d50e4ee4a897b3c74c45afa1,1
4,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_74598872443d4c10848782f021e1d0af,1


In [14]:
# Collapse repeated activity records into one row per
# (item_id, course_id, video_id): totals for the counters and watch times, the
# longest reported duration, and the earliest start / latest end on both the
# in-video and the local (wall-clock) time axes.
# Aggregations are given as strings rather than np.sum / np.max / np.min:
# pandas >= 2.1 emits a FutureWarning for NumPy callables passed to agg and
# will stop mapping them to the optimized groupby reductions.
per_video_stats = activity.groupby(uid1, as_index=False).agg({
    'watching_count': 'sum',
    'video_duration': 'max',
    'local_watching_time': 'sum',
    'video_progress_time': 'sum',
    'video_start_time': 'min',
    'video_end_time': 'max',
    'local_start_time': 'min',
    'local_end_time': 'max'
})

# Attach the statistics to the per-group record counts computed earlier.
df_video = df_video.merge(per_video_stats, on=uid1, how='left')

del per_video_stats, activity
gc.collect()

0

In [15]:
# List the final feature columns before saving.
df_video.columns

Index(['item_id', 'course_id', 'video_id', 'video_cnt', 'watching_count',
       'video_duration', 'local_watching_time', 'video_progress_time',
       'video_start_time', 'video_end_time', 'local_start_time',
       'local_end_time'],
      dtype='object')

In [16]:
# Persist the per-video feature table, then free it.
# `key` is passed by keyword: pandas 2.1 deprecated positional arguments to
# to_hdf other than the path, and pandas 3.0 makes them keyword-only.
df_video.to_hdf('../train_video.h5', key='df', index=False)
del df_video
gc.collect()

95