In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings

# Silence all warnings for the rest of the notebook.
# NOTE(review): the blanket filter also hides pandas deprecation warnings; consider
# narrowing it while developing so API removals are noticed early.
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

# Show every column/row when displaying frames. The fully qualified option names are
# required: the bare 'max_columns' / 'max_rows' patterns match more than one registered
# option in recent pandas releases (display.* and styler.render.*) and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Load the raw activity log and preview its first rows.
activity_path = '../act_2_activity.h5'
df = pd.read_hdf(activity_path)
df.head()

Unnamed: 0,item_id,course_id,video_id,watching_count,video_duration,local_watching_time,video_progress_time,video_start_time,video_end_time,local_start_time,local_end_time
0,U_0,C_course-v1:TsinghuaX+20430064X+sp,V_05f095f52ac6446ba05b883171c66713,1,563.0,563,563.0,0.0,563.0,2017-07-24 13:26:56,2017-07-24 13:36:19
1,U_0,C_course-v1:TsinghuaX+34000888X+sp,V_0d166f12024942fca500891c0346b86a,1,166.0,166,166.0,0.0,166.0,2017-07-06 14:47:00,2017-07-06 14:49:46
2,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_0d2ed7b0f77647388d3ad5420d1d6549,2,415.0,415,414.809998,0.0,415.0,2017-07-13 09:01:58,2017-07-13 13:18:15
3,U_0,C_course-v1:TsinghuaX+20430064X+sp,V_10355a6a239c4a1dbc28e99bfb7633c1,1,173.0,174,173.0,0.0,173.0,2017-07-24 13:11:47,2017-07-24 13:14:41
4,U_0,C_course-v1:TsinghuaX+34000888X+sp,V_11f15f23fca948808b5f44e25b75f6a2,1,63.0,19,18.82,0.0,18.82,2017-07-06 14:56:18,2017-07-06 14:56:37


In [3]:
# (rows, columns) of the loaded activity frame.
df.shape

(581676, 11)

In [4]:
# Shape after keeping one row per (item_id, course_id, video_id); comparing it with
# df.shape shows how many keys occur more than once in the raw log.
df.drop_duplicates(['item_id', 'course_id', 'video_id']).shape

(579277, 11)

In [5]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581676 entries, 0 to 105
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   item_id              581676 non-null  object 
 1   course_id            581676 non-null  object 
 2   video_id             581676 non-null  object 
 3   watching_count       581676 non-null  int64  
 4   video_duration       581676 non-null  float64
 5   local_watching_time  581676 non-null  int64  
 6   video_progress_time  581676 non-null  float64
 7   video_start_time     581676 non-null  float64
 8   video_end_time       581676 non-null  float64
 9   local_start_time     581676 non-null  object 
 10  local_end_time       581676 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 53.3+ MB


In [6]:
# Convert the wall-clock timestamp columns from strings to datetime64.
for ts_col in ('local_start_time', 'local_end_time'):
    df[ts_col] = pd.to_datetime(df[ts_col])
gc.collect()

0

In [7]:
# Grouping keys: uid2 identifies a (user, course) pair, uid1 additionally identifies the video.
uid2 = ['item_id', 'course_id']
uid1 = uid2 + ['video_id']

# Video-level features (video维度)

In [8]:
# Collapse the raw log to one row per (item_id, course_id, video_id):
#   - counts and times are summed over the repeated rows,
#   - video_duration keeps its maximum,
#   - the progress-bar and wall-clock ranges keep their earliest start / latest end.
# Aggregations are given as string names rather than NumPy callables (np.sum, np.max,
# np.min): pandas has deprecated mapping those callables onto its own groupby
# reductions, so the strings are what keeps the current behaviour.
df_video = df.groupby(uid1, as_index=False).agg({
    'watching_count': 'sum',
    'video_duration': 'max',
    'local_watching_time': 'sum',
    'video_progress_time': 'sum',
    'video_start_time': 'min',
    'video_end_time': 'max',
    'local_start_time': 'min',
    'local_end_time': 'max',
})

gc.collect()

0

In [9]:
# Number of raw log rows per (user, course, video) key.
# Named aggregation replaces `groupby(...)[col].agg({'new_name': func})`: that
# dict-renaming form only worked by accident with as_index=False in pandas 1.x and is
# deprecated/removed in later releases.
video_row_counts = df.groupby(uid1, as_index=False).agg(
    video_cnt=('watching_count', 'count'),
)
df_video = df_video.merge(video_row_counts, on=uid1, how='left')

del video_row_counts
gc.collect()

20

In [10]:
# Progress-bar span that was covered: latest position minus earliest position.
df_video['video_watching_duration'] = df_video['video_end_time'] - df_video['video_start_time']

# Wall-clock span between the first start and the last end of watching this video, in seconds.
# `.dt.total_seconds()` is used instead of `Timedelta.seconds` because `.seconds` only
# returns the sub-day remainder, so a span longer than 24 hours was silently truncated.
# The subtraction is vectorized, replacing the row-wise `apply`. The result is float.
df_video['local_watching_duration'] = (
    df_video['local_end_time'] - df_video['local_start_time']
).dt.total_seconds()

In [11]:
# Shorthand views of the columns the derived features are built from.
start_pos = df_video['video_start_time']
end_pos = df_video['video_end_time']
covered_span = df_video['video_watching_duration']
video_length = df_video['video_duration']

# Last progress-bar position relative to the video length (negative: the end was not reached).
df_video['video_is_end'] = end_pos.sub(video_length)

# 1 when the earliest progress-bar position is past 0, i.e. the viewer did not start at the beginning.
df_video['video_is_start'] = start_pos.gt(0).astype('int64')

# Covered progress-bar span relative to the video length (negative: not watched start to end).
df_video['video_is_start_end'] = covered_span.sub(video_length)

# Covered span minus accumulated progress time (seconds skipped by seeking).
df_video['forward_seconds'] = covered_span.sub(df_video['video_progress_time'])

# TODO: pause duration feature (not implemented yet).

# Discrepancy between the wall-clock span and the recorded watching time.
df_video['local_watching_diff'] = df_video['local_watching_duration'].sub(df_video['local_watching_time'])

# Ratio of watching time to progress time; 0.001 avoids division by zero.
# NOTE(review): named "speed", but this is wall-clock time per second of video, which
# looks like the inverse of a playback speed - confirm the intended direction.
df_video['speed'] = df_video['local_watching_time'].div(df_video['video_progress_time'].add(0.001))

In [12]:
# Columns available on the per-video frame after feature construction.
df_video.columns

Index(['item_id', 'course_id', 'video_id', 'watching_count', 'video_duration',
       'local_watching_time', 'video_progress_time', 'video_start_time',
       'video_end_time', 'local_start_time', 'local_end_time', 'video_cnt',
       'video_watching_duration', 'local_watching_duration', 'video_is_end',
       'video_is_start', 'video_is_start_end', 'forward_seconds',
       'local_watching_diff', 'speed'],
      dtype='object')

# Course-level features (course维度)

In [13]:
# Start the per-(user, course) frame with the earliest start and latest end timestamp
# over all of the user's videos in that course.
# String aggregation names replace np.min / np.max, whose mapping onto the groupby
# reductions is deprecated in pandas.
df_course = df_video.groupby(uid2, as_index=False).agg({
    'local_start_time': 'min',
    'local_end_time': 'max',
})

df_course.shape

(40727, 4)

In [14]:
# ['item_id', 'course_id', 'video_id', 'watching_count', 'video_duration',
#  'local_watching_time', 'video_progress_time', 'video_start_time',
#  'video_end_time', 'local_start_time', 'local_end_time', 'video_cnt',
#  'video_watching_duration', 'local_watching_duration', 'video_is_end',
#  'video_is_start', 'video_is_start_end', 'forward_seconds',
#  'local_watching_diff', 'speed']

In [15]:
# Per-(user, course) summary statistics of `watching_count` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
watching_count_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'watching_count_{stat}': ('watching_count', stat) for stat in stat_names}
)
df_course = df_course.merge(watching_count_stats, on=uid2, how='left')
del watching_count_stats
gc.collect()

0

In [16]:
# Per-(user, course) summary statistics of `video_duration` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_duration_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_duration_{stat}': ('video_duration', stat) for stat in stat_names}
)
df_course = df_course.merge(video_duration_stats, on=uid2, how='left')
del video_duration_stats
gc.collect()

0

In [17]:
# Per-(user, course) summary statistics of `local_watching_time` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
local_watching_time_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'local_watching_time_{stat}': ('local_watching_time', stat) for stat in stat_names}
)
df_course = df_course.merge(local_watching_time_stats, on=uid2, how='left')
del local_watching_time_stats
gc.collect()

0

In [18]:
# Per-(user, course) summary statistics of `video_progress_time` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_progress_time_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_progress_time_{stat}': ('video_progress_time', stat) for stat in stat_names}
)
df_course = df_course.merge(video_progress_time_stats, on=uid2, how='left')
del video_progress_time_stats
gc.collect()

0

In [19]:
# Per-(user, course) summary statistics of `video_watching_duration` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_watching_duration_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_watching_duration_{stat}': ('video_watching_duration', stat) for stat in stat_names}
)
df_course = df_course.merge(video_watching_duration_stats, on=uid2, how='left')
del video_watching_duration_stats
gc.collect()

0

In [20]:
# Per-(user, course) summary statistics of `local_watching_duration` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
local_watching_duration_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'local_watching_duration_{stat}': ('local_watching_duration', stat) for stat in stat_names}
)
df_course = df_course.merge(local_watching_duration_stats, on=uid2, how='left')
del local_watching_duration_stats
gc.collect()

0

In [21]:
# Per-(user, course) summary statistics of `video_is_end` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_is_end_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_is_end_{stat}': ('video_is_end', stat) for stat in stat_names}
)
df_course = df_course.merge(video_is_end_stats, on=uid2, how='left')
del video_is_end_stats
gc.collect()

0

In [22]:
# Per-(user, course) summary statistics of `video_is_start` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_is_start_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_is_start_{stat}': ('video_is_start', stat) for stat in stat_names}
)
df_course = df_course.merge(video_is_start_stats, on=uid2, how='left')
del video_is_start_stats
gc.collect()

0

In [23]:
# Per-(user, course) summary statistics of `video_is_start_end` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
video_is_start_end_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'video_is_start_end_{stat}': ('video_is_start_end', stat) for stat in stat_names}
)
df_course = df_course.merge(video_is_start_end_stats, on=uid2, how='left')
del video_is_start_end_stats
gc.collect()

0

In [24]:
# Per-(user, course) summary statistics of `forward_seconds` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
forward_seconds_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'forward_seconds_{stat}': ('forward_seconds', stat) for stat in stat_names}
)
df_course = df_course.merge(forward_seconds_stats, on=uid2, how='left')
del forward_seconds_stats
gc.collect()

0

In [25]:
# Per-(user, course) summary statistics of `local_watching_diff` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
local_watching_diff_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'local_watching_diff_{stat}': ('local_watching_diff', stat) for stat in stat_names}
)
df_course = df_course.merge(local_watching_diff_stats, on=uid2, how='left')
del local_watching_diff_stats
gc.collect()

0

In [26]:
# Per-(user, course) summary statistics of `speed` over the user's videos.
# Named aggregation (output_name=(column, func)) replaces the dict-renaming form
# `groupby(...)[col].agg({'name': func})`, which is deprecated/removed in newer pandas.
stat_names = ('sum', 'count', 'mean', 'max', 'min', 'std', 'skew')
speed_stats = df_video.groupby(uid2, as_index=False).agg(
    **{f'speed_{stat}': ('speed', stat) for stat in stat_names}
)
df_course = df_course.merge(speed_stats, on=uid2, how='left')
del speed_stats
gc.collect()

0

In [27]:
# Number of df_video rows per user, i.e. how many (course, video) pairs the user watched
# across all courses. The column keeps its historical name `course_id_count`, although it
# counts videos rather than distinct courses.
# Named aggregation replaces the dict-renaming `agg({'name': func})` form, which is
# deprecated/removed in newer pandas.
user_video_counts = df_video.groupby('item_id', as_index=False).agg(
    course_id_count=('course_id', 'count'),
)
df_course = df_course.merge(user_video_counts, on='item_id', how='left')
del user_video_counts
gc.collect()

0

In [28]:
# Wall-clock span of the user's activity in the course: latest end minus earliest start, in seconds.
# `.dt.total_seconds()` is used instead of `Timedelta.seconds`: `.seconds` only returns the
# sub-day remainder, so any course watched over more than one day got a wrong duration.
# The vectorized subtraction also replaces the row-wise `apply`. The result is float.
df_course['local_watching_duration'] = (
    df_course['local_end_time'] - df_course['local_start_time']
).dt.total_seconds()

# Wall-clock span minus the total recorded watching time.
df_course['local_watching_diff'] = df_course['local_watching_duration'] - df_course['local_watching_time_sum']

# Wall-clock span minus the total length of the watched videos.
df_course['local_watching_video_duration'] = df_course['local_watching_duration'] - df_course['video_duration_sum']

In [29]:
# The raw first/last timestamps have been turned into duration features; remove them.
df_course = df_course.drop(columns=['local_start_time', 'local_end_time'])
gc.collect()

33

In [30]:
# Final list of per-(user, course) feature columns.
df_course.columns

Index(['item_id', 'course_id', 'watching_count_sum', 'watching_count_count',
       'watching_count_mean', 'watching_count_max', 'watching_count_min',
       'watching_count_std', 'watching_count_skew', 'video_duration_sum',
       'video_duration_count', 'video_duration_mean', 'video_duration_max',
       'video_duration_min', 'video_duration_std', 'video_duration_skew',
       'local_watching_time_sum', 'local_watching_time_count',
       'local_watching_time_mean', 'local_watching_time_max',
       'local_watching_time_min', 'local_watching_time_std',
       'local_watching_time_skew', 'video_progress_time_sum',
       'video_progress_time_count', 'video_progress_time_mean',
       'video_progress_time_max', 'video_progress_time_min',
       'video_progress_time_std', 'video_progress_time_skew',
       'video_watching_duration_sum', 'video_watching_duration_count',
       'video_watching_duration_mean', 'video_watching_duration_max',
       'video_watching_duration_min', 'video_watc

In [31]:
# Load the label table and show which columns it provides.
label_path = '../act_2_label.h5'
act_2_label = pd.read_hdf(label_path)
act_2_label.columns

Index(['item_id', 'course_id', 'label'], dtype='object')

In [32]:
# Attach the label to every (user, course) feature row; rows without a label keep NaN.
df_course = pd.merge(df_course, act_2_label, how='left', on=uid2)

In [33]:
# Persist the feature table under key 'df'. `key` is passed by keyword because pandas
# has deprecated positional arguments to `to_hdf` other than the path.
# NOTE(review): `index=False` is kept from the original call; confirm it has the intended
# effect, since the frame's index may still be stored with the default HDF format.
df_course.to_hdf('../act_2_features.h5', key='df', index=False)