In [1]:
import time
import numpy as np
import pandas as pd
import gc
import warnings
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb

warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
train = pd.read_hdf('../input/train_video.h5')
train.head()

Unnamed: 0,item_id,course_id,video_id,video_cnt,watching_count,video_duration,local_watching_time,video_progress_time,video_start_time,video_end_time,local_start_time,local_end_time
0,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_0d2ed7b0f77647388d3ad5420d1d6549,1,2,415.0,415,414.809998,0.0,415.0,2017-07-13 09:01:58,2017-07-13 13:18:15
1,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_330acdbc14db49a88adbbc10c082155e,1,4,282.0,240,239.850001,0.01,212.149994,2017-07-13 08:25:40,2017-07-13 08:38:45
2,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_42b82d4372ed4350b661f68a825594be,1,1,306.0,307,306.0,0.0,306.0,2017-07-06 09:45:38,2017-07-06 09:50:45
3,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_6d701687d50e4ee4a897b3c74c45afa1,1,1,851.0,851,851.0,0.0,851.0,2017-07-13 13:18:57,2017-07-13 13:33:08
4,U_0,C_course-v1:TsinghuaX+00691153X+sp,V_74598872443d4c10848782f021e1d0af,1,1,898.0,202,201.529999,0.0,201.529999,2017-07-13 08:41:20,2017-07-13 08:44:42


In [3]:
train['video_cnt'].unique()

array([1, 2, 3], dtype=int64)

In [4]:
test = pd.read_hdf('../input/test_video.h5')
test.head()

Unnamed: 0,item_id,course_id,video_id,video_cnt,watching_count,video_duration,local_watching_time,video_progress_time,video_start_time,video_end_time,local_start_time,local_end_time
0,T_1,C_course-v1:TsinghuaX+00612642X+sp,V_01292610aa1748e79c8981de6f0464f8,1,2,1084.0,722,1080.65996,1.18,1081.959961,2018-05-12 17:49:32,2018-05-12 18:02:07
1,T_1,C_course-v1:TsinghuaX+00612642X+sp,V_0255c95155a9478291d4bb7818f22c07,1,1,85.0,69,84.98,0.02,85.0,2018-05-04 16:58:41,2018-05-04 16:59:50
2,T_1,C_course-v1:TsinghuaX+00612642X+sp,V_03df78abe20a439699d16f7608ae9425,1,1,1165.0,778,1165.0,0.0,1165.0,2018-05-04 18:29:10,2018-05-04 18:42:08
3,T_1,C_course-v1:TsinghuaX+00612642X+sp,V_0585e40baa8644319760cfca62354112,1,1,532.0,426,532.0,0.0,532.0,2018-05-09 16:37:49,2018-05-09 16:44:55
4,T_1,C_course-v1:TsinghuaX+00612642X+sp,V_0b4c5734035e4eb4a2a50d21707f0af3,1,4,937.0,747,931.999974,0.27,937.0,2018-05-10 18:16:35,2018-05-10 18:35:03


In [5]:
test['video_cnt'].unique()

array([1, 2], dtype=int64)

In [6]:
df_video = pd.concat([train, test], axis=0, ignore_index=False)
df_video.shape

(3531915, 12)

In [7]:
uid1 = ['item_id', 'course_id', 'video_id']
uid2 = ['item_id', 'course_id']

# video维度# 

In [8]:
# 进度条最晚的时间点-进度条最早的时间点
df_video['video_watching_duration'] = df_video['video_end_time'] - df_video['video_start_time']

# 视频：观看结束时间-观看开始时间
df_video['local_watching_duration'] = df_video.apply(lambda x: (x['local_end_time'] - x['local_start_time']).seconds, axis=1)

In [9]:
# 进度条是否读到最后，小于0
df_video['video_is_end'] = df_video['video_end_time'] - df_video['video_duration']

# 一开始是否拖到进度条
df_video['video_is_start'] = df_video['video_start_time'].map(lambda x: 1 if x > 0 else 0)

# 进度条是否从开始读到最后，小于0
df_video['video_is_start_end'] = df_video['video_watching_duration'] - df_video['video_duration']

# 进度条拖动的秒速 - 暂停时长
df_video['forward_seconds'] = df_video['video_watching_duration'] - df_video['video_progress_time']

# 暂停时长，
# df[''] = df[''] - df['']

# 实际观看时长是否有差误
df_video['local_watching_diff'] = df_video['local_watching_duration'] - df_video['local_watching_time']

# 倍速
df_video['speed'] = df_video['local_watching_time'] / (df_video['video_progress_time'] + 0.001)

In [10]:
df_video.columns

Index(['item_id', 'course_id', 'video_id', 'video_cnt', 'watching_count',
       'video_duration', 'local_watching_time', 'video_progress_time',
       'video_start_time', 'video_end_time', 'local_start_time',
       'local_end_time', 'video_watching_duration', 'local_watching_duration',
       'video_is_end', 'video_is_start', 'video_is_start_end',
       'forward_seconds', 'local_watching_diff', 'speed'],
      dtype='object')

# course维度

In [11]:
df_course = df_video.groupby(uid2, as_index=False).agg({
    'local_start_time': np.min,
    'local_end_time': np.max
})

df_course.shape

(247957, 4)

In [12]:
for i in tqdm(['local_end_time', 'local_start_time']):
    df_course['{i}_year'] = df_course[i].dt.year
    df_course['{i}_month'] = df_course[i].dt.month
    df_course['{i}_day'] = df_course[i].dt.day
    df_course['{i}_hour'] = df_course[i].dt.hour
    df_course['{i}_dow'] = df_course[i].dt.dayofweek

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.62it/s]


In [13]:
# voc_day_cnt_res = df_course.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
# for i in df_voc['voc_day'].unique():
#     phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

In [14]:
# video_cnt
tmp = df_video.groupby(uid2, as_index=False)['video_cnt'].agg({
    'video_cnt_sum': 'sum',
    'video_cnt_count': 'count',
    'video_cnt_mean': 'mean',
    'video_cnt_max': 'max',
    'video_cnt_min': 'min',
    'video_cnt_std': 'std',
    'video_cnt_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [15]:
# watching_count
tmp = df_video.groupby(uid2, as_index=False)['watching_count'].agg({
    'watching_count_sum': 'sum',
    # 'watching_count_count': 'count',
    'watching_count_mean': 'mean',
    'watching_count_max': 'max',
    'watching_count_min': 'min',
    'watching_count_std': 'std',
    'watching_count_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [16]:
# video_duration
tmp = df_video.groupby(uid2, as_index=False)['video_duration'].agg({
    'video_duration_sum': 'sum',
    # 'video_duration_count': 'count',
    'video_duration_mean': 'mean',
    'video_duration_max': 'max',
    'video_duration_min': 'min',
    'video_duration_std': 'std',
    'video_duration_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [17]:
# local_watching_time
tmp = df_video.groupby(uid2, as_index=False)['local_watching_time'].agg({
    'local_watching_time_sum': 'sum',
    # 'local_watching_time_count': 'count',
    'local_watching_time_mean': 'mean',
    'local_watching_time_max': 'max',
    'local_watching_time_min': 'min',
    'local_watching_time_std': 'std',
    'local_watching_time_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [18]:
# video_progress_time
tmp = df_video.groupby(uid2, as_index=False)['video_progress_time'].agg({
    'video_progress_time_sum': 'sum',
    # 'video_progress_time_count': 'count',
    'video_progress_time_mean': 'mean',
    'video_progress_time_max': 'max',
    'video_progress_time_min': 'min',
    'video_progress_time_std': 'std',
    'video_progress_time_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [19]:
# video_watching_duration
tmp = df_video.groupby(uid2, as_index=False)['video_watching_duration'].agg({
    'video_watching_duration_sum': 'sum',
    # 'video_watching_duration_count': 'count',
    'video_watching_duration_mean': 'mean',
    'video_watching_duration_max': 'max',
    'video_watching_duration_min': 'min',
    'video_watching_duration_std': 'std',
    'video_watching_duration_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [20]:
# local_watching_duration
tmp = df_video.groupby(uid2, as_index=False)['local_watching_duration'].agg({
    'local_watching_duration_sum': 'sum',
    # 'local_watching_duration_count': 'count',
    'local_watching_duration_mean': 'mean',
    'local_watching_duration_max': 'max',
    'local_watching_duration_min': 'min',
    'local_watching_duration_std': 'std',
    'local_watching_duration_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [21]:
# video_is_end
tmp = df_video.groupby(uid2, as_index=False)['video_is_end'].agg({
    'video_is_end_sum': 'sum',
    # 'video_is_end_count': 'count',
    'video_is_end_mean': 'mean',
    'video_is_end_max': 'max',
    'video_is_end_min': 'min',
    'video_is_end_std': 'std',
    'video_is_end_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [22]:
# video_is_start
tmp = df_video.groupby(uid2, as_index=False)['video_is_start'].agg({
    'video_is_start_sum': 'sum',
    # 'video_is_start_count': 'count',
    'video_is_start_mean': 'mean',
    'video_is_start_max': 'max',
    'video_is_start_min': 'min',
    'video_is_start_std': 'std',
    'video_is_start_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [23]:
# video_is_start_end
tmp = df_video.groupby(uid2, as_index=False)['video_is_start_end'].agg({
    'video_is_start_end_sum': 'sum',
    # 'video_is_start_end_count': 'count',
    'video_is_start_end_mean': 'mean',
    'video_is_start_end_max': 'max',
    'video_is_start_end_min': 'min',
    'video_is_start_end_std': 'std',
    'video_is_start_end_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [24]:
# forward_seconds
tmp = df_video.groupby(uid2, as_index=False)['forward_seconds'].agg({
    'forward_seconds_sum': 'sum',
    # 'forward_seconds_count': 'count',
    'forward_seconds_mean': 'mean',
    'forward_seconds_max': 'max',
    'forward_seconds_min': 'min',
    'forward_seconds_std': 'std',
    'forward_seconds_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [25]:
# local_watching_diff
tmp = df_video.groupby(uid2, as_index=False)['local_watching_diff'].agg({
    'local_watching_diff_sum': 'sum',
    # 'local_watching_diff_count': 'count',
    'local_watching_diff_mean': 'mean',
    'local_watching_diff_max': 'max',
    'local_watching_diff_min': 'min',
    'local_watching_diff_std': 'std',
    'local_watching_diff_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [26]:
# speed
tmp = df_video.groupby(uid2, as_index=False)['speed'].agg({
    'speed_sum': 'sum',
    # 'speed_count': 'count',
    'speed_mean': 'mean',
    'speed_max': 'max',
    'speed_min': 'min',
    'speed_std': 'std',
    'speed_skew': 'skew'
})
df_course = df_course.merge(tmp, on=uid2, how='left')
del tmp
gc.collect()

0

In [27]:
# 课程：观看结束时间-观看开始时间
df_course['local_watching_duration'] = df_course.apply(lambda x: (x['local_end_time'] - x['local_start_time']).seconds, axis=1)

df_course['local_watching_diff'] = df_course['local_watching_duration'] - df_course['local_watching_time_sum']

df_course['local_watching_video_duration'] = df_course['local_watching_duration'] - df_course['video_duration_sum']

In [28]:
df_course.drop(['local_start_time', 'local_end_time'], axis=1, inplace=True)
gc.collect()

33

In [29]:
df_course.columns

Index(['item_id', 'course_id', '{i}_year', '{i}_month', '{i}_day', '{i}_hour',
       '{i}_dow', 'video_cnt_sum', 'video_cnt_count', 'video_cnt_mean',
       'video_cnt_max', 'video_cnt_min', 'video_cnt_std', 'video_cnt_skew',
       'watching_count_sum', 'watching_count_mean', 'watching_count_max',
       'watching_count_min', 'watching_count_std', 'watching_count_skew',
       'video_duration_sum', 'video_duration_mean', 'video_duration_max',
       'video_duration_min', 'video_duration_std', 'video_duration_skew',
       'local_watching_time_sum', 'local_watching_time_mean',
       'local_watching_time_max', 'local_watching_time_min',
       'local_watching_time_std', 'local_watching_time_skew',
       'video_progress_time_sum', 'video_progress_time_mean',
       'video_progress_time_max', 'video_progress_time_min',
       'video_progress_time_std', 'video_progress_time_skew',
       'video_watching_duration_sum', 'video_watching_duration_mean',
       'video_watching_duration_max',

In [31]:
train_label = pd.read_hdf('../input//train_label.h5')

df_course = df_course.merge(train_label, on=uid2, how='left')
gc.collect()

93

In [None]:
train = df.loc[df_course['label'].notnull(), :]
test = df.loc[df_course['label'].isnull(), :]
train.shape, test.shape

In [None]:
train.drop(['item_id', 'course_id'], axis=1, inplace=True)
train.shape

In [None]:
submit = test[['item_id', 'course_id']]
test.drop(['item_id', 'course_id', 'label'], axis=1, inplace=True)
test.shape

In [None]:
y = train['label']
train.drop('label', axis=1, inplace=True)
train.shape

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, stratify=y, random_state=2020, shuffle=True)

In [None]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)
lgb_all = lgb.Dataset(train, y, reference=lgb_train)

In [None]:
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'binary_error',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'min_data_in_leaf': 20,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

In [None]:
lgb_model_valid = lgb.train(params,
                            lgb_train,
                            valid_sets=[lgb_train, lgb_valid],
                            early_stopping_rounds=500,
                            num_boost_round=100000,
                            verbose_eval=500)

In [None]:
y_prob = lgb_model_valid.predict(X_valid)
y_pred = np.where(y_prob > 0.5, 1, 0)

acc = np.round(accuracy_score(y_valid, y_pred), 4)
auc = np.round(roc_auc_score(y_valid, y_prob), 4)

print('acc: ', acc)
print('auc: ', auc)

In [None]:
importance = lgb_model_valid.feature_importance(importance_type='gain')
feature_name = lgb_model_valid.feature_name()
feature_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance}).sort_values(by='importance', ascending=False)
feature_importance.to_csv('../input/feature_importance.csv', index=False)

In [None]:
lgb_model = lgb.train(params,
                      lgb_all,
                      num_boost_round=lgb_model_valid.best_iteration + 100)

In [None]:
submit['label'] = np.where(lgb_model.predict(test) > 0.5, 1, 0).astype(int)

In [None]:
test_label = pd.read_hdf('../input/test_label.h5')
test_label = test_label.merge(submit, how='left', on=['item_id', 'course_id'])
test_label.head()

In [None]:
def fun1(dx):
    return list(dx.values)

sub = test_label.groupby('item_id')['label'].apply(fun1).reset_index()
sub.columns = ['item_id', 'label_list']
sub = sub[['label_list', 'item_id']]

sub.to_json('../sub/sub_{}_{}.json'.format(time.strftime('%Y%m%d'), str(acc)), orient='records', lines=True)