## 套件&環境

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

os.chdir('/content/drive/MyDrive/Colab/資料分析期末') #切換該目錄
os.listdir() #確認目錄內容

Mounted at /content/drive


['LSTM0429.ipynb',
 'data',
 'models',
 '0519KKTV.ipynb',
 'KKTV_LSTM0517.ipynb',
 '0520KKTV.ipynb',
 'submit.csv',
 '0521KKTV (light).ipynb',
 'preprocessed_output',
 'large0520.ipynb']

In [None]:
# always needed
import math, os, random, csv
import pandas as pd
import numpy as np

# log and save
import json, logging, pickle, sys, shutil, copy
# torch
import torch
import torch.nn
from torch.nn import Conv2d, MaxPool2d, Flatten, Linear, ReLU
import torchvision
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,ConcatDataset
from torchvision import datasets, models, transforms
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler

# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
%matplotlib inline
import seaborn as sns

# others
import matplotlib.pyplot as plt
from PIL import Image

# sklearn
from sklearn import preprocessing

# statistics
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statistics

# seeds
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


## 前處理
- user_id:
De-identification user id. The training set has user_id 0 to 30459, while the testing set has user_id 30460 to 38075.
- device_id:
De-identification device id.
- session_id:
The set of consecutive events. It is unique for an individual user, but a session_id may map to multiple user_id.
- platform:
The OS in which the event occurred. Four types in total.
- internet_connection_type :
12 types in total.
- event_time:
The timestamp of the event is logged.
- title_id:
When the event is logged, the encoded title of the movie/drama the user is watching.
- action_trigger:
The action triggers the event, 14 in total.
- played_duration:
When the event is triggered, the length of the movie/drama has been played (in seconds).
- title_in_simulcast:
If the drama is aired weekly, "1" is logged; otherwise, you see "0". "2" means unknown.

In [None]:
df = pd.read_csv("./data/train_source_events.csv")
df

Unnamed: 0,user_id,device_id,session_id,title_id,event_time,played_duration,action_trigger,platform,title_in_simulcast,internet_connection_type
0,0,525,2328,384,1.648950e+09,1361,1,0,0,1
1,0,525,2328,384,1.648950e+09,2,0,0,0,1
2,0,525,2400,68,1.648952e+09,2,0,0,0,1
3,0,525,2400,68,1.648952e+09,20,9,0,0,1
4,0,532,2401,68,1.648952e+09,8,10,2,1,4
...,...,...,...,...,...,...,...,...,...,...
9714098,30459,139403,4218064,113,1.663082e+09,2157,1,0,0,1
9714099,30459,139403,4220179,113,1.663156e+09,2743,1,0,0,1
9714100,30459,139403,4220179,113,1.663159e+09,2723,1,0,0,1
9714101,30459,139403,4222301,113,1.663242e+09,2699,1,0,0,1


#### 找哪個user炸裂

In [None]:
a = df[df['played_duration'] == 2065490]

In [None]:
a = df[df['user_id'] == 22595]
a['event_time'] = pd.to_datetime(a['event_time'], unit='s')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['event_time'] = pd.to_datetime(a['event_time'], unit='s')


In [None]:
a.loc[(a.index >= 7133750) & (a.index <= 7133760)]

Unnamed: 0,user_id,device_id,session_id,title_id,event_time,played_duration,action_trigger,platform,title_in_simulcast,internet_connection_type
7133750,22595,103815,3126310,464,2022-07-12 16:50:49.236000000,1467,1,0,0,1
7133751,22595,103815,3126310,464,2022-07-12 16:55:30.392000000,279,0,0,0,1
7133752,22595,103815,3126310,464,2022-07-12 17:15:51.040999936,1190,1,0,0,1
7133753,22595,103815,3127983,464,2022-07-13 16:57:33.813999872,380,9,0,0,1
7133754,22595,103815,3127983,464,2022-07-13 17:00:36.112999936,11,0,0,0,1
7133755,22595,103815,3129222,464,2022-07-14 14:35:26.064999936,2065490,1,0,0,1
7133756,22595,103815,3129222,464,2022-07-14 14:36:51.112000000,7,0,0,0,1
7133757,22595,103815,3129222,464,2022-07-14 14:37:29.160999936,10,0,0,0,1
7133758,22595,103815,3129222,464,2022-07-14 14:37:35.803000064,6,0,0,0,1
7133759,22595,103815,3129222,464,2022-07-14 15:00:18.304000000,1360,1,0,0,1


### 轉時間,小觀察

In [None]:
df['user_id'].value_counts()

28251    9941
8848     9323
27195    9282
13043    8662
23805    7560
         ... 
23766       1
28213       1
28210       1
8505        1
15718       1
Name: user_id, Length: 30460, dtype: int64

In [None]:
df['event_time'] = pd.to_datetime(df['event_time'], unit='s')
print(type(df['event_time'][0]))
df

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,user_id,device_id,session_id,title_id,event_time,played_duration,action_trigger,platform,title_in_simulcast,internet_connection_type
0,0,525,2328,384,2022-04-03 01:36:30.276999936,1361,1,0,0,1
1,0,525,2328,384,2022-04-03 01:36:35.136000000,2,0,0,0,1
2,0,525,2400,68,2022-04-03 02:06:01.136000000,2,0,0,0,1
3,0,525,2400,68,2022-04-03 02:06:36.019000064,20,9,0,0,1
4,0,532,2401,68,2022-04-03 02:07:04.120999936,8,10,2,1,4
...,...,...,...,...,...,...,...,...,...,...
9714098,30459,139403,4218064,113,2022-09-13 15:09:38.852000000,2157,1,0,0,1
9714099,30459,139403,4220179,113,2022-09-14 11:45:44.318000128,2743,1,0,0,1
9714100,30459,139403,4220179,113,2022-09-14 12:31:13.583000064,2723,1,0,0,1
9714101,30459,139403,4222301,113,2022-09-15 11:33:39.368999936,2699,1,0,0,1


In [None]:
min_event_time = df['event_time'].min()
max_event_time = df['event_time'].max()

print("Minimum event time:", min_event_time)
print("Maximum event time:", max_event_time)

Minimum event time: 2022-01-01 01:00:01.204999936
Maximum event time: 2022-09-17 00:59:58.588000


In [None]:
min_duration = df['played_duration'].min()
max_duration = df['played_duration'].max()

print("Minimum played_duration:", min_duration)
print("Maximum played_duration:", max_duration)

Minimum played_duration: 1
Maximum played_duration: 2065490


## 正式處理

## played duration是用秒來計算, 一天86400秒/4time-slot 

In [None]:
start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2022-09-17')

print('end_date-start_date: ', end_date - start_date)
num_weeks = (end_date - start_date).days / 7
print('weeks:', num_weeks)

end_date-start_date:  259 days 00:00:00
weeks: 37.0


# 開始專注於處理特徵 
日期方面直接用datatime物件操作

In [None]:
amount_ppl = len(df['user_id'].unique())
amount_ppl

30460

## duration
先不分成週，因為有些duration會超過，不好累加到下一週<br>
用timeslot直接來算每個slot的duration<br>
最後再轉成矩陣物件(每個人37*28)

In [None]:
import pandas as pd

start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2022-09-17')

time_slots = []

for i in range(259):
    # 01:00-09:00
    start_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=1)
    end_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=9)
    time_slots.append((start_slot, end_slot))
    
    # 09:00-17:00
    start_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=9)
    end_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=17)
    time_slots.append((start_slot, end_slot))
    
    # 17:00-21:00
    start_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=17)
    end_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=21)
    time_slots.append((start_slot, end_slot))
    
    # 21:00 to next day 01:00
    start_slot = start_date + pd.DateOffset(hours=i*24) + pd.DateOffset(hours=21)
    end_slot = start_date + pd.DateOffset(hours=(i+1)*24) + pd.DateOffset(hours=1)
    time_slots.append((start_slot, end_slot))

for slot in time_slots:
    print(slot[0], slot[1])
    break;

2022-01-01 01:00:00 2022-01-01 09:00:00


In [None]:
time_slots[1][0]

Timestamp('2022-01-01 09:00:00')

In [None]:
# num_slots = 1036
# duration_mtx = np.zeros((amount_ppl, num_slots))
# duration_mtx.shape

In [None]:
# for slot_index, slot in enumerate(time_slots):
#     start_slot = slot[0]
#     end_slot = slot[1]
#     print(slot_index, start_slot)

In [None]:
num_slots = 1036
duration_mtx = np.zeros((amount_ppl, num_slots))
duration_mtx.shape
for current_slot_index, slot in enumerate(time_slots):
    
    start_slot = slot[0]
    end_slot = slot[1]
    
    #if(current_slot_index == 2) : break
    print(f'slot_start: {start_slot}, slot_end: {end_slot}')

    slot_df = df[(df['event_time'] >= start_slot) & (df['event_time'] <= end_slot)] #所有屬於這個slot的data
    # display(slot_df)

    for idx, row in slot_df.iterrows(): #每個row代表這個slot的其中一筆user的使用資料
        user_id = row['user_id']
        duration = float(row['played_duration'])
        current_time = row['event_time']
        #print(f'\n\ntime_diff_seconds: {time_diff_seconds}')
        #print(f'played_duration: {duration}')
        #print(f'user_id: {user_id}')

        '''要處理duration時長過多溢出的問題, 理論上每段duration在一個slot最高是3600*8=28800
            Minimum played_duration: 1,
            Maximum played_duration: 2065490'''
        tmp_slot = current_slot_index #這段duration應該要加到user_id的哪一個slot

        time_diff_seconds = (current_time - start_slot).total_seconds() #從這個slot的起點已經過了多久了到這個event_time
        minus_value = min(time_diff_seconds, duration)
        #print(f'minus: {minus_value}')
        #print(f'duration_mtx value:, {duration_mtx[user_id][tmp_slot]}')
        duration_mtx[user_id][tmp_slot] += minus_value
        #print(f'duration_mtx value:, {duration_mtx[user_id][tmp_slot]}')
        duration -= minus_value
        tmp_slot -= 1 #準備進入之前迴圈
        while(duration > 0 and tmp_slot >= 0):
            #print(f'in the loop duration: {duration}')
            this_slot_duration = time_slots[tmp_slot][1] - time_slots[tmp_slot][0]
            minus_value = min(this_slot_duration.total_seconds(), duration) #看是duration剩餘的多還是這個slot總時間
            #print(f'in the loop minus_value: {minus_value}')
            duration_mtx[user_id][tmp_slot] += minus_value
            #print(f'duration_mtx value:, {duration_mtx[user_id][tmp_slot]}\n')
            duration -= minus_value
            tmp_slot -= 1

slot_start: 2022-01-01 01:00:00, slot_end: 2022-01-01 09:00:00
slot_start: 2022-01-01 09:00:00, slot_end: 2022-01-01 17:00:00
slot_start: 2022-01-01 17:00:00, slot_end: 2022-01-01 21:00:00
slot_start: 2022-01-01 21:00:00, slot_end: 2022-01-02 01:00:00
slot_start: 2022-01-02 01:00:00, slot_end: 2022-01-02 09:00:00
slot_start: 2022-01-02 09:00:00, slot_end: 2022-01-02 17:00:00
slot_start: 2022-01-02 17:00:00, slot_end: 2022-01-02 21:00:00
slot_start: 2022-01-02 21:00:00, slot_end: 2022-01-03 01:00:00
slot_start: 2022-01-03 01:00:00, slot_end: 2022-01-03 09:00:00
slot_start: 2022-01-03 09:00:00, slot_end: 2022-01-03 17:00:00
slot_start: 2022-01-03 17:00:00, slot_end: 2022-01-03 21:00:00
slot_start: 2022-01-03 21:00:00, slot_end: 2022-01-04 01:00:00
slot_start: 2022-01-04 01:00:00, slot_end: 2022-01-04 09:00:00
slot_start: 2022-01-04 09:00:00, slot_end: 2022-01-04 17:00:00
slot_start: 2022-01-04 17:00:00, slot_end: 2022-01-04 21:00:00
slot_start: 2022-01-04 21:00:00, slot_end: 2022-01-05 0

In [None]:
duration_mtx

array([[   0.,    0.,    0., ..., 1140.,    0.,    0.],
       [   0.,    0.,    0., ..., 4395.,    0., 1001.],
       [   0.,    0.,    0., ..., 1186., 3409.,    0.],
       ...,
       [   0.,    0.,    0., ...,  304.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.]])

In [None]:
largest = max(max(row) for row in duration_mtx)
largest

154013.008

## 從這邊開始load 原始處理過的duration_mtx

In [None]:
# np.save('./preprocessed_output/duration_data_origin.npy', duration_mtx)
# loaded_data = np.load('./preprocessed_output/duration_data_origin.npy')
# np.save('duration_data_origin.npy', duration_mtx)
duration_mtx = np.load('duration_data_origin.npy')
duration_mtx

array([[   0.,    0.,    0., ..., 1140.,    0.,    0.],
       [   0.,    0.,    0., ..., 4395.,    0., 1001.],
       [   0.,    0.,    0., ..., 1186., 3409.,    0.],
       ...,
       [   0.,    0.,    0., ...,  304.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.]])

## 對30460*1036的duration矩陣做出一些操作：
- 分別對短slot,長slot操作

In [None]:
short_slot = (np.arange(0, duration_mtx.shape[1], 4).reshape(-1, 1) + np.arange(2)).flatten()
long_slot = (np.arange(2, duration_mtx.shape[1], 4).reshape(-1, 1) + np.arange(2)).flatten()
def modify_long(n):
    m = n**(0.2) / (28800**0.2)
    return max(0, min(1, m))
def modify_short(n):
    m = n**(0.2) / (14400**0.2)
    return max(0, min(1, m))

In [None]:
duration_mtx

array([[   0.,    0.,    0., ..., 1140.,    0.,    0.],
       [   0.,    0.,    0., ..., 4395.,    0., 1001.],
       [   0.,    0.,    0., ..., 1186., 3409.,    0.],
       ...,
       [   0.,    0.,    0., ...,  304.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.]])

In [None]:
for i, data in enumerate(duration_mtx):
    for idx in short_slot:
        duration_mtx[i][idx] = modify_short(duration_mtx[i][idx])
    for idx in long_slot:
        duration_mtx[i][idx] = modify_long(duration_mtx[i][idx])

In [None]:
duration_mtx

array([[0.        , 0.        , 0.        , ..., 0.60215524, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.78871351, 0.        ,
        0.51074993],
       [0.        , 0.        , 0.        , ..., 0.60693815, 0.65259975,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.46227693, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
largest = max(max(row) for row in duration_mtx)
largest

1.0

In [None]:
np.save('./preprocessed_output/duration_data_scaled.npy', duration_mtx)

In [None]:
a = np.array([[1,2,3,4,5,5], [6,7,8,9,10,11], [77,7,8,9,10,11], [6,7,8,9,10,12]])
a1 = a.reshape(4, 2, 3)
a1

array([[[ 1,  2,  3],
        [ 4,  5,  5]],

       [[ 6,  7,  8],
        [ 9, 10, 11]],

       [[77,  7,  8],
        [ 9, 10, 11]],

       [[ 6,  7,  8],
        [ 9, 10, 12]]])

In [None]:
a

array([[ 1,  2,  3,  4,  5,  5],
       [ 6,  7,  8,  9, 10, 11],
       [77,  7,  8,  9, 10, 11],
       [ 6,  7,  8,  9, 10, 12]])

In [None]:
indices = np.array([0,1,4,5])
def addone(x):
    return x+1 
a[:, indices] = addone(a[:, indices])
a

array([[ 2,  3,  3,  4,  6,  6],
       [ 7,  8,  8,  9, 11, 12],
       [78,  8,  8,  9, 11, 12],
       [ 7,  8,  8,  9, 11, 13]])

## 處理完duration

In [None]:
duration_mtx = np.load('./preprocessed_output/duration_data_scaled.npy')
duration_mtx

array([[0.        , 0.        , 0.        , ..., 0.60215524, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.78871351, 0.        ,
        0.51074993],
       [0.        , 0.        , 0.        , ..., 0.60693815, 0.65259975,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.46227693, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
duration_mtx.shape

(30460, 1036)

In [None]:
# x軸100為間隔
bin_size = 10
bin_range = range(int(min(flattened_list)), int(max(flattened_list)) + bin_size, bin_size)
sns.histplot(flattened_list, bins=bin_range)
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.show()

NameError: ignored

## duration (先創他1036個timeslot)

In [None]:
start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2022-09-17')

# Create the time slots
time_slots = pd.date_range(start='00:00:00', end='23:59:59', freq='6H').strftime('%H:%M:%S')

# Calculate the number of weeks
num_weeks = int((end_date - start_date).days / 7)

# Calculate the number of time slots per week
num_time_slots = 28

# Create the matrix to store the imputed values
matrix = np.zeros((num_weeks, num_time_slots))
matrix.shape

In [None]:
num_weeks = 37
for i in range(num_weeks):
    # Calculate the start and end dates for the current week
    week_start_date = start_date + pd.DateOffset(weeks=i)
    week_end_date = week_start_date + pd.DateOffset(weeks=1) - pd.DateOffset(days=1)

    print(f'start:{week_start_date}, end: {week_end_date}')
    if(i == 2) : break

    # Filter the dataframe for the current week
    week_df = df[(df['event_time'] >= week_start_date) & (df['event_time'] <= week_end_date)]
    display(week_df)

    #slot_sum = 0 #用來驗證是否剛好算到df中的每個slot
    for j in range(num_time_slots): #這週中的每個timeslot
        slot_start_time = week_start_date + pd.DateOffset(hours=(j * 6)) 
        slot_end_time = slot_start_time + pd.DateOffset(hours=6)

        # Filter the dataframe for the current time slot
        slot_df = week_df[(week_df['event_time'] >= slot_start_time) & (week_df['event_time'] < slot_end_time)]
        #slot_sum += len(slot_df)
        #print(slot_sum, len(week_df))


In [None]:
a = pd.date_range(start='00:00:00', end='23:59:59', freq='6H').strftime('%H:%M:%S')
print(a)

In [None]:
type(df['event_time'][0])