In [89]:
import json
import pandas as pd
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

In [90]:
# Location of the dummy user-action log, relative to the notebook's working
# directory. Forward slashes are used because they resolve on Windows, Linux
# and macOS alike, whereas a backslash-separated literal only works on Windows.
LOG_DATA_PATH = "data/final_user-actions_dummy.json"

# Read the whole file as a single JSON document.
with open(LOG_DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)
    # If the log is ever stored as .jsonl (one JSON object per line), use:
    # data = [json.loads(line.strip()) for line in f if line.strip()]

# Flatten the nested records into one row each; nested keys become dotted
# column names such as "details.state".
df = pd.json_normalize(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event                 100 non-null    object
 1   userId                100 non-null    object
 2   username              100 non-null    object
 3   timestamp             100 non-null    object
 4   workspaceId           100 non-null    int64 
 5   details.participants  100 non-null    object
 6   details.state         100 non-null    object
 7   details.name          100 non-null    object
 8   details.importance    100 non-null    int64 
 9   details.startDate     47 non-null     object
 10  details.endDate       47 non-null     object
dtypes: int64(2), object(9)
memory usage: 8.7+ KB


In [91]:
# Duplicate each task row once per participant so that every (user, task)
# pair gets a row of its own.
df = df.explode('details.participants')

# Pull the user id out of each exploded participant entry; anything that is
# not a dict is mapped to None and removed further down.
df['participant_userId'] = df['details.participants'].apply(
    lambda entry: entry.get('userId') if isinstance(entry, dict) else None
)

# Keep only the fields the statistics need (participant_userId is the key
# everything downstream is grouped by), then drop rows without a participant.
df = df[
    [
        'participant_userId',
        'details.state',
        'details.importance',
        'details.startDate',
        'details.endDate',
    ]
].dropna(subset=['participant_userId'])

# Build one independent frame per participant, keyed as 'df_<userId>'.
user_dfs = {}
for uid in df['participant_userId'].unique():
    belongs_to_user = df['participant_userId'] == uid
    user_dfs[f'df_{uid}'] = df[belongs_to_user].copy()

In [92]:
# Show every per-user frame (dict mapping 'df_<userId>' to its DataFrame).
user_dfs

{'df_3':     participant_userId details.state  details.importance  \
 0                    3   IN_PROGRESS                   1   
 6                    3        BEFORE                   1   
 8                    3          DONE                   3   
 10                   3        BEFORE                   3   
 11                   3          DONE                   5   
 13                   3   IN_PROGRESS                   4   
 15                   3   IN_PROGRESS                   1   
 16                   3          DONE                   5   
 17                   3        BEFORE                   5   
 19                   3        BEFORE                   4   
 21                   3          DONE                   5   
 23                   3        BEFORE                   5   
 24                   3          DONE                   2   
 26                   3        BEFORE                   4   
 27                   3          DONE                   2   
 28             

In [93]:
# Distinct participant ids that remain after the explode / dropna step.
df['participant_userId'].unique()

array([3, 5, 1, 2, 4])

In [94]:
# # Split into one df per user
# # (disabled: the same split, with .copy(), is done where user_dfs is first built)
# user_dfs = {}

# for id in df['participant_userId'].unique():
#     user_dfs[f'df_{id}'] = df[df['participant_userId'] == id]

In [95]:
# Names under which the per-user frames are stored.
user_dfs.keys()

dict_keys(['df_3', 'df_5', 'df_1', 'df_2', 'df_4'])

In [96]:
# Spot-check a single user's frame.
user_dfs['df_3']  # test

Unnamed: 0,participant_userId,details.state,details.importance,details.startDate,details.endDate
0,3,IN_PROGRESS,1,,
6,3,BEFORE,1,,
8,3,DONE,3,2025-06-04T06:09:44.072653,2025-06-11T06:09:44.072653
10,3,BEFORE,3,,
11,3,DONE,5,2025-06-05T06:09:44.072701,2025-06-12T06:09:44.072701
13,3,IN_PROGRESS,4,,
15,3,IN_PROGRESS,1,,
16,3,DONE,5,2025-06-11T06:09:44.072806,2025-06-16T06:09:44.072806
17,3,BEFORE,5,,
19,3,BEFORE,4,,


In [97]:
# Build the list of participant ids, cast to an integer array
id_list = df['participant_userId'].unique().astype(int)
id_list

array([3, 5, 1, 2, 4])

In [98]:
# Per user, count how many tasks fall into each (state, importance) pair.
# Entries are keyed 'grouped1', 'grouped2', ... in the iteration order of user_dfs.
count_keys = ['participant_userId', 'details.state', 'details.importance']

group_list = {
    f'grouped{position}': frame.groupby(count_keys).size().reset_index(name='count')
    for position, frame in enumerate(user_dfs.values(), start=1)
}

In [99]:
# Names under which the grouped count frames are stored.
group_list.keys()

dict_keys(['grouped1', 'grouped2', 'grouped3', 'grouped4', 'grouped5'])

In [100]:
# Spot-check the first grouped count frame.
group_list['grouped1']

Unnamed: 0,participant_userId,details.state,details.importance,count
0,3,BEFORE,1,6
1,3,BEFORE,2,4
2,3,BEFORE,3,4
3,3,BEFORE,4,4
4,3,BEFORE,5,6
5,3,DONE,1,5
6,3,DONE,2,5
7,3,DONE,3,5
8,3,DONE,4,3
9,3,DONE,5,9


In [101]:
# Convert each grouped count frame into a list of row dicts (JSON-ready records).
for name, gr in group_list.items():
    json_temp = gr.to_dict(orient='records')

    # File export is currently disabled, so json_temp is simply overwritten on each pass.
    # with open(f"data\\stat1-{name}.json", "w", encoding="utf-8") as f:
    #     json.dump(json_temp, f, ensure_ascii=False, indent=2)

In [111]:
# Confirm the container type, then display every grouped count frame.
print(type(group_list))
group_list

<class 'dict'>


{'grouped1':     participant_userId details.state  details.importance  count
 0                    3        BEFORE                   1      6
 1                    3        BEFORE                   2      4
 2                    3        BEFORE                   3      4
 3                    3        BEFORE                   4      4
 4                    3        BEFORE                   5      6
 5                    3          DONE                   1      5
 6                    3          DONE                   2      5
 7                    3          DONE                   3      5
 8                    3          DONE                   4      3
 9                    3          DONE                   5      9
 10                   3   IN_PROGRESS                   1      2
 11                   3   IN_PROGRESS                   4      3
 12                   3   IN_PROGRESS                   5      1,
 'grouped2':     participant_userId details.state  details.importance  count


In [102]:
### stat 2
# Keep only each user's finished (DONE) tasks, together with the fields needed
# to work out how long they took.
stat2_columns = [
    'participant_userId',
    'details.state',
    'details.importance',
    'details.startDate',
    'details.endDate',
]

filtered_users = {}
for name, user_df in user_dfs.items():
    # The loop variable is deliberately not called `df`, so the notebook-level
    # `df` frame is not replaced by the last user's frame.
    is_done = user_df['details.state'] == 'DONE'
    # One .loc selection plus .copy() yields an independent frame, so columns
    # can be added to it later without a SettingWithCopyWarning.
    filtered_users[name] = user_df.loc[is_done, stat2_columns].copy()

In [103]:
# Names under which the DONE-only frames are stored.
filtered_users.keys()

dict_keys(['df_3', 'df_5', 'df_1', 'df_2', 'df_4'])

In [104]:
# Spot-check one user's DONE-only rows.
filtered_users['df_3']

Unnamed: 0,participant_userId,details.state,details.importance,details.startDate,details.endDate
8,3,DONE,3,2025-06-04T06:09:44.072653,2025-06-11T06:09:44.072653
11,3,DONE,5,2025-06-05T06:09:44.072701,2025-06-12T06:09:44.072701
16,3,DONE,5,2025-06-11T06:09:44.072806,2025-06-16T06:09:44.072806
21,3,DONE,5,2025-06-08T06:09:44.072910,2025-06-14T06:09:44.072910
24,3,DONE,2,2025-06-12T06:09:44.072956,2025-06-16T06:09:44.072956
27,3,DONE,2,2025-06-11T06:09:44.073003,2025-06-14T06:09:44.073003
28,3,DONE,1,2025-05-28T06:09:44.073019,2025-06-12T06:09:44.073019
29,3,DONE,5,2025-06-08T06:09:44.073036,2025-06-13T06:09:44.073036
30,3,DONE,1,2025-06-06T06:09:44.073049,2025-06-11T06:09:44.073049
37,3,DONE,1,2025-06-07T06:09:44.073152,2025-06-14T06:09:44.073152


In [105]:
def mean_done_duration(user_df):
    """Average duration of one user's DONE tasks, per importance level.

    Parameters
    ----------
    user_df : pandas.DataFrame
        One user's task rows with the columns 'participant_userId',
        'details.state', 'details.importance', 'details.startDate' and
        'details.endDate'.

    Returns
    -------
    pandas.DataFrame
        Columns 'participant_userId', 'details.importance' and 'mean_hours',
        where 'mean_hours' is the mean of (endDate - startDate) in hours.
    """
    done = user_df.loc[user_df['details.state'] == 'DONE']

    # Parse the date strings as timezone-aware (UTC) timestamps.
    started = pd.to_datetime(done['details.startDate'], utc=True)
    ended = pd.to_datetime(done['details.endDate'], utc=True)

    # .assign returns a new frame, so the frames held in user_dfs stay untouched.
    timed = done.assign(duration_hours=(ended - started).dt.total_seconds() / 3600)

    # Mean duration for each (user, importance) pair.
    return (
        timed.groupby(['participant_userId', 'details.importance'])['duration_hours']
        .mean()
        .reset_index(name='mean_hours')
    )


# Recomputed from user_dfs on every run: this cell no longer consumes and then
# overwrites its own input, so re-running it cannot fail on missing date columns,
# and the notebook-level `df` is not reused as a loop variable.
filtered_users = {
    name: mean_done_duration(user_df) for name, user_df in user_dfs.items()
}

In [106]:
# Spot-check one user's mean duration (hours) per importance level.
filtered_users['df_3']

Unnamed: 0,participant_userId,details.importance,mean_hours
0,3,1,182.4
1,3,2,115.2
2,3,3,177.6
3,3,4,112.0
4,3,5,144.0


In [None]:
# Convert each per-user mean-duration frame into a list of row dicts.
for name, fdf in filtered_users.items():
    json_temp = fdf.to_dict(orient='records')

    # Save to file (currently disabled): one JSON object per line
    # with open(f"data\\stat2-{name}.json", "w", encoding="utf-8") as f:
    #     for item in json_temp:  # json_temp is a List[Dict]
    #         json_line = json.dumps(item, ensure_ascii=False)
    #         f.write(json_line + "\n")
    #         # json.dump(json_temp, f, ensure_ascii=False, indent=2)

{'df_3':    participant_userId  details.importance  mean_hours
 0                   3                   1       182.4
 1                   3                   2       115.2
 2                   3                   3       177.6
 3                   3                   4       112.0
 4                   3                   5       144.0,
 'df_5':    participant_userId  details.importance  mean_hours
 0                   5                   1  158.857143
 1                   5                   2  132.000000
 2                   5                   3  120.000000
 3                   5                   4   81.000000
 4                   5                   5  136.000000,
 'df_1':    participant_userId  details.importance  mean_hours
 0                   1                   1  133.333333
 1                   1                   2  149.333333
 2                   1                   3  168.000000
 3                   1                   4   96.750000
 4                   1                 

In [113]:
# Confirm the container type, then display every per-user mean-duration frame.
print(type(filtered_users))
filtered_users

<class 'dict'>


{'df_3':    participant_userId  details.importance  mean_hours
 0                   3                   1       182.4
 1                   3                   2       115.2
 2                   3                   3       177.6
 3                   3                   4       112.0
 4                   3                   5       144.0,
 'df_5':    participant_userId  details.importance  mean_hours
 0                   5                   1  158.857143
 1                   5                   2  132.000000
 2                   5                   3  120.000000
 3                   5                   4   81.000000
 4                   5                   5  136.000000,
 'df_1':    participant_userId  details.importance  mean_hours
 0                   1                   1  133.333333
 1                   1                   2  149.333333
 2                   1                   3  168.000000
 3                   1                   4   96.750000
 4                   1                 