In [1]:
import json
import pandas as pd
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

In [2]:
LOG_DATA_PATH = "data\\final_user-actions_dummy.json"

# 중첩 구조 평탄화해서 읽기
with open(LOG_DATA_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)
    # data = [json.loads(line.strip()) for line in f if line.strip()]  # .jsonl일 때 읽어오는 방법
    
df = pd.json_normalize(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   event                 100 non-null    object
 1   userId                100 non-null    object
 2   username              100 non-null    object
 3   timestamp             100 non-null    object
 4   workspaceId           100 non-null    int64 
 5   details.participants  100 non-null    object
 6   details.state         100 non-null    object
 7   details.name          100 non-null    object
 8   details.importance    100 non-null    int64 
 9   details.startDate     47 non-null     object
 10  details.endDate       47 non-null     object
dtypes: int64(2), object(9)
memory usage: 8.7+ KB


In [3]:
# 유저별 df 분리
user_dfs = {}

for i in df['userId'].unique():
    user_dfs[f'df_user{i}'] = df[df['userId'] == i]

In [4]:
user_dfs['df_user1']  # test

Unnamed: 0,event,userId,username,timestamp,workspaceId,details.participants,details.state,details.name,details.importance,details.startDate,details.endDate
2,DONE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-10T06:09:44.072540Z,1,"[{'userId': 1, 'username': 'testuser1'}, {'use...",DONE,결제 게이트웨이 연동,2,2025-06-07T06:09:44.072540,2025-06-10T06:09:44.072540
5,CREATE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-02T10:05:01.072595Z,1,"[{'userId': 2, 'username': 'testuser2'}]",BEFORE,푸시 알림,5,,
9,CREATE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-05-29T09:13:11.072673Z,1,"[{'userId': 2, 'username': 'testuser2'}, {'use...",BEFORE,주문 취소 처리,2,,
10,CREATE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-12T03:36:19.072688Z,1,"[{'userId': 4, 'username': 'testuser4'}, {'use...",BEFORE,주문 확인서 발송,3,,
11,DONE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-12T06:09:44.072701Z,1,"[{'userId': 3, 'username': 'testuser3'}, {'use...",DONE,결제 연동,5,2025-06-05T06:09:44.072701,2025-06-12T06:09:44.072701
12,DONE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-12T06:09:44.072720Z,1,"[{'userId': 2, 'username': 'testuser2'}, {'use...",DONE,주문 상태 업데이트,2,2025-06-06T06:09:44.072720,2025-06-12T06:09:44.072720
14,DONE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-15T06:09:44.072750Z,1,"[{'userId': 4, 'username': 'testuser4'}, {'use...",DONE,댓글 수정/삭제,2,2025-05-31T06:09:44.072750,2025-06-15T06:09:44.072750
18,CREATE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-05-22T03:19:31.072853Z,1,"[{'userId': 5, 'username': 'testuser5'}, {'use...",BEFORE,게시판 설정,4,,
23,CREATE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-12T14:30:00.072943Z,1,"[{'userId': 5, 'username': 'testuser5'}, {'use...",BEFORE,프로필 이미지 업로드,5,,
24,DONE_PROJECT_PROGRESS_ACTION,1,testuser1,2025-06-16T06:09:44.072956Z,1,"[{'userId': 3, 'username': 'testuser3'}, {'use...",DONE,재고 실사,2,2025-06-12T06:09:44.072956,2025-06-16T06:09:44.072956


In [5]:
# id 리스트 생성
id_list = df['userId'].unique().astype(int)
id_list

array([3, 4, 1, 2, 5])

In [6]:
group_list = {}

for user_df, i in zip(user_dfs.values(), id_list):
    group_list[f'grouped{i}'] = user_df.groupby(['userId', 'details.state', 'details.importance']).size().reset_index(name='count')

In [7]:
for i, (name, user_df) in enumerate(user_dfs.items(), start=1):
    group_list[f'grouped{i}'] = (user_df.groupby(['userId', 'details.state', 'details.importance']).size().reset_index(name='count'))

In [8]:
group_list['grouped1']

Unnamed: 0,userId,details.state,details.importance,count
0,3,BEFORE,2,1
1,3,BEFORE,3,2
2,3,BEFORE,4,1
3,3,BEFORE,5,2
4,3,DONE,2,1
5,3,DONE,3,1
6,3,DONE,4,1
7,3,DONE,5,2
8,3,IN_PROGRESS,1,3


In [9]:
for name, gr in group_list.items():
    json_temp = gr.to_dict(orient='records')

    with open(f"data\\stat1-{name}.json", "w", encoding="utf-8") as f:
        json.dump(json_temp, f, ensure_ascii=False, indent=2)

In [10]:
### stat 2
filtered_users = {}

for name, df in user_dfs.items():
    filtered = df[df['details.state'] == 'DONE'][
        ['userId', 'details.state', 'details.importance', 'details.startDate', 'details.endDate']
    ]
    filtered_users[name] = filtered

In [11]:
filtered_users['df_user1']

Unnamed: 0,userId,details.state,details.importance,details.startDate,details.endDate
2,1,DONE,2,2025-06-07T06:09:44.072540,2025-06-10T06:09:44.072540
11,1,DONE,5,2025-06-05T06:09:44.072701,2025-06-12T06:09:44.072701
12,1,DONE,2,2025-06-06T06:09:44.072720,2025-06-12T06:09:44.072720
14,1,DONE,2,2025-05-31T06:09:44.072750,2025-06-15T06:09:44.072750
24,1,DONE,2,2025-06-12T06:09:44.072956,2025-06-16T06:09:44.072956
28,1,DONE,1,2025-05-28T06:09:44.073019,2025-06-12T06:09:44.073019
39,1,DONE,2,2025-06-06T06:09:44.073183,2025-06-12T06:09:44.073183
41,1,DONE,1,2025-06-11T06:09:44.073227,2025-06-15T06:09:44.073227
49,1,DONE,3,2025-06-07T06:09:44.073331,2025-06-10T06:09:44.073331
67,1,DONE,4,2025-06-07T06:09:44.073612,2025-06-11T06:09:44.073612


In [12]:
for name, df in filtered_users.items():
    # 날짜형 변환
    df['details.startDate'] = pd.to_datetime(df['details.startDate'], utc=True)
    df['details.endDate'] = pd.to_datetime(df['details.endDate'], utc=True)

    # 소요 시간 계산 (시간 단위)
    df['duration_hours'] = (df['details.endDate'] - df['details.startDate']).dt.total_seconds() / 3600

    # 컬럽 drop
    df = df[['userId', 'details.importance', 'duration_hours']]

    # 평균 계산
    df = df.groupby(['userId', 'details.importance'])['duration_hours'].mean().reset_index(name='mean_hours')

    # 딕셔너리 업데이트
    filtered_users[name] = df

In [13]:
filtered_users['df_user1']

Unnamed: 0,userId,details.importance,mean_hours
0,1,1,228.0
1,1,2,147.0
2,1,3,216.0
3,1,4,81.0
4,1,5,192.0


In [14]:
for name, fdf in filtered_users.items():
    json_temp = fdf.to_dict(orient='records')

    # 파일로 저장
    with open(f"data\\stat2-{name}.json", "w", encoding="utf-8") as f:
        for item in json_temp:  # data는 List[Dict]
            json_line = json.dumps(item, ensure_ascii=False)
            f.write(json_line + "\n")
            # json.dump(json_temp, f, ensure_ascii=False, indent=2)

In [19]:
for name, fdf in filtered_users.items():
    json_temp = fdf.to_dict(orient='records')
str(json_temp)

"[{'userId': '5', 'details.importance': 3, 'mean_hours': 168.0}, {'userId': '5', 'details.importance': 5, 'mean_hours': 96.0}]"