In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt

import my_module

pd.set_option("display.max_columns", None)

In [17]:
# care_dataの訓練データ
df_care = pd.DataFrame()
for path in tqdm(glob("../data/Care Record Data/*")):
    tmp = pd.read_csv(path)
    df_care = pd.concat([df_care, tmp])

# care_dataのテストデータ
df_care_test = pd.DataFrame()
for path in tqdm(glob("../TestData/**/*")):
    tmp = pd.read_csv(path)
    df_care_test = pd.concat([df_care_test, tmp])

# accデータ(訓練とテストデータ共有)
df_acc = pd.DataFrame()
for path in tqdm(glob("../data/Accelerometer Data/*")):
    tmp = pd.read_csv(path)
    df_acc = pd.concat([df_acc, tmp])




100%|██████████| 5/5 [00:00<00:00, 34.31it/s]
100%|██████████| 113/113 [00:00<00:00, 733.68it/s]
100%|██████████| 5/5 [00:29<00:00,  5.94s/it]


### 前処理
- 欠損値は落とす
- 重複データは落とす
- hour_lengthがマイナスのデータは落とす
    - start, finishはそれぞれ20分拡張する(論文の手法) 
- df_accはdatetime、df_careはstartカラムでsort

In [None]:
# str convert to datetime
df_care = my_module.convert_datetime(df_care, ["start", "finish"])
df_care_test = my_module.convert_datetime(df_care_test, ["start", "finish"])
df_acc = my_module.convert_datetime(df_acc, ["datetime"])

# add time_length columns
df_care = my_module.add_timeLength_timeLengthSeconds(df_care)
df_care_test = my_module.add_timeLength_timeLengthSeconds(df_care_test)

# drop missing and duplicated data
df_care.dropna(inplace=True)
df_care_test.dropna(inplace=True)
df_acc.drop_duplicates(inplace=True)

# finish - start <= 0 のデータは処理の関係上落とす
df_care = df_care[df_care["time_length_seconds"] > 0]
df_care_test = df_care_test[df_care_test["time_length_seconds"] > 0]

# datetimeごとにsort
df_care.sort_values("start", inplace=True)
df_care_test.sort_values("start", inplace=True)
df_acc.sort_values("datetime", inplace=True)


df_care["hour"] = df_care["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)
df_care_test["hour"] = df_care_test["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)

In [14]:
# extend time
import datetime
extend_time = 20 #20minute
df_care["start_extend"] = df_care["start"] - datetime.timedelta(minutes=extend_time)
df_care["finish_extend"] = df_care["finish"] - datetime.timedelta(minutes=extend_time)

### accデータのラベリング
- 後々使うかもしれないので各ユーザーごとに使用できるaccデータを結びつける

In [15]:
USERID = df_care["user_id"].unique()

In [20]:
seg_label_list = {index:[] for index in USERID} # segment's (1 sample's) label list (label mean "activity_type_id", "user_id" etc.).
seg_list = {index:[] for index in USERID} # segment's (1 sample's) accelerometer data list.
for userid in USERID:
    df_care_tmp = df_care[df_care["user_id"] == userid]
    df_acc_tmp = df_acc[df_acc["subject_id"] == userid]
    for index, row in df_care_tmp.iterrows():
        # started_at = df_care_tmp.iloc[i, 6]
        # finished_at = df_care_tmp.iloc[i, 7]
        started_at = row["start"]
        finished_at = row["finish"]
        seg = df_acc_tmp[(df_acc_tmp["datetime"] >=started_at) & (df_acc_tmp["datetime"] <= finished_at)]
        # seg_label = df_care_tmp.loc[i, "activity_type_id"]
        seg_label = row["activity_type_id"]
        if (len(seg)!=0):
            seg_list[userid].append(seg)
            seg_label_list[userid].append(seg_label)


In [None]:
# 使用するできるaccデータの確認
# x = list(seg_list.keys())
# height = list(map(len, seg_list.values()))
# plt.bar(x, height=height, tick_label=x, align="center")
for k, v in seg_list.items():
    print(f"user{k}: {len(v)}")
for k, v in seg_label_list.items():
    # print(k, v)
    print(f"user{k}: {set(v)}")

user14: 0
user25: 0
user15: 0
user13: 0
user8: 0
user14: set()
user25: set()
user15: set()
user13: set()
user8: set()


In [19]:
df_care.head()

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour
0,4173,15,26,休憩,15,25397,2018-03-31 21:29:28+09:00,2018-03-31 22:02:45+09:00,2018-03-31-21.0
1,4410,15,26,休憩,15,26067,2018-04-03 21:49:23+09:00,2018-04-03 22:13:44+09:00,2018-04-03-21.0
2,6280,15,26,休憩,15,31067,2018-04-15 21:53:08+09:00,2018-04-15 22:14:46+09:00,2018-04-15-21.0
3,7928,15,26,休憩,15,34782,2018-04-21 21:34:30+09:00,2018-04-21 22:02:20+09:00,2018-04-21-21.0
4,12797,15,26,休憩,15,45514,2018-05-06 21:49:48+09:00,2018-05-06 22:08:50+09:00,2018-05-06-21.0


### 各時間に行われたactivityをtypeごとに集計して特徴量として追加する

In [8]:
# 各時間に行われたactivityの集計
def makeFeatures(df):
    feat = df.groupby(['activity_type_id', 'year-month-date-hour'], as_index=False).count()[['activity_type_id', 'year-month-date-hour','id']].rename(columns={'id':'count'})
    X = df_care.copy()
    print(feat)
    acts = list(df['activity_type_id'].value_counts().index)
    for act in acts:
        X = pd.merge(X, feat[feat['activity_type_id'] == act][['year-month-date-hour', 'count']], on='year-month-date-hour', how="left").rename(columns={'count':f"label_{act}"})
    return X
df_care_feature = makeFeatures(df_care)
#TODO: ここのエラーを解消
# df_care_feature = df_care_feature.fillna(0.0)

      activity_type_id year-month-date-hour  count
0                    1       2018-03-01-9.0     12
1                    1      2018-03-06-17.0     16
2                    1      2018-03-08-16.0      1
3                    1      2018-03-11-19.0      1
4                    1      2018-03-11-20.0      1
...                ...                  ...    ...
2033                27      2018-05-13-15.0      1
2034                27      2018-05-23-19.0      1
2035                28      2018-02-07-17.0      1
2036                28      2018-03-14-19.0      1
2037                28      2018-05-12-11.0      1

[2038 rows x 3 columns]


### 時間とactivityごとに集計したトータル

In [9]:
df_care_feature["hour"] = df_care_feature["year-month-date-hour"].str.split("-", expand=True).iloc[:, -1].astype(float)
corr = df_care_feature.groupby(["activity_type_id", "hour"]).count().iloc[:, 0].reset_index().sort_values(["activity_type_id", "hour"]).pivot("activity_type_id", "hour").fillna(0).astype(int)

In [120]:
# ここに特徴量を追加する
tmp = df_care_feature.groupby(["hour", "activity_type_id"]).count().iloc[:, 0].reset_index()
tmp[tmp["hour"] == 0.0]

Unnamed: 0,hour,activity_type_id,id
0,0.0,2,1
1,0.0,4,150
2,0.0,11,17
3,0.0,12,504
4,0.0,17,1
5,0.0,19,4
6,0.0,23,3


In [121]:
df_tmp = df_care_feature.copy()
activities = df_tmp["activity_type_id"].unique()
for activity in activities:
    df_tmp[f"{activity}_total"] = 0

In [122]:
df_tmp

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour,time_length,time_length_seconds,hour,label_12,label_2,label_4,label_10,label_1,label_11,label_19,label_6,label_9,label_7,label_5,label_16,label_3,label_18,label_17,label_20,label_24,label_13,label_8,label_26,label_22,label_14,label_23,label_21,label_25,label_27,label_15,label_28,18_total,10_total,17_total,2_total,14_total,11_total,4_total,24_total,16_total,12_total,26_total,9_total,28_total,5_total,19_total,20_total,3_total,8_total,1_total,27_total,23_total,15_total,13_total,7_total,22_total,21_total,6_total,25_total
0,179,14,18,申し送り・ミーティング,3,14624,2018-02-02 09:47:16+09:00,2018-02-02 09:47:21+09:00,2018-02-02-9.0,0 days 00:00:05,5.0,9.0,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,187,14,10,日中利用者対応,35,12354,2018-02-02 11:12:03+09:00,2018-02-02 11:09:40+09:00,2018-02-02-11.0,-1 days +23:57:37,-143.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,189,14,17,手書き記録,31,11803,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,189,14,17,手書き記録,35,11805,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,189,14,17,手書き記録,36,11806,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10578,22876,8,12,夜間利用者対応,41,65768,2018-05-23 23:58:28+09:00,2018-05-23 23:58:40+09:00,2018-05-23-23.0,0 days 00:00:12,12.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10579,22877,8,12,夜間利用者対応,42,65769,2018-05-23 23:58:48+09:00,2018-05-23 23:59:09+09:00,2018-05-23-23.0,0 days 00:00:21,21.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10580,22878,8,12,夜間利用者対応,43,65770,2018-05-23 23:59:17+09:00,2018-05-23 23:59:27+09:00,2018-05-23-23.0,0 days 00:00:10,10.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10581,22879,8,12,夜間利用者対応,44,65771,2018-05-23 23:59:35+09:00,2018-05-23 23:59:47+09:00,2018-05-23-23.0,0 days 00:00:12,12.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df_care_feature.head()

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour,time_length,time_length_seconds,hour,12,2,4,10,1,11,19,6,9,7,5,16,3,18,17,20,24,13,8,26,22,14,23,21,25,27,15,28
0,179,14,18,申し送り・ミーティング,3,14624,2018-02-02 09:47:16+09:00,2018-02-02 09:47:21+09:00,2018-02-02-9.0,0 days 00:00:05,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,187,14,10,日中利用者対応,35,12354,2018-02-02 11:12:03+09:00,2018-02-02 11:09:40+09:00,2018-02-02-11.0,-1 days +23:57:37,-143.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,189,14,17,手書き記録,31,11803,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,189,14,17,手書き記録,35,11805,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,189,14,17,手書き記録,36,11806,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_care_feature["activity_type_id"].unique().astype(int)

array([18, 10, 17,  2, 14, 11,  4, 24, 16, 12, 26,  9, 28,  5, 19, 20,  3,
        8,  1, 27, 23, 15, 13,  7, 22, 21,  6, 25])