In [3]:
from glob import glob
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import my_module

pd.set_option("display.max_columns", None)

## Dataの読み込み
- df_care: train data
- df_care_test: test data
- df_acc: train & test data

In [4]:
# care_dataの訓練データ
df_care = pd.DataFrame()
for path in tqdm(glob("../data/Care Record Data/*")):
    tmp = pd.read_csv(path)
    df_care = pd.concat([df_care, tmp])

# care_dataのテストデータ
df_care_test = pd.DataFrame()
for path in tqdm(glob("../TestData/**/*")):
    tmp = pd.read_csv(path)
    df_care_test = pd.concat([df_care_test, tmp])

# accデータ(訓練とテストデータ共有)
df_acc = pd.DataFrame()
for path in tqdm(glob("../data/Accelerometer Data/*")):
    tmp = pd.read_csv(path)
    df_acc = pd.concat([df_acc, tmp])

100%|██████████| 5/5 [00:00<00:00, 224.46it/s]
100%|██████████| 113/113 [00:00<00:00, 736.58it/s]
100%|██████████| 5/5 [00:31<00:00,  6.27s/it]


## 前処理
- 欠損値は落とす
- 重複データは落とす
- hour_lengthがマイナスのデータは落とす
- trainの3月を含むそれ以降のデータは削除してもよさそう(heatmapより偏りがひどいため)
- df_accはdatetime、df_careはstartカラムでsort

In [5]:
# str convert to datetime
df_care = my_module.convert_datetime(df_care, ["start", "finish"])
df_care_test = my_module.convert_datetime(df_care_test, ["start", "finish"])
df_acc = my_module.convert_datetime(df_acc, ["datetime"])

# add time_length columns
df_care = my_module.add_timeLength_timeLengthSeconds(df_care)
df_care_test = my_module.add_timeLength_timeLengthSeconds(df_care_test)

# drop missing and duplicated data
df_care.dropna(inplace=True)
df_care_test.dropna(inplace=True)
df_acc.drop_duplicates(inplace=True)

# finish - start <= 0 のデータは処理の関係上落とす
df_care = df_care[df_care["time_length_seconds"] > 0]
df_care_test = df_care_test[df_care_test["time_length_seconds"] > 0]

# trainデータの4月を含む後のデータを抽出
df_care = df_care[df_care["start"] >= pd.Timestamp("2018-04-01T00:00:00.000+09:00")]
df_acc = df_acc[df_acc["datetime"] >= pd.Timestamp("2018-04-01T00:00:00.000+09:00")]

# datetimeごとにsort
df_care.sort_values("start", inplace=True)
df_care_test.sort_values("start", inplace=True)
df_acc.sort_values("datetime", inplace=True)

# ラベルが起こった時刻(hour特徴量)の追加
df_care["hour"] = df_care["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)
df_care_test["hour"] = df_care_test["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)

# # extend time (data argument)
# extend_time = 20 #20minute
# df_care = my_module.extend_time(df_care, extend_time)

## k-meansでクラスタリングを行う
- 普通のクラスタリング
- frequencyのmin-max正規化したもののクラスタリング
- 各userの総時間で割ったもののクラスタリング

In [6]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.cluster import SpectralClustering

users_heatmap = my_module.create_frequency_heatmap(df_care)

users_heatmap_dataframe = pd.DataFrame(users_heatmap).T
users_heatmap_dataframe.sort_index(inplace=True)

users_heatmap_dataframe_scaling = preprocessing.minmax_scale(users_heatmap_dataframe, axis=1)

# default
kmeans = KMeans(n_clusters=3, max_iter=5)
cluster = kmeans.fit(users_heatmap_dataframe)

# min-max scaling
kmeans = KMeans(n_clusters=3, max_iter=5)
cluster_min_max_scaling = kmeans.fit(users_heatmap_dataframe_scaling)

# time scaling
kmeans = KMeans(n_clusters=3, max_iter=5)
users_total_time = df_care.groupby(["user_id"])["time_length_seconds"].sum()
users_heatmap_time_scaling = users_heatmap_dataframe / users_total_time.values.reshape(5, 1)
cluster_time_scaling = kmeans.fit(users_heatmap_time_scaling)

print(users_heatmap_dataframe.index)
print(f"default: {cluster.labels_}")
print(f"min-max scale: {cluster_min_max_scaling.labels_}")
print(f"time scale: {cluster_time_scaling.labels_}")

Int64Index([8, 13, 14, 15, 25], dtype='int64')
default: [2 1 0 1 1]
min-max scale: [2 1 1 1 0]
time scale: [1 0 2 2 0]


## クラスタリングを元にheatmapの平均化
- time scaleを軸に考えると
    - クラス1: 8
    - クラス2: 13, 25
    - クラス3: 14, 15

In [11]:
cluster_ave_heatmap ={}

users_heatmap = {index: data.reshape(28, 24) for index, data in users_heatmap.items()}

# class1
cluster_ave_heatmap[1] = users_heatmap[8]

# class2
cluster_ave_heatmap[2] = (users_heatmap[13] + users_heatmap[25]) / 2

# class3
cluster_ave_heatmap[3] = (users_heatmap[14] + users_heatmap[15]) / 2

## 各userで予測するactivity_labelのフィルタを設定
- user8 :   1,2,3,4,5,7,8,9,10,11,12,13,14,16,18,19,20,21,22,23,24
- user13:   1,2,4,7,8,9,10,11,12,13,14,15,16,19,22,25,27
- user14:   1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,19
- user15:   1,2,3,4,5,9,10,11,12,13,14,16,18,19,22
- user25:   1,2,3,4,6,7,8,10,13,14,16,17,18,22,23,24,25,26

In [12]:
users_filters = my_module.create_users_filters()

## Validationデータの予測

### 正解ラベルの作成

In [13]:
def create_y_label(df_care: pd.DataFrame):
    feat = df_care.groupby(["activity_type_id", "year-month-date-hour"]).count().\
                    reset_index()[['activity_type_id', 'year-month-date-hour','id']].\
                    rename(columns={"id": "counts"})
    # 頻度を出現のバイナリに変換
    feat["counts"] = feat["counts"].mask(feat.counts > 0, 1)
    df_care_date = df_care.copy()

    activity_type_ids = sorted(list(df_care['activity_type_id'].unique()))
    for activity_id in activity_type_ids:
        df_care_date = pd.merge(df_care_date, feat[feat['activity_type_id'] == activity_id][['year-month-date-hour', 'counts']],
                    on='year-month-date-hour', how="left").rename(columns={"counts": activity_id})
    df_care_date.loc[:, activity_type_ids] = df_care_date.loc[:, activity_type_ids].fillna(0)

    # 日付の重複を削除・ソート・カラムを任意の順番にソート
    df_care_y = df_care_date[~df_care_date["year-month-date-hour"].duplicated()].loc[:, ["user_id", "year-month-date-hour", *activity_type_ids]].sort_values("year-month-date-hour")

    # activity_labelの欠損値埋め
    for activity_id in np.arange(1, 29):
        if activity_id not in df_care_y.columns:
            df_care_y.loc[:, activity_id] = 0.0
    df_care_y = df_care_y.reindex(columns=["user_id", "year-month-date-hour", *np.arange(1, 29)])
    return df_care_y
df_care_y = create_y_label(df_care)
df_care_test_y = create_y_label(df_care_test)
display(df_care_y)

Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,8,2018-04-01-8.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,15,2018-04-02-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,15,2018-04-02-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,13,2018-04-02-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,13,2018-04-02-11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8325,8,2018-05-23-19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8330,8,2018-05-23-20.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8337,8,2018-05-23-21.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8348,8,2018-05-23-22.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 対応ヒートマップから予測

In [14]:
class_cluster_map = {
    8 : 1,
    13: 2,
    14: 3,
    15: 3,
    25: 2,
}

predict_list = [] 

for _, rows in df_care_test_y.iterrows():
    user_id, date = rows["user_id"], rows["year-month-date-hour"]
    hour = int(float(date.rsplit("-")[-1]))
    cluster_index = class_cluster_map[user_id]
    user_heatmap = cluster_ave_heatmap[cluster_index]
    predict_list.append([user_id, date, *user_heatmap[:, hour]])

df_predict = pd.DataFrame(predict_list, columns=["user_id", "year-month-date-hour", *np.arange(1, 29)])
display(df_care_test_y)
display(df_predict)

Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,14,2018-05-18-0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14,2018-05-18-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,14,2018-05-18-2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,14,2018-05-18-3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,14,2018-05-18-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,15,2018-07-02-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6378,15,2018-07-02-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6390,15,2018-07-02-5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6400,15,2018-07-02-6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,14,2018-05-18-0.0,0.0,0.5,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,122.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14,2018-05-18-1.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,2018-05-18-2.0,0.0,0.0,0.0,16.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14,2018-05-18-3.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14,2018-05-18-4.0,4.5,0.0,0.0,9.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.5,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,15,2018-07-02-3.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,15,2018-07-02-4.0,4.5,0.0,0.0,9.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,17.5,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538,15,2018-07-02-5.0,26.0,1.0,0.0,26.5,0.0,3.0,0.0,0.0,8.0,0.0,0.0,127.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,15,2018-07-02-6.0,33.0,0.0,1.0,17.0,0.0,0.0,0.0,0.0,17.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 評価

In [15]:
from sklearn.metrics import classification_report, accuracy_score
y_true = df_care_test_y.iloc[:, 2:]
y_pred = df_predict.iloc[:, 2:]
y_pred[y_pred > 0] = 1
target_names = [f"label_{i}" for i in range(1, 29)]
display(y_true)
display(y_pred)
print('accuracy: {0} %'.format(100*accuracy_score(y_true, y_pred)))
report = classification_report(y_pred, y_true, target_names=target_names, output_dict=True)
report_df = pd.DataFrame(report).T
display(report_df)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6390,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6400,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


accuracy: 3.6968576709796674 %


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
label_1,0.65625,0.156134,0.252252,269.0
label_2,0.92,0.359375,0.516854,320.0
label_3,0.774194,0.165517,0.272727,145.0
label_4,0.978495,0.513158,0.673243,532.0
label_5,0.25,0.044444,0.075472,45.0
label_6,0.883721,0.308943,0.457831,123.0
label_7,0.6,0.115385,0.193548,26.0
label_8,0.6,0.042254,0.078947,71.0
label_9,0.76,0.4,0.524138,95.0
label_10,0.962264,0.291429,0.447368,175.0


In [82]:
import dataframe_image as dfi
report_df.dfi.export("report.png")

[0719/090719.497615:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0719/090719.498079:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0719/090719.740149:INFO:headless_shell.cc(660)] Written to file /tmp/tmpe03zgf3r/temp.png.
[0719/090719.894197:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0719/090719.894280:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0719/090720.090077:INFO:headless_shell.cc(660)] Written to file /tmp/tmpi8qm5447/temp.png.


### accデータのラベリング
- 後々使うかもしれないので各ユーザーごとに使用できるaccデータを結びつける

In [6]:
seg_list, seg_label_list = my_module.create_acc_dataframe_label(df_care_test, df_acc)

In [7]:
for k, v in seg_list.items():
    print(f"user{k}: {len(v)}")
for k, v in seg_label_list.items():
    # print(k, v)
    print(f"user{k}: {set(v)}")

user14: 464
user13: 146
user25: 8
user15: 0
user8: 54
user14: {1, 2, 3, 4, 9, 11, 12, 13, 16}
user13: {1, 2, 4, 9, 10, 11, 12, 14, 16}
user25: {24, 18, 10, 6}
user15: set()
user8: {1, 2, 4, 7, 8, 9, 10, 12, 13, 19, 20}


### 各時間に行われたactivityをtypeごとに集計して特徴量として追加する

In [8]:
# 各時間に行われたactivityの集計
def makeFeatures(df):
    feat = df.groupby(['activity_type_id', 'year-month-date-hour'], as_index=False).count()[['activity_type_id', 'year-month-date-hour','id']].rename(columns={'id':'count'})
    X = df_care.copy()
    print(feat)
    acts = list(df['activity_type_id'].value_counts().index)
    for act in acts:
        X = pd.merge(X, feat[feat['activity_type_id'] == act][['year-month-date-hour', 'count']], on='year-month-date-hour', how="left").rename(columns={'count':f"label_{act}"})
    return X
df_care_feature = makeFeatures(df_care)
#TODO: ここのエラーを解消
# df_care_feature = df_care_feature.fillna(0.0)

      activity_type_id year-month-date-hour  count
0                    1       2018-03-01-9.0     12
1                    1      2018-03-06-17.0     16
2                    1      2018-03-08-16.0      1
3                    1      2018-03-11-19.0      1
4                    1      2018-03-11-20.0      1
...                ...                  ...    ...
2033                27      2018-05-13-15.0      1
2034                27      2018-05-23-19.0      1
2035                28      2018-02-07-17.0      1
2036                28      2018-03-14-19.0      1
2037                28      2018-05-12-11.0      1

[2038 rows x 3 columns]


### 時間とactivityごとに集計したトータル

In [9]:
df_care_feature["hour"] = df_care_feature["year-month-date-hour"].str.split("-", expand=True).iloc[:, -1].astype(float)
corr = df_care_feature.groupby(["activity_type_id", "hour"]).count().iloc[:, 0].reset_index().sort_values(["activity_type_id", "hour"]).pivot("activity_type_id", "hour").fillna(0).astype(int)

In [120]:
# ここに特徴量を追加する
tmp = df_care_feature.groupby(["hour", "activity_type_id"]).count().iloc[:, 0].reset_index()
tmp[tmp["hour"] == 0.0]

Unnamed: 0,hour,activity_type_id,id
0,0.0,2,1
1,0.0,4,150
2,0.0,11,17
3,0.0,12,504
4,0.0,17,1
5,0.0,19,4
6,0.0,23,3


In [121]:
df_tmp = df_care_feature.copy()
activities = df_tmp["activity_type_id"].unique()
for activity in activities:
    df_tmp[f"{activity}_total"] = 0

In [122]:
df_tmp

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour,time_length,time_length_seconds,hour,label_12,label_2,label_4,label_10,label_1,label_11,label_19,label_6,label_9,label_7,label_5,label_16,label_3,label_18,label_17,label_20,label_24,label_13,label_8,label_26,label_22,label_14,label_23,label_21,label_25,label_27,label_15,label_28,18_total,10_total,17_total,2_total,14_total,11_total,4_total,24_total,16_total,12_total,26_total,9_total,28_total,5_total,19_total,20_total,3_total,8_total,1_total,27_total,23_total,15_total,13_total,7_total,22_total,21_total,6_total,25_total
0,179,14,18,申し送り・ミーティング,3,14624,2018-02-02 09:47:16+09:00,2018-02-02 09:47:21+09:00,2018-02-02-9.0,0 days 00:00:05,5.0,9.0,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,187,14,10,日中利用者対応,35,12354,2018-02-02 11:12:03+09:00,2018-02-02 11:09:40+09:00,2018-02-02-11.0,-1 days +23:57:37,-143.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,189,14,17,手書き記録,31,11803,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,189,14,17,手書き記録,35,11805,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,189,14,17,手書き記録,36,11806,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,,6.0,,1.0,,,,,,,,,,,6.0,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10578,22876,8,12,夜間利用者対応,41,65768,2018-05-23 23:58:28+09:00,2018-05-23 23:58:40+09:00,2018-05-23-23.0,0 days 00:00:12,12.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10579,22877,8,12,夜間利用者対応,42,65769,2018-05-23 23:58:48+09:00,2018-05-23 23:59:09+09:00,2018-05-23-23.0,0 days 00:00:21,21.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10580,22878,8,12,夜間利用者対応,43,65770,2018-05-23 23:59:17+09:00,2018-05-23 23:59:27+09:00,2018-05-23-23.0,0 days 00:00:10,10.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10581,22879,8,12,夜間利用者対応,44,65771,2018-05-23 23:59:35+09:00,2018-05-23 23:59:47+09:00,2018-05-23-23.0,0 days 00:00:12,12.0,23.0,20.0,1.0,4.0,,1.0,,2.0,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df_care_feature.head()

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour,time_length,time_length_seconds,hour,12,2,4,10,1,11,19,6,9,7,5,16,3,18,17,20,24,13,8,26,22,14,23,21,25,27,15,28
0,179,14,18,申し送り・ミーティング,3,14624,2018-02-02 09:47:16+09:00,2018-02-02 09:47:21+09:00,2018-02-02-9.0,0 days 00:00:05,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,187,14,10,日中利用者対応,35,12354,2018-02-02 11:12:03+09:00,2018-02-02 11:09:40+09:00,2018-02-02-11.0,-1 days +23:57:37,-143.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,189,14,17,手書き記録,31,11803,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,189,14,17,手書き記録,35,11805,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,189,14,17,手書き記録,36,11806,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df_care_feature["activity_type_id"].unique().astype(int)

array([18, 10, 17,  2, 14, 11,  4, 24, 16, 12, 26,  9, 28,  5, 19, 20,  3,
        8,  1, 27, 23, 15, 13,  7, 22, 21,  6, 25])