In [176]:
from glob import glob
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import my_module

pd.set_option("display.max_columns", None)

## Dataの読み込み
- df_care: train data
- df_care_test: test data
- df_acc: train & test data

In [177]:
df_acc, df_care_train, df_care_test = my_module.read_all_data()

## 前処理
- 欠損値は落とす
- 重複データは落とす
- hour_lengthがマイナスのデータは落とす
- trainの3月を含むそれ以降のデータは削除してもよさそう(heatmapより偏りがひどいため)
- df_accはdatetime、df_careはstartカラムでsort

In [178]:
# str convert to datetime
df_care_train = my_module.convert_datetime(df_care_train, ["start", "finish"])
df_care_test = my_module.convert_datetime(df_care_test, ["start", "finish"])
df_acc = my_module.convert_datetime(df_acc, ["datetime"])

# add time_length columns
df_care_train = my_module.add_timeLength_timeLengthSeconds(df_care_train)
df_care_test = my_module.add_timeLength_timeLengthSeconds(df_care_test)

# drop missing and duplicated data
df_care_train.dropna(inplace=True)
df_care_test.dropna(inplace=True)
df_acc.drop_duplicates(inplace=True)

# finish - start <= 0 のデータは処理の関係上落とす
df_care_train = df_care_train[df_care_train["time_length_seconds"] > 0]
df_care_test = df_care_test[df_care_test["time_length_seconds"] > 0]

# trainデータの4月を含む後のデータを抽出
df_care_train = df_care_train[df_care_train["start"] >= pd.Timestamp("2018-04-01T00:00:00.000+09:00")]
df_acc = df_acc[df_acc["datetime"] >= pd.Timestamp("2018-04-01T00:00:00.000+09:00")]

# datetimeごとにsort
df_care_train.sort_values("start", inplace=True)
df_care_test.sort_values("start", inplace=True)
df_acc.sort_values("datetime", inplace=True)

# ラベルが起こった時刻(hour特徴量)の追加
df_care_train["hour"] = df_care_train["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)
df_care_test["hour"] = df_care_test["year-month-date-hour"].str.split(pat="-", expand=True).iloc[:, -1].astype(float)

# # extend time (data argument)
# extend_time = 20 #20minute
# df_care_train = my_module.extend_time(df_care_train, extend_time)

## k-meansでクラスタリングを行う
- 普通のクラスタリング
- frequencyのmin-max正規化したもののクラスタリング
- 各userの総時間で割ったもののクラスタリング

In [179]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.cluster import SpectralClustering

users_heatmap = my_module.create_frequency_heatmap(df_care_train)

users_heatmap_dataframe = pd.DataFrame(users_heatmap).T
users_heatmap_dataframe.sort_index(inplace=True)

# default
kmeans = KMeans(n_clusters=3, max_iter=5)
cluster = kmeans.fit(users_heatmap_dataframe)

# min-max scaling
kmeans = KMeans(n_clusters=3, max_iter=5)
users_heatmap_dataframe_scaling = preprocessing.minmax_scale(users_heatmap_dataframe, axis=1)
cluster_min_max_scaling = kmeans.fit(users_heatmap_dataframe_scaling)

# time scaling
kmeans = KMeans(n_clusters=3, max_iter=5)
users_total_time = df_care_train.groupby(["user_id"])["time_length_seconds"].sum()
users_heatmap_time_scaling = users_heatmap_dataframe / users_total_time.values.reshape(5, 1)
cluster_time_scaling = kmeans.fit(users_heatmap_time_scaling)

print(users_heatmap_dataframe.index)
print(f"default: {cluster.labels_}")
print(f"min-max scale: {cluster_min_max_scaling.labels_}")
print(f"time scale: {cluster_time_scaling.labels_}")

Int64Index([8, 13, 14, 15, 25], dtype='int64')
default: [1 0 2 0 0]
min-max scale: [1 0 0 0 2]
time scale: [1 0 2 2 0]


## アイデア1: クラスタリングを元にheatmapの平均化
- time scaleを軸に考えると
    - クラス1: 8
    - クラス2: 13, 25
    - クラス3: 14, 15

In [214]:
df_care_all = pd.concat([df_care_train, df_care_test])

users_heatmap_train = my_module.create_frequency_heatmap(df_care_train)
users_heatmap_all = my_module.create_frequency_heatmap(df_care_all)


cluster_ave_heatmap ={}

users_heatmap = {index: data.reshape(28, 24) for index, data in users_heatmap_all.items()}

# class1
cluster_ave_heatmap[0] = users_heatmap[8]

# class2
cluster_ave_heatmap[1] = (users_heatmap[13] + users_heatmap[25]) / 2

# class3
cluster_ave_heatmap[2] = (users_heatmap[14] + users_heatmap[15]) / 2

class_cluster_map = {
    8 : 0,
    13: 1,
    14: 2,
    15: 2,
    25: 1,
}

# cluster_ave_heatmap[0] = users_heatmap[8]

# cluster_ave_heatmap[1] = (users_heatmap[13] + users_heatmap[14] + users_heatmap[15]) / 3

# cluster_ave_heatmap[2] = users_heatmap[25]

# class_cluster_map = {
#     8 : 0,
#     13: 1,
#     14: 1,
#     15: 1,
#     25: 2,
# }

# cluster_ave_heatmap[0] = (users_heatmap[13] + users_heatmap[15] + users_heatmap[25]) / 3

# cluster_ave_heatmap[1] = users_heatmap[8]

# cluster_ave_heatmap[2] = users_heatmap[14]

# class_cluster_map = {
#     8 : 1,
#     13: 0,
#     14: 2,
#     15: 0,
#     25: 0,
# }

## アイデア2: 祝日平日でクラスタリング

In [215]:
def create_frequency_heatmap(df: pd.DataFrame, show_flag: bool = False) -> dict:
    """
        各userの時間-activity_type_idのheatmapを作成する
        show_flag: heatmapの可視化を行うかのflag
    """
    mpl.style.use("seaborn-darkgrid")
    year_month_day = df["year-month-date-hour"].map(lambda x: x.rsplit("-", 1)[0])
    df["weekday"] = pd.to_datetime(year_month_day, format="%Y-%m-%d").dt.day_name()
    df["weekday"] = df["weekday"].map(lambda x: 1 if x in ["Sunday", "Saturday"] else 0)
    USER_ID: list = df["user_id"].unique()
    users_heatmap: dict = {}

    for user_id in USER_ID:
        heatmap_matrix = np.zeros((28, 24), dtype=int)
        corr = df[df["user_id"] == user_id].groupby(["weekday", "activity_type_id", "hour"]).\
                                                    count().iloc[:, 0].\
                                                    reset_index()[["activity_type_id", "hour", "id"]].\
                                                    sort_values(["activity_type_id", "hour"])
        for id, hour, count in zip(corr["activity_type_id"], corr["hour"], corr["id"]):
            id = int(id) - 1
            hour = int(hour)
            heatmap_matrix[id, hour] = int(count)
        users_heatmap[user_id] = heatmap_matrix.reshape(-1)

        # TODO: yticksをactivity_type_idに対応させる
        # 各userのheatmapの表示
        if show_flag:
            fig, ax = plt.subplots(1, 1, figsize=(15, 15))
            ax = sns.heatmap(heatmap_matrix, annot=True, fmt="d", ax=ax)
            ax.set_title(user_id)
    return users_heatmap


In [216]:
year_month_day = df_care_train["year-month-date-hour"].map(lambda x: x.rsplit("-", 1)[0])
df_care_train["weekday"] = pd.to_datetime(year_month_day, format="%Y-%m-%d").dt.day_name()
df_care_train["weekday"] = df_care_train["weekday"].map(lambda x: 1 if x in ["Sunday", "Saturday"] else 0)

df_care_train_summary = df_care_train[df_care_train["user_id"] == 8].groupby(["weekday", "activity_type_id", "hour"]).count().reset_index()
corr = df_care_train_summary[df_care_train_summary["weekday"] == 0]
heatmap_matrix = np.zeros((28, 24), dtype=int)
for id, hour, count in zip(corr["activity_type_id"], corr["hour"], corr["id"]):
    id = int(id) - 1
    hour = int(hour)
    heatmap_matrix[id, hour] = int(count)

## 各userで予測するactivity_labelのフィルタを設定
- user8 :   1,2,3,4,5,7,8,9,10,11,12,13,14,16,18,19,20,21,22,23,24
- user13:   1,2,4,7,8,9,10,11,12,13,14,15,16,19,22,25,27
- user14:   1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,19
- user15:   1,2,3,4,5,9,10,11,12,13,14,16,18,19,22
- user25:   1,2,3,4,6,7,8,10,13,14,16,17,18,22,23,24,25,26

In [217]:
users_filters = my_module.create_users_filters()

## Validationデータの予測

### 正解ラベルの作成

In [218]:
df_care_train_y = my_module.create_y_label(df_care_train)
df_care_test_y = my_module.create_y_label(df_care_test)
df_care_all_y = my_module.create_y_label(df_care_all)
display(df_care_all_y)

Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,8,2018-04-01-8.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,15,2018-04-02-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,15,2018-04-02-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,13,2018-04-02-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
140,13,2018-04-02-11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14741,15,2018-07-02-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14758,15,2018-07-02-4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14770,15,2018-07-02-5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14780,15,2018-07-02-6.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 対応ヒートマップから予測

In [219]:
def create_predict_df(df_care_y, class_cluster_map):
    predict_list = [] 

    for _, rows in df_care_y.iterrows():
        user_id, date = rows["user_id"], rows["year-month-date-hour"]
        hour = int(float(date.rsplit("-")[-1]))
        cluster_index = class_cluster_map[user_id]
        user_heatmap = cluster_ave_heatmap[cluster_index]
        predict_list.append([user_id, date, *user_heatmap[:, hour]])
    df_predict = pd.DataFrame(predict_list, columns=["user_id", "year-month-date-hour", *np.arange(1, 29)])
    return df_predict

df_care_train_predict = create_predict_df(df_care_train_y, class_cluster_map)
df_care_test_predict = create_predict_df(df_care_test_y, class_cluster_map)
df_care_all_predict = create_predict_df(df_care_all_y, class_cluster_map)

# display(df_care_train_predict)
# display(df_care_test_predict)
display(df_care_all_predict)

Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,8,2018-04-01-8.0,13.0,60.0,0.0,6.0,0.0,0.0,7.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15,2018-04-02-0.0,0.0,0.5,0.0,55.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15,2018-04-02-1.0,0.0,0.0,0.0,24.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,272.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13,2018-04-02-10.0,1.0,0.0,0.0,10.5,0.0,16.5,5.0,4.5,0.0,80.0,0.0,0.0,2.0,1.5,0.0,1.5,1.5,1.0,0.0,0.0,0.0,2.5,0.0,7.5,1.5,0.0,0.0,0.0
4,13,2018-04-02-11.0,1.0,0.0,0.5,24.5,0.0,41.5,0.0,0.0,0.0,3.5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.5,0.0,0.0,0.5,0.0,1.0,1.0,0.0,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1226,15,2018-07-02-3.0,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,232.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1227,15,2018-07-02-4.0,6.5,0.0,0.0,21.5,0.0,0.5,0.0,0.0,1.5,0.0,0.0,47.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1228,15,2018-07-02-5.0,51.0,1.5,0.0,44.0,0.0,3.5,0.0,0.0,13.5,0.0,0.0,323.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1229,15,2018-07-02-6.0,44.0,0.0,1.5,30.0,0.0,6.0,0.0,0.0,29.0,0.0,0.0,4.5,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 評価

In [220]:
fix_predict_labels = []
for index, data in df_care_all_predict.iterrows():
    user_id = data["user_id"]
    hour = int(float(data["year-month-date-hour"].rsplit("-")[-1]))
    filter = users_filters[user_id][hour]
    predict_data_row = data.iloc[2:]
    predict_data_row[filter == 0] = 0.0
    fix_predict_labels.append([*predict_data_row])
df_care_all_predict = pd.DataFrame(data=fix_predict_labels)
    

In [223]:
from sklearn.metrics import classification_report, accuracy_score
import warnings

y_true = df_care_all_y.iloc[:, 2:]
y_true[y_true>0] = 1
y_true[y_true<=0] = 0
y_true = y_true.astype(int)

THRESHOLD = 0.5
y_pred = df_care_all_predict.copy()
y_pred[y_pred > THRESHOLD] = 1
y_pred[y_pred <= THRESHOLD] = 0
y_pred = y_pred.astype(int)


print('accuracy: {0} %'.format(100*accuracy_score(y_true, y_pred)))

warnings.simplefilter("ignore")
target_names = [f"label_{i}" for i in range(1, 29)]
report = classification_report(y_pred, y_true, target_names=target_names, output_dict=True)
report_df = pd.DataFrame(report).T
display(report_df)

accuracy: 7.798537774167344 %


Unnamed: 0,precision,recall,f1-score,support
label_1,0.903614,0.211566,0.342857,709.0
label_2,0.971875,0.409211,0.575926,760.0
label_3,0.864407,0.154545,0.262211,330.0
label_4,0.99361,0.509419,0.673525,1221.0
label_5,0.647059,0.169231,0.268293,65.0
label_6,0.870748,0.465455,0.606635,275.0
label_7,0.875,0.125,0.21875,112.0
label_8,0.846154,0.066265,0.122905,166.0
label_9,0.941748,0.368821,0.530055,263.0
label_10,0.977612,0.280514,0.43594,467.0


## submissionDataの作成

In [224]:
paths = glob("../TestSubmission/**")
data = pd.DataFrame()
for path in paths:
    df = pd.read_csv(path)
    user_id = int(path.split("/")[-1].split(".")[0].strip("test"))
    df["user_id"] = user_id
    data = pd.concat([data, df.loc[:, ["user_id", "year-month-date-hour"]]])
submission_label_df = pd.DataFrame(data)

In [225]:
submission_label_df.head()

Unnamed: 0,user_id,year-month-date-hour
0,14,2018-05-18-06
1,14,2018-05-19-23
2,14,2018-05-21-07
3,14,2018-05-22-23
4,14,2018-05-24-06


In [226]:
df_submission= create_predict_df(submission_label_df, class_cluster_map)
df_submission.head()

Unnamed: 0,user_id,year-month-date-hour,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,14,2018-05-18-06,44.0,0.0,1.5,30.0,0.0,6.0,0.0,0.0,29.0,0.0,0.0,4.5,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14,2018-05-19-23,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14,2018-05-21-07,1.0,174.0,0.0,6.5,0.0,2.5,0.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,7.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,14,2018-05-22-23,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14,2018-05-24-06,44.0,0.0,1.5,30.0,0.0,6.0,0.0,0.0,29.0,0.0,0.0,4.5,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
def user_filtering(x):
    user_id = x["user_id"]
    hour = int(x["year-month-date-hour"].split("-")[-1])
    user_filter = users_filters[user_id][hour]
    x[2:][user_filter==1]
df_submission.sort_values(["user_id", "year-month-date-hour"], inplace=True)

USERS_IDS = [8, 13, 14, 15, 25]
for user_id in USERS_IDS:
    tmp = df_submission[df_submission["user_id"] == user_id]
    predict_data = []
    for index, data in tmp.iterrows():
        user_id = data["user_id"]
        hour = int(data["year-month-date-hour"].split("-")[-1])
        user_filter = users_filters[user_id][hour]
        user_predict = data[2:][user_filter==1]
        user_predict = user_predict.apply(lambda x: 1 if x > 0 else 0)
        predict_data.append([user_id, data["year-month-date-hour"], *user_predict.values])
    final_user_submission = pd.DataFrame(data=predict_data).iloc[:, 1:].set_axis(["year-month-date-hour", *user_predict.index.to_list()], axis="columns")
    final_user_submission.to_csv(f"../predict/user_{user_id}.csv", index=None)
    display(final_user_submission)

Unnamed: 0,year-month-date-hour,1,2,3,4,5,7,8,9,10,11,12,13,14,16,18,19,20,21,22,23,24
0,2018-05-25-05,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0
1,2018-05-26-16,1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,1,1,0,0,1,0
2,2018-05-27-21,0,1,1,1,0,0,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1
3,2018-05-28-23,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
4,2018-05-30-08,1,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,0,1,0,0
5,2018-05-31-23,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
6,2018-06-02-08,1,1,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,0,1,0,0
7,2018-06-03-13,1,1,0,1,1,0,1,1,1,0,0,1,0,0,1,1,1,0,1,1,1
8,2018-06-04-09,1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,1,0,1,0,0
9,2018-06-05-23,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0


Unnamed: 0,year-month-date-hour,1,2,4,7,8,9,10,11,12,13,14,15,16,19,22,25,27
0,2018-05-20-23,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,2018-05-22-08,1,1,1,1,1,0,1,0,0,0,0,0,1,1,0,0,0
2,2018-05-23-20,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0
3,2018-05-24-23,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,2018-05-28-17,1,1,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0
5,2018-05-30-23,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,2018-06-01-05,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0
7,2018-06-05-07,1,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0
8,2018-06-06-15,1,1,1,0,0,0,1,0,1,1,0,0,0,1,1,1,1
9,2018-06-09-05,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0


Unnamed: 0,year-month-date-hour,1,2,3,4,5,6,7,9,10,11,12,13,14,15,16,19
0,2018-05-18-06,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0
1,2018-05-19-23,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
2,2018-05-21-07,1,1,0,1,0,1,0,1,1,0,0,0,0,0,1,0
3,2018-05-22-23,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
4,2018-05-24-06,1,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0
5,2018-05-25-23,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
6,2018-05-27-08,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,1
7,2018-05-28-23,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
8,2018-05-30-05,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0
9,2018-05-31-23,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0


Unnamed: 0,year-month-date-hour,1,2,3,4,5,9,10,11,12,13,14,16,18,19,22
0,2018-05-22-08,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0
1,2018-05-23-23,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2,2018-05-25-05,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0
3,2018-05-26-23,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4,2018-05-28-08,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0
5,2018-05-29-23,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
6,2018-05-31-08,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0
7,2018-06-01-23,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
8,2018-06-03-08,0,1,1,1,1,1,1,0,0,1,1,1,1,1,0
9,2018-06-04-22,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,year-month-date-hour,1,2,3,4,6,7,8,10,13,14,16,17,18,22,23,24,25,26
0,2018-05-21-11,1,0,1,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
1,2018-05-24-15,1,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
2,2018-05-25-15,1,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
3,2018-05-26-12,0,1,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,1
4,2018-05-30-15,1,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
5,2018-06-02-12,0,1,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,1
6,2018-06-03-15,1,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0
7,2018-06-05-12,0,1,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,1
8,2018-06-10-14,1,1,0,1,1,0,1,1,1,1,1,1,0,1,1,1,0,0
9,2018-06-11-09,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,1,0,0


In [14]:
import dataframe_image as dfi
report_df.dfi.export("report.png")

[0727/095714.099918:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0727/095714.099992:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0727/095714.258451:INFO:headless_shell.cc(660)] Written to file /tmp/tmpbctu3vw2/temp.png.
[0727/095714.370808:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0727/095714.370906:ERROR:bus.cc(398)] Failed to connect to the bus: Failed to connect to socket /var/run/dbus/system_bus_socket: No such file or directory
[0727/095714.550571:INFO:headless_shell.cc(660)] Written to file /tmp/tmpmvelq0pt/temp.png.


### accデータのラベリング
- 後々使うかもしれないので各ユーザーごとに使用できるaccデータを結びつける

In [None]:
seg_list, seg_label_list = my_module.create_acc_dataframe_label(df_care_test, df_acc)

In [None]:
for k, v in seg_list.items():
    print(f"user{k}: {len(v)}")
for k, v in seg_label_list.items():
    # print(k, v)
    print(f"user{k}: {set(v)}")

user14: 464
user13: 146
user25: 8
user15: 0
user8: 54
user14: {1, 2, 3, 4, 9, 11, 12, 13, 16}
user13: {1, 2, 4, 9, 10, 11, 12, 14, 16}
user25: {24, 18, 10, 6}
user15: set()
user8: {1, 2, 4, 7, 8, 9, 10, 12, 13, 19, 20}


### 各時間に行われたactivityをtypeごとに集計して特徴量として追加する

In [None]:
# 各時間に行われたactivityの集計
def makeFeatures(df):
    feat = df.groupby(['activity_type_id', 'year-month-date-hour'], as_index=False).count()[['activity_type_id', 'year-month-date-hour','id']].rename(columns={'id':'count'})
    X = df_care_train.copy()
    print(feat)
    acts = list(df['activity_type_id'].value_counts().index)
    for act in acts:
        X = pd.merge(X, feat[feat['activity_type_id'] == act][['year-month-date-hour', 'count']], on='year-month-date-hour', how="left").rename(columns={'count':f"label_{act}"})
    return X
df_care_feature = makeFeatures(df_care_train)
#TODO: ここのエラーを解消
# df_care_feature = df_care_feature.fillna(0.0)

      activity_type_id year-month-date-hour  count
0                    1       2018-03-01-9.0     12
1                    1      2018-03-06-17.0     16
2                    1      2018-03-08-16.0      1
3                    1      2018-03-11-19.0      1
4                    1      2018-03-11-20.0      1
...                ...                  ...    ...
2033                27      2018-05-13-15.0      1
2034                27      2018-05-23-19.0      1
2035                28      2018-02-07-17.0      1
2036                28      2018-03-14-19.0      1
2037                28      2018-05-12-11.0      1

[2038 rows x 3 columns]


### 時間とactivityごとに集計したトータル

In [None]:
df_care_feature["hour"] = df_care_feature["year-month-date-hour"].str.split("-", expand=True).iloc[:, -1].astype(float)
corr = df_care_feature.groupby(["activity_type_id", "hour"]).count().iloc[:, 0].reset_index().sort_values(["activity_type_id", "hour"]).pivot("activity_type_id", "hour").fillna(0).astype(int)

In [None]:
# ここに特徴量を追加する
tmp = df_care_feature.groupby(["hour", "activity_type_id"]).count().iloc[:, 0].reset_index()
tmp[tmp["hour"] == 0.0]

Unnamed: 0,hour,activity_type_id,id
0,0.0,2,1
1,0.0,4,150
2,0.0,11,17
3,0.0,12,504
4,0.0,17,1
5,0.0,19,4
6,0.0,23,3


In [None]:
df_tmp = df_care_feature.copy()
activities = df_tmp["activity_type_id"].unique()
for activity in activities:
    df_tmp[f"{activity}_total"] = 0

In [None]:
df_care_feature.head()

Unnamed: 0,id,user_id,activity_type_id,activity_type,target_id,activity2user_id,start,finish,year-month-date-hour,time_length,time_length_seconds,hour,12,2,4,10,1,11,19,6,9,7,5,16,3,18,17,20,24,13,8,26,22,14,23,21,25,27,15,28
0,179,14,18,申し送り・ミーティング,3,14624,2018-02-02 09:47:16+09:00,2018-02-02 09:47:21+09:00,2018-02-02-9.0,0 days 00:00:05,5.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,187,14,10,日中利用者対応,35,12354,2018-02-02 11:12:03+09:00,2018-02-02 11:09:40+09:00,2018-02-02-11.0,-1 days +23:57:37,-143.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,189,14,17,手書き記録,31,11803,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,189,14,17,手書き記録,35,11805,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,189,14,17,手書き記録,36,11806,2018-02-02 11:27:59+09:00,2018-02-02 11:28:00+09:00,2018-02-02-11.0,0 days 00:00:01,1.0,11.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
