In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def create_sliding_windows(data, window_size, step_size, feature_cols, label_cols):
    X, y = [], []
    unique_users = data['userId'].unique()
    for user in unique_users:
        user_data = data[data['userId'] == user]
        unique_dates = user_data['date'].unique()
        for date in unique_dates:
            daily_data = user_data[user_data['date'] == date]
            for start in range(0, len(daily_data) - window_size + 1, step_size):
                end = start + window_size
                window = daily_data.iloc[start:end]
                X.append(window[feature_cols].values)
                y.append(window[label_cols].iloc[0].values)  # 라벨은 윈도우의 첫 번째 값 사용
    return np.array(X), np.array(y)

In [3]:
# 데이터 길이를 padding 하여 맞춘다. 맞추지 않으면 tensor에서 size_error 발생

def pad_to_max_length(arr, max_length):
    """
    배열을 최대 길이에 맞추어 0으로 패딩합니다.
    """
    if len(arr) < max_length:
        pad_width = [(0, max_length - len(arr))] + [(0, 0)] * (arr.ndim - 1)
        return np.pad(arr, pad_width, mode='constant', constant_values=0)
    else:
        return arr

In [4]:
# 모델 정의
class LSTMModel(nn.Module):
    def __init__(self, input_size_acc, input_size_hr, input_size_gps ,hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()


        self.lstm_acc = nn.LSTM(input_size_acc, hidden_size, num_layers, batch_first=True)
        self.lstm_hr = nn.LSTM(input_size_hr, hidden_size, num_layers, batch_first=True)
        self.lstm_gps = nn.LSTM(input_size_gps, hidden_size, num_layers, batch_first=True)

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        self.fc = nn.Linear(hidden_size * 3, num_classes)
    
    def forward(self, x_acc, x_hr, x_gps):
        h0_acc = torch.zeros(self.num_layers, x_acc.size(0), self.hidden_size).to(device)
        c0_acc = torch.zeros(self.num_layers, x_acc.size(0), self.hidden_size).to(device)
        out_acc, _ = self.lstm_acc(x_acc, (h0_acc, c0_acc))
        
        h0_hr = torch.zeros(self.num_layers, x_hr.size(0), self.hidden_size).to(device)
        c0_hr = torch.zeros(self.num_layers, x_hr.size(0), self.hidden_size).to(device)
        out_hr, _ = self.lstm_hr(x_hr, (h0_hr, c0_hr))

        h0_gps = torch.zeros(self.num_layers, x_gps.size(0), self.hidden_size).to(device)
        c0_gps = torch.zeros(self.num_layers, x_gps.size(0), self.hidden_size).to(device)
        out_gps, _ = self.lstm_gps(x_gps, (h0_gps, c0_gps))

        
        out_acc = out_acc[:, -1, :]
        out_hr = out_hr[:, -1, :]
        out_gps = out_gps[:, -1, :]
        
        combined = torch.cat((out_acc, out_hr, out_gps), dim=1)
        out = self.fc(combined)
        return out

In [5]:
# 모델 인스턴스 생성
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model = LSTMModel(3, 1, 2, 64, 2, 10).to(device)

print(device)

mps


In [6]:
model.load_state_dict(torch.load('./model_user01-06_dict_users.pth', map_location=device))

<All keys matched successfully>

## Validation Data 불러오기

In [7]:
users_path = "/Users/song/Desktop/AI Factory/HumanUnderStanding2024/val dataset"

mAcc_users = pd.read_parquet(users_path + "/ch2024_val__m_acc_part_1.parquet.gzip")

mAcc_users.head()

Unnamed: 0,subject_id,timestamp,x,y,z
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511
1,1,2023-08-20 00:00:00.043,0.947558,-3.522235,9.169296
2,1,2023-08-20 00:00:00.110,0.9667,-3.479164,9.164511
3,1,2023-08-20 00:00:00.131,0.947558,-3.522235,9.159725
4,1,2023-08-20 00:00:00.150,0.918844,-3.531806,9.159725


In [8]:
val_label = pd.read_csv(users_path + '/val_label.csv')

val_label.head()

Unnamed: 0,subject_id,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-21,1,1,1,0,0,1,0
2,1,2023-08-22,0,1,1,0,1,1,0
3,1,2023-08-23,0,1,1,0,0,1,0
4,1,2023-08-24,1,1,1,0,0,1,0


In [9]:
val_label_users = val_label[val_label['subject_id'] == 1]

val_label_users['subject_id'].unique()

array([1])

In [10]:
mGps_users = pd.read_parquet(users_path + '/ch2024_val__m_gps.parquet.gzip')

mGps_users.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454


In [11]:
mGps_user = mGps_users[mGps_users['subject_id'] == 1]

mGps_user.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454


In [12]:
mhr_users = pd.read_parquet(users_path + '/ch2024_val__w_heart_rate.parquet.gzip')

mhr_users = mhr_users[mGps_users['subject_id'] == 1]

mhr_users.head()


Unnamed: 0,subject_id,timestamp,heart_rate
0,1,2023-08-20 00:00:44.572,0
1,1,2023-08-20 00:01:44.752,0
2,1,2023-08-20 00:02:44.919,0
3,1,2023-08-20 00:03:45.075,0
4,1,2023-08-20 00:04:45.248,0


In [13]:
mAcc_users['date'] = mAcc_users['timestamp'].dt.date
mAcc_users['date'] = pd.to_datetime(mAcc_users['date'])

mGps_user['date'] = mGps_user['timestamp'].dt.date
mGps_user['date'] = pd.to_datetime(mGps_user['date'])

mhr_users['date'] = mhr_users['timestamp'].dt.date
mhr_users['date'] = pd.to_datetime(mhr_users['date'])


val_label_users['date'] = pd.to_datetime(val_label_users['date'])

mAcc_users.rename(columns={"subject_id" : "userId"}, inplace=True)
mGps_user.rename(columns={"subject_id" : "userId"}, inplace=True)
mhr_users.rename(columns={"subject_id" : "userId"}, inplace=True)


In [14]:
val_label_users.rename(columns={"subject_id" : "userId"}, inplace=True)

In [15]:
val_mAcc_data_label = pd.merge(mAcc_users, val_label_users, on=['userId','date'], how='inner')

val_mAcc_data_label.head()

Unnamed: 0,userId,timestamp,x,y,z,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-20 00:00:00.043,0.947558,-3.522235,9.169296,2023-08-20,1,1,1,0,0,0,0
2,1,2023-08-20 00:00:00.110,0.9667,-3.479164,9.164511,2023-08-20,1,1,1,0,0,0,0
3,1,2023-08-20 00:00:00.131,0.947558,-3.522235,9.159725,2023-08-20,1,1,1,0,0,0,0
4,1,2023-08-20 00:00:00.150,0.918844,-3.531806,9.159725,2023-08-20,1,1,1,0,0,0,0


In [16]:
mGps_user = pd.merge(mGps_user, val_label_users, on=['userId','date'], how='inner')

mGps_user.head()

Unnamed: 0,userId,timestamp,altitude,latitude,longitude,speed,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771,2023-08-20,1,1,1,0,0,0,0
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571,2023-08-20,1,1,1,0,0,0,0
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931,2023-08-20,1,1,1,0,0,0,0
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454,2023-08-20,1,1,1,0,0,0,0


In [17]:
mhr_users = pd.merge(mhr_users, val_label_users, on=['userId','date'], how='inner')

mhr_users.head()

Unnamed: 0,userId,timestamp,heart_rate,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20 00:00:44.572,0,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-20 00:01:44.752,0,2023-08-20,1,1,1,0,0,0,0
2,1,2023-08-20 00:02:44.919,0,2023-08-20,1,1,1,0,0,0,0
3,1,2023-08-20 00:03:45.075,0,2023-08-20,1,1,1,0,0,0,0
4,1,2023-08-20 00:04:45.248,0,2023-08-20,1,1,1,0,0,0,0


In [23]:
def sliding_window(data, window_size, step_size, feature_col ,label_cols):
    n_samples = len(data)
    X, y = [], []
    for start in range(0, n_samples - window_size + 1, step_size):
        end = start + window_size
        window = data.iloc[start:end]
        X.append(window[feature_col].values)
        y.append(window[label_cols].iloc[0].values)  # 라벨은 윈도우의 첫 번째 값 사용
    return np.array(X), np.array(y)

In [24]:
# 슬라이딩 윈도우 설정 (2.5초)
window_size_acc_hr = 150  # 2.5초 * 60Hz
step_size_acc_hr = 150    # 2.5초 * 60Hz
window_size_gps = 3     # 3 * 5 = 15초
step_size_gps = 3       # 3 * 5 = 15초

label_columns = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']

# 슬라이딩 윈도우 생성
X_acc, y = sliding_window(val_mAcc_data_label, window_size_acc_hr, step_size_acc_hr, ['x', 'y', 'z'] ,label_columns)

# 사용자 및 날짜별로 슬라이딩 윈도우 적용

In [25]:
X_hr, _ = create_sliding_windows(mhr_users, window_size_acc_hr, step_size_acc_hr, ['heart_rate'] ,label_columns)
X_gps, _ = create_sliding_windows(mGps_user, window_size_gps, step_size_gps, ['latitude', 'longitude'] ,label_columns)


In [31]:
date, _ = create_sliding_windows(val_mAcc_data_label, window_size_acc_hr, step_size_acc_hr, ['date'] ,label_columns)
userId, _ = create_sliding_windows(val_mAcc_data_label, window_size_acc_hr, step_size_acc_hr, ['userId']  ,label_columns)

In [35]:
max_length = max(len(X_acc), len(X_hr), len(X_gps))

# 데이터셋을 최대 길이에 맞추어 패딩
X_acc_padded = pad_to_max_length(X_acc, max_length)
X_hr_padded = pad_to_max_length(X_hr, max_length)
X_gps_padded = pad_to_max_length(X_gps, max_length)
date = pad_to_max_length(date, max_length)
userId = pad_to_max_length(userId, max_length)

# 텐서로 변환
X_acc_tensor = torch.tensor(X_acc_padded, dtype=torch.float32)
X_hr_tensor = torch.tensor(X_hr_padded, dtype=torch.float32)
X_gps_tensor = torch.tensor(X_gps_padded, dtype=torch.float32)

In [36]:
print(f"Shape of X_acc_tensor: {X_acc_tensor.shape}")
print(f"Shape of X_hr_tensor: {X_hr_tensor.shape}")
print(f"Shape of X_gps_tensor: {X_gps_tensor.shape}")
print(f"Shape of date: {date.shape}")
print(f"Shape of userId: {userId.shape}")

Shape of X_acc_tensor: torch.Size([1178541, 150, 3])
Shape of X_hr_tensor: torch.Size([1178541, 150, 1])
Shape of X_gps_tensor: torch.Size([1178541, 3, 2])
Shape of date: (1178541, 150, 1)
Shape of userId: (1178541, 150, 1)


In [37]:
# 데이터셋 및 데이터로더 준비
val_dataset = TensorDataset(X_acc_tensor, X_hr_tensor,X_gps_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [38]:
# 모델 평가 및 예측 값 저장
model.eval()

LSTMModel(
  (lstm_acc): LSTM(3, 64, num_layers=2, batch_first=True)
  (lstm_hr): LSTM(1, 64, num_layers=2, batch_first=True)
  (lstm_gps): LSTM(2, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=192, out_features=10, bias=True)
)

In [39]:
predictions = []
with torch.no_grad():
    for X_acc_batch, X_hr_batch, X_gps_batch in val_loader:
        X_acc_batch, X_hr_batch, X_gps_batch = X_acc_batch.to(device), X_hr_batch.to(device), X_gps_batch.to(device)
        output = model(X_acc_batch, X_hr_batch, X_gps_batch)
        predictions.append(output.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)

In [40]:
label_col = ['pmEmotion', 'pmStress', 'userId', 'lightsleepduration', 'wakeupduration', 'deepsleepduration', 'durationtosleep', 'remsleepduration', 'hr_max', 'hr_average']

# 예측 결과를 데이터프레임으로 변환
predictions_df = pd.DataFrame(predictions, columns=label_col)
# predictions_df['date'] = date_info

# # 날짜별로 그룹화하여 평균 계산
# grouped_predictions = predictions_df.groupby('date').mean()

# # 결과 출력
# print(grouped_predictions)

predictions_df.head()

Unnamed: 0,pmEmotion,pmStress,userId,lightsleepduration,wakeupduration,deepsleepduration,durationtosleep,remsleepduration,hr_max,hr_average
0,0.168576,0.017561,5.007957,0.3518,0.031546,-0.199462,-0.061714,0.329936,-0.843757,-1.148313
1,0.432812,-0.234909,4.366338,0.264252,-0.739102,0.551136,-0.506537,0.83512,-0.332779,-0.710235
2,0.384209,0.209817,4.300361,0.023013,-0.266451,0.196407,-0.184196,0.383186,-0.689983,-0.92479
3,0.416516,0.231133,4.024417,-0.135322,-0.850933,0.261417,-0.50508,0.107946,-0.447854,-0.607842
4,0.279345,0.099136,5.211491,0.367325,0.057752,0.088415,-0.001078,0.206475,-0.905855,-1.206617


In [41]:
date_data = date[:, 0]
userId_data = userId[:, 0]

date_df = pd.DataFrame(date_data, columns=['date'])

date_df.head()

Unnamed: 0,date
0,2023-08-20
1,2023-08-20
2,2023-08-20
3,2023-08-20
4,2023-08-20


In [42]:
userId_df = pd.DataFrame(userId_data, columns=['userId'])

userId_df.head()

Unnamed: 0,userId
0,1
1,1
2,1
3,1
4,1


In [43]:
predictions_df['date']= date_df
predictions_df['userId'] = userId_df

In [49]:
predictions_df = predictions_df[predictions_df['userId'] != 0]

In [50]:
mean_predicted = predictions_df.groupby(['userId', 'date']).mean()
mean_predicted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pmEmotion,pmStress,lightsleepduration,wakeupduration,deepsleepduration,durationtosleep,remsleepduration,hr_max,hr_average
userId,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2023-08-20,0.179739,0.031498,0.13903,0.134338,-0.010567,0.168541,0.16358,-0.511389,-0.822704
1,2023-08-21,0.178784,0.028327,0.13926,0.13982,-0.00889,0.175022,0.162733,-0.509395,-0.821653
1,2023-08-22,0.174627,0.024146,0.138767,0.143231,-0.007055,0.179338,0.161883,-0.510879,-0.824825
1,2023-08-23,0.174956,0.02774,0.142942,0.141956,-0.008673,0.176242,0.162466,-0.512901,-0.825132
1,2023-08-24,0.177404,0.0291,0.139613,0.142459,-0.008247,0.177352,0.161642,-0.511234,-0.824695


In [51]:
print(len(mean_predicted))

40


In [35]:
print(val_label_users.shape)

(40, 9)


In [55]:
import pickle
#이제 "mod"라는 이름에 파일을 불러왔기 때문에 mod로 원하는 작업을 수행하면 된다.
with open('rf_model', 'rb') as f:
    clf = pickle.load(f)

In [56]:
X_features = ['pmEmotion', 'pmStress', 'lightsleepduration', 'wakeupduration', 'deepsleepduration', 'durationtosleep', 'remsleepduration', 'hr_max', 'hr_average']

prediction_X = mean_predicted[X_features]

prediction_X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,pmEmotion,pmStress,lightsleepduration,wakeupduration,deepsleepduration,durationtosleep,remsleepduration,hr_max,hr_average
userId,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2023-08-20,0.179739,0.031498,0.13903,0.134338,-0.010567,0.168541,0.16358,-0.511389,-0.822704
1,2023-08-21,0.178784,0.028327,0.13926,0.13982,-0.00889,0.175022,0.162733,-0.509395,-0.821653
1,2023-08-22,0.174627,0.024146,0.138767,0.143231,-0.007055,0.179338,0.161883,-0.510879,-0.824825
1,2023-08-23,0.174956,0.02774,0.142942,0.141956,-0.008673,0.176242,0.162466,-0.512901,-0.825132
1,2023-08-24,0.177404,0.0291,0.139613,0.142459,-0.008247,0.177352,0.161642,-0.511234,-0.824695


In [57]:
y_pred = clf.predict(prediction_X)



In [60]:
y = val_label_users[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']]

y.head()

Unnamed: 0,Q1,Q2,Q3,S1,S2,S3,S4
0,1,1,1,0,0,0,0
1,1,1,1,0,0,1,0
2,0,1,1,0,1,1,0
3,0,1,1,0,0,1,0
4,1,1,1,0,0,1,0


In [61]:
accuracy = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred, average=None)

print(f'Random Forest Classifier Accuracy: {accuracy:.4f}')
print(f'Random Forest Classifier F1-Score: {f1}')

weights = np.array([1.5, 1.5, 1.0, 1.5, 1.5, 1.5, 1.5])

# 가중 평균 F1-Score 계산
weighted_f1_score = np.sum(f1 * weights)

print(f"F1-Score (Weight) : {weighted_f1_score:.4f}")

Random Forest Classifier Accuracy: 0.0000
Random Forest Classifier F1-Score: [0.73015873 0.         0.6440678  0.         0.57142857 0.90410959
 0.29787234]
F1-Score (Weight) : 4.3994


In [62]:
print(y_pred)

print(y)

[[1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]
 [1 0 1 0 1 1 1]]
    Q1  Q2  Q3  S1  S2  S3  S4
0    1   1   1   0   0   0   0
1    1   1   1   0   0   1   0
2    0   1   1   0   1   1   0
3    0   1   1   0   0   1   0
4    1   1   1   0   0   1   0
5    1   1   1   0   1   1   0
6    1   1   1   0   1   1   1
7    1   1   1   0   1   1   1
8    1   1   1   0   0   1   0
9    1   