In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; every CSV path and the model checkpoint
# path used in this notebook live under that mount point (Colab-only step).
drive.mount('/content/drive')

Mounted at /content/drive


#### Data 불러오기

In [None]:
# Load the two pre-merged sensor tables from the data mart on Drive.
df = pd.read_csv("/content/drive/MyDrive/train/data_mart/data_merge.csv")
df_add = pd.read_csv("/content/drive/MyDrive/train/data_mart/data_merge_add.csv")

# Preview the additional table.
df_add.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr
0,2020-08-31 00:00:00,-0.000904,4.448405,8.366823,user07,37.276019,127.115323,44.890444,
1,2020-08-31 00:00:00,-0.000904,4.448405,8.366823,user07,37.276078,127.11555,37.783222,
2,2020-08-31 00:00:00,0.025839,2.306513,8.952953,user07,37.276019,127.115323,44.890444,
3,2020-08-31 00:00:00,0.025839,2.306513,8.952953,user07,37.276078,127.11555,37.783222,
4,2020-08-31 00:02:00,0.090913,0.010326,9.815679,user07,37.276159,127.115936,42.482,


In [None]:
# Replace every missing value in df_add with 0 (hr is empty in the previewed rows).
# NOTE(review): 0 is then treated as a real heart-rate reading by the scaler — confirm this is intended.
df_add = df_add.fillna(0)

In [None]:
# Sanity check: number of remaining NaNs per column in df_add.
df_add.isna().sum()

timestamp    0
x            0
y            0
z            0
userId       0
lat          0
lon          0
accuracy     0
hr           0
dtype: int64

In [None]:
# Stack the additional rows underneath the main table.
# ignore_index=True rebuilds a unique 0..n-1 index; without it both inputs keep
# their own 0-based labels, so the combined frame carries duplicate index labels
# and any later label-based alignment or selection becomes ambiguous.
# NOTE: re-running this cell appends df_add again; reload df first if you need to re-run.
df = pd.concat(objs=[df, df_add], axis=0, ignore_index=True)
df.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity
0,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544928,127.05441,34.94725,0.0,3.0
1,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544849,127.054274,61.097727,0.0,3.0
2,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544954,127.054375,39.202667,0.0,3.0
3,2020-08-30 03:59:00,-0.062778,0.186066,9.749598,user01,37.544928,127.05441,34.94725,0.0,3.0
4,2020-08-30 03:59:00,-0.062778,0.186066,9.749598,user01,37.544849,127.054274,61.097727,0.0,3.0


In [None]:
# NaN count per column after stacking; only `activity` has gaps at this point.
df.isna().sum()

timestamp          0
x                  0
y                  0
z                  0
userId             0
lat                0
lon                0
accuracy           0
hr                 0
activity     2028416
dtype: int64

In [None]:
# Columns to standardize (zero mean, unit variance)
# NOTE(review): 'activity' is scaled together with the sensor channels although it
# still contains NaNs at this point (df.isna() reports 2028416 missing values);
# confirm it is a continuous quantity rather than a category code.
columns_to_standardize = ['x', 'y', 'z', 'lat', 'lon', 'hr', 'activity']

# Create the StandardScaler
scaler = StandardScaler()

# Fit on the full table and overwrite the raw columns with their scaled values.
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

df.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity
0,2020-08-30 03:59:00,-0.149766,-0.153538,0.729051,user01,0.185912,0.168258,34.94725,-2.441332,0.137235
1,2020-08-30 03:59:00,-0.149766,-0.153538,0.729051,user01,0.185899,0.168252,61.097727,-2.441332,0.137235
2,2020-08-30 03:59:00,-0.149766,-0.153538,0.729051,user01,0.185916,0.168257,39.202667,-2.441332,0.137235
3,2020-08-30 03:59:00,-0.145683,-0.161485,0.729498,user01,0.185912,0.168258,34.94725,-2.441332,0.137235
4,2020-08-30 03:59:00,-0.145683,-0.161485,0.729498,user01,0.185899,0.168252,61.097727,-2.441332,0.137235


In [None]:
# Parse the timestamp strings, then derive the day-level join key.
# .dt.normalize() truncates each timestamp to midnight and stays in datetime64,
# which gives the same column as going through .dt.date and pd.to_datetime again
# but without materializing a Python date object for every row.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.normalize()

#### Sleep, Survey Feature들 합치기

In [None]:
# Locations of the per-user sleep and survey tables on Drive.
sleep_data = "/content/drive/MyDrive/train/dataset/user_sleep_2020.csv"
survey_data = "/content/drive/MyDrive/train/dataset/user_survey_2020.csv"

# Read both tables as-is; cleaning happens in the next cell.
sleep_df = pd.read_csv(sleep_data)
survey_df = pd.read_csv(survey_data)

In [None]:
# Collapse the AM and PM survey rows of each (userId, date) into one daily row.
# NOTE(review): 'first'/'last' rely on the AM row preceding the PM row inside each
# group in the CSV — confirm, or sort explicitly before grouping.
survey_df = survey_df.groupby(['userId', 'date']).agg({
    'startInput': 'first',  # start of input: value from the first (AM) row
    'endInput': 'last',  # end of input: value from the last (PM) row
    'sleep': 'max',  # sleep: take the maximum (usually recorded in the AM row)
    'sleepProblem': 'max',  # sleep problem: take the maximum as well
    'dream': 'max',
    'amCondition': 'max',
    'amEmotion': 'max',
    'pmEmotion': 'max',
    'pmStress': 'max',
    'pmFatigue': 'max',
    'caffeine': 'last',  # caffeine type: value from the last (PM) row
    'cAmount(ml)': 'sum',  # caffeine amount: daily total
    'alcohol': 'last',
    'aAmount(ml)': 'sum'
}).reset_index()

# Fill missing answers with 0 (for caffeine/alcohol, 0 stands for "nothing reported").
survey_df = survey_df.fillna({
    'sleep': 0,
    'sleepProblem': 0,
    'dream': 0,
    'amCondition': 0,
    'amEmotion': 0,
    'pmEmotion': 0,
    'pmStress': 0,
    'pmFatigue': 0,
    'cAmount(ml)': 0,
    'aAmount(ml)': 0,
    'caffeine': 0,
    'alcohol': 0
})

# Binary encoding: 1 = some intake reported, 0 = none.
# Both spellings of the "nothing" answer ('not specific' / 'not specified') are
# accepted in both maps, because Series.map turns any unlisted value into NaN
# without warning. The 0 -> 0 and 1 -> 1 entries keep already-encoded values
# intact, so the cell can be re-run safely.
caffeine_map = {
    'not specific': 0, 'not specified': 0,
    'tea': 1, 'coke': 1, 'caffeinated drink': 1, 'coffee': 1,
    0: 0, 1: 1
}
alcohol_map = {
    'not specific': 0, 'not specified': 0,
    'soju&beer': 1, 'soju': 1,
    'beer': 1, 'beer&rice wine': 1, 'wine': 1, 'rice wine': 1,
    'cognac': 1, 'wine&beer': 1, 'kaoliang': 1,
    0: 0, 1: 1
}

# Apply the mappings
survey_df['caffeine'] = survey_df['caffeine'].map(caffeine_map)
survey_df['alcohol'] = survey_df['alcohol'].map(alcohol_map)


# Drop the timezone column; errors='ignore' keeps the cell re-runnable once the
# column is already gone.
sleep_df = sleep_df.drop(columns='timezone', errors='ignore')

sleep_df.head()

Unnamed: 0,userId,date,startDt,endDt,lastUpdate,wakeupduration,lightsleepduration,deepsleepduration,wakeupcount,durationtosleep,...,hr_average,hr_min,hr_max,rr_average,rr_min,rr_max,breathing_disturbances_intensity,snoring,snoringepisodecount,sleep_score
0,user24,2020-08-30,1598708760,1598740860,1598748252,3840,13800,9000,3,1740,...,72,63,82,15,10,20,10,840,1,86
1,user24,2020-08-31,1598800140,1598825940,1598833333,4740,9540,6360,1,3180,...,69,60,82,15,11,20,8,2700,3,52
2,user24,2020-09-01,1598885940,1598908800,1598916190,1920,8760,4740,1,1440,...,70,58,94,15,12,23,8,0,0,61
3,user24,2020-09-02,1598979420,1598999580,1599006969,2760,7740,7440,1,1380,...,68,60,80,15,12,19,-1,240,1,40
4,user24,2020-09-03,1599060780,1599085980,1599093370,3900,11640,7620,1,1680,...,71,61,83,15,11,25,7,300,1,55


In [None]:
# Join the daily survey answers with the nightly sleep metrics; the inner join
# keeps only (date, userId) pairs that appear in both tables.
data_all = pd.merge(survey_df, sleep_df, on = ["date", "userId"], how='inner')

In [None]:
# Keep the two evening survey scores, the sleep metrics, and the join keys.
# .copy() makes this an independent frame, so the column assignment performed on
# it afterwards writes to this frame and not to a slice of data_all
# (chained assignment / SettingWithCopyWarning).
data_rf_features = data_all[[
    'pmEmotion', 'pmStress', 'userId',
    'lightsleepduration', 'wakeupduration', 'deepsleepduration',
    'durationtosleep', 'remsleepduration',
    'hr_max', 'hr_average', 'sleep_score', 'date'
]].copy()

In [None]:
# Make the day key a real datetime so it can be merged with the sensor table's date column.
data_rf_features['date'] = pd.to_datetime(data_rf_features['date'])


# Initialize the scaler (this rebinds `scaler`, replacing the sensor-column scaler)
scaler = StandardScaler()

# Columns to scale: everything except the two key columns (date, userId)
features_to_scale = data_rf_features.drop(columns=['date', 'userId']).columns

# Split the frame into the numeric part and the key columns
numeric_data = data_rf_features[features_to_scale]
date_data = data_rf_features['date']
user_id_data = data_rf_features['userId']

# Apply the scaling
scaled_features = scaler.fit_transform(numeric_data)

# Wrap the scaled array in a DataFrame that reuses the source index. The key
# columns below are attached by index alignment, so a fresh RangeIndex here would
# silently mis-pair (or NaN-fill) rows whenever the source index is not 0..n-1.
data_features = pd.DataFrame(
    scaled_features,
    columns=features_to_scale,
    index=data_rf_features.index,
)

# Re-attach the key columns
data_features['date'] = date_data
data_features['userId'] = user_id_data

# Restore the column order: keys first, then the scaled features
data_features = data_features[['date', 'userId'] + list(features_to_scale)]

data_features.head()

Unnamed: 0,date,userId,pmEmotion,pmStress,lightsleepduration,wakeupduration,deepsleepduration,durationtosleep,remsleepduration,hr_max,hr_average,sleep_score
0,2020-08-31,user01,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948
1,2020-09-01,user01,0.443799,-0.016037,-0.061009,-0.821648,0.529081,-0.468321,0.83184,0.663944,1.546426,1.178819
2,2020-09-02,user01,0.443799,-0.016037,-0.695324,-0.205989,0.592676,-0.141831,0.872182,0.931565,1.546426,0.044088
3,2020-09-03,user01,-1.76943,-0.855799,-0.118674,-0.649263,1.695002,-0.250661,0.448593,0.396322,1.399367,1.178819
4,2020-09-04,user01,0.443799,-0.855799,-0.268603,-0.772395,1.016648,-0.395767,-0.035509,1.065376,1.399367,0.317988


##### Feature들과 data 합치기

In [None]:
# Attach the daily features to every sensor row of the same (date, userId).
# The inner join drops sensor rows whose day has no survey/sleep record.
df_features = pd.merge(df, data_features, on=['date', 'userId'], how='inner')

df_features.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity,...,pmEmotion,pmStress,lightsleepduration,wakeupduration,deepsleepduration,durationtosleep,remsleepduration,hr_max,hr_average,sleep_score
0,2020-08-31 00:10:00,-0.102148,-0.226125,0.720885,user01,0.175609,0.163487,12.38,-2.441332,-3.720634,...,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948
1,2020-08-31 00:11:00,-0.101582,-0.226406,0.721456,user01,0.175611,0.163488,13.13925,-2.441332,0.137235,...,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948
2,2020-08-31 00:12:00,-0.102154,-0.226862,0.722177,user01,0.175611,0.163488,12.992,-2.441332,0.137235,...,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948
3,2020-08-31 00:12:00,-0.102154,-0.226862,0.722177,user01,0.17561,0.163488,12.7495,-2.441332,0.137235,...,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948
4,2020-08-31 00:12:00,-0.101931,-0.226364,0.720646,user01,0.175611,0.163488,12.992,-2.441332,0.137235,...,0.443799,-0.016037,0.331114,0.040274,0.274698,0.039552,0.952865,0.797754,0.958187,1.217948


#### train_label과 train_user 데이터 합성

In [None]:
# Load train_label.csv and drop its "Unnamed: 0" column in a single chained step
train_label_path = "/content/drive/MyDrive/train/dataset/train_label.csv"

train_labels = pd.read_csv(train_label_path).drop(columns="Unnamed: 0")

In [None]:
# Bring the label table's keys in line with the sensor table:
# `date` becomes datetime64 and `subject_id` is renamed to `userId`.
train_labels = (
    train_labels
    .assign(date=pd.to_datetime(train_labels['date']))
    .rename(columns={"subject_id": 'userId'})
)

In [None]:
# Attach the Q1-Q3 / S1-S4 label columns to every sensor row; the inner join
# keeps only (userId, date) pairs present in train_labels.
df_label = pd.merge(df_features, train_labels, on=['userId', 'date'], how='inner')

df_label.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity,...,hr_max,hr_average,sleep_score,Q1,Q2,Q3,S1,S2,S3,S4
0,2020-08-31 00:10:00,-0.102148,-0.226125,0.720885,user01,0.175609,0.163487,12.38,-2.441332,-3.720634,...,0.797754,0.958187,1.217948,0,0,0,0,1,1,1
1,2020-08-31 00:11:00,-0.101582,-0.226406,0.721456,user01,0.175611,0.163488,13.13925,-2.441332,0.137235,...,0.797754,0.958187,1.217948,0,0,0,0,1,1,1
2,2020-08-31 00:12:00,-0.102154,-0.226862,0.722177,user01,0.175611,0.163488,12.992,-2.441332,0.137235,...,0.797754,0.958187,1.217948,0,0,0,0,1,1,1
3,2020-08-31 00:12:00,-0.102154,-0.226862,0.722177,user01,0.17561,0.163488,12.7495,-2.441332,0.137235,...,0.797754,0.958187,1.217948,0,0,0,0,1,1,1
4,2020-08-31 00:12:00,-0.101931,-0.226364,0.720646,user01,0.175611,0.163488,12.992,-2.441332,0.137235,...,0.797754,0.958187,1.217948,0,0,0,0,1,1,1


In [None]:
# Turn 'user01'-style ids into integers (1, 2, ...).
# NOTE: not re-runnable — once the column is int, the .str accessor raises.
df_label['userId'] = df_label['userId'].str.replace('user', '').astype(int)

##### Feature Data 나눠 놓기

In [None]:
# Despite the name, these are the scaled daily survey/sleep columns plus the two
# key columns; they become the regression targets (y) of the windows built below.
label_cols = ['pmEmotion', 'pmStress', 'userId', 'lightsleepduration', 'wakeupduration', 'deepsleepduration', 'durationtosleep', 'remsleepduration', 'hr_max', 'hr_average', 'sleep_score' ,'date']

# One target row per (userId, date), indexed by that pair in that order.
label_data = df_label[label_cols].drop_duplicates(subset=['userId','date']).set_index(['userId','date'])
# Per-row key arrays aligned with df_label; used to look up the target row of each window.
dates = df_label['date'].values
users_id = df_label['userId'].values

#### Sliding Window 나누기

In [None]:
# Sliding-window function definition
def sliding_windows(X, dates, user_ids, label_data, window_size, step_size):
    """Cut ``X`` into fixed-length windows and attach one target row to each.

    Parameters
    ----------
    X : DataFrame or array-like
        One row per time step.
    dates, user_ids : sequence
        Row-aligned with ``X``. The lookup key of a window is
        ``(dates[start], user_ids[start])`` in exactly that order, so ``dates``
        must hold the values of ``label_data``'s first index level and
        ``user_ids`` those of its second level, whatever the names suggest.
    label_data : DataFrame
        Target rows, indexed by the two-level key described above.
    window_size : int
        Number of consecutive rows per window.
    step_size : int
        Row offset between the starts of consecutive windows.

    Returns
    -------
    tuple of np.ndarray
        ``(X_windows, y_windows)``: the stacked windows and, for each window,
        the values of the ``label_data`` row selected by the window's first row.

    Notes
    -----
    Only the first row of a window picks the target; windows are taken over
    consecutive rows of ``X`` with no check that all rows share that key.
    """
    # Convert once: slicing an ndarray is far cheaper than slicing a DataFrame
    # inside the loop, and the stacked result is the same.
    values = np.asarray(X)

    # Many windows share a key; look each key up in label_data only once.
    label_cache = {}

    X_windows = []
    y_windows = []
    for start in range(0, len(values) - window_size + 1, step_size):
        X_windows.append(values[start:start + window_size])
        key = (dates[start], user_ids[start])
        if key not in label_cache:
            label_cache[key] = label_data.loc[key].values
        y_windows.append(label_cache[key])
    return np.array(X_windows), np.array(y_windows)


In [None]:
# Sliding-window settings (intended as 60 minutes)
# NOTE(review): the window is 60 rows; the previews show several rows sharing one
# timestamp, so 60 rows may span less than 60 minutes — confirm.
window_size = 60  # 60 rows per window
step_size = 1   # advance one row at a time
# users_id is passed in the `dates` slot and dates in the `user_ids` slot on purpose:
# sliding_windows builds its key as (dates[start], user_ids[start]), so this order
# produces (userId, date), matching label_data's index.
X_acc, y_label = sliding_windows(df_label[["x", "y","z"]], users_id, dates, label_data ,window_size, step_size)

In [None]:
# GPS windows over the same rows; the targets are identical to y_label and are discarded.
X_gps, _ = sliding_windows(df_label[["lat", "lon"]], users_id, dates, label_data ,window_size, step_size)

In [None]:
# Heart-rate windows over the same rows; the targets are again discarded.
X_hr, _ = sliding_windows(df_label[["hr"]], users_id, dates, label_data ,window_size, step_size)

In [None]:
# sliding_windows already returns ndarrays, so these calls only make copies;
# the last line introduces the name `y_labels` used by the tensor conversion.
X_acc = np.array(X_acc)
X_gps = np.array(X_gps)
X_hr = np.array(X_hr)
y_labels = np.array(y_label)

#### Tensor size 맞추기

In [None]:
# Convert to float32 tensors (torch.tensor copies the data)
X_acc_tensor = torch.tensor(X_acc, dtype=torch.float32)
X_gps_tensor = torch.tensor(X_gps, dtype=torch.float32)
X_hr_tensor = torch.tensor(X_hr, dtype=torch.float32)
y_tensor = torch.tensor(y_labels, dtype=torch.float32)

In [None]:
# Report the shape of every tensor, one line each.
print("\n".join(
    f"Shape of {tensor_name}: {tensor.shape}"
    for tensor_name, tensor in (
        ("X_acc_tensor", X_acc_tensor),
        ("X_gps_tensor", X_gps_tensor),
        ("X_hr_tensor", X_hr_tensor),
        ("y_tensor", y_tensor),
    )
))

Shape of X_acc_tensor: torch.Size([2576338, 60, 3])
Shape of X_gps_tensor: torch.Size([2576338, 60, 2])
Shape of X_hr_tensor: torch.Size([2576338, 60, 1])
Shape of y_tensor: torch.Size([2576338, 10])


In [None]:
# Prepare the dataset and data loader.
# Tensor order (acc, hr, gps, y) is the order in which the training loop unpacks each batch.
train_dataset = TensorDataset(X_acc_tensor,X_hr_tensor,X_gps_tensor,y_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

#### LSTM Model

In [None]:
# Model definition
class LSTMModel(nn.Module):
    """Three-branch LSTM: one LSTM each for accelerometer, heart-rate and GPS windows.

    Every branch reads a ``(batch, seq_len, features)`` tensor (``batch_first=True``),
    applies dropout to its output sequence, and keeps the last time step. The three
    last-step vectors are concatenated and mapped by one linear layer to
    ``num_classes`` outputs.

    Parameters
    ----------
    input_size_acc, input_size_hr, input_size_gps : int
        Feature count per time step of each input stream.
    hidden_size : int
        Hidden size shared by the three LSTMs.
    num_layers : int
        Number of stacked layers in each LSTM.
    num_classes : int
        Size of the output vector.
    dropout_prob : float
        Dropout probability applied to each LSTM's output sequence.
    """

    def __init__(self, input_size_acc, input_size_hr, input_size_gps ,hidden_size, num_layers, num_classes, dropout_prob):
        super(LSTMModel, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.dropout = nn.Dropout(dropout_prob)

        self.lstm_acc = nn.LSTM(input_size_acc, hidden_size, num_layers, batch_first=True)
        self.lstm_hr = nn.LSTM(input_size_hr, hidden_size, num_layers, batch_first=True)
        self.lstm_gps = nn.LSTM(input_size_gps, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size * 3, num_classes)

    def _encode(self, lstm, x):
        """Run one branch: LSTM -> dropout -> last time step, shape (batch, hidden_size)."""
        # No (h0, c0) is passed: nn.LSTM then starts from zero states created on the
        # input's own device and dtype, so the model no longer depends on a global
        # `device` variable being defined before forward() is called.
        out, _ = lstm(x)
        out = self.dropout(out)
        return out[:, -1, :]

    def forward(self, x_acc, x_hr, x_gps):
        """Return a ``(batch, num_classes)`` tensor for one batch of the three streams."""
        # Branch order is kept as acc -> hr -> gps so dropout draws its random masks
        # in the same sequence as before.
        out_acc = self._encode(self.lstm_acc, x_acc)
        out_hr = self._encode(self.lstm_hr, x_hr)
        out_gps = self._encode(self.lstm_gps, x_gps)

        combined = torch.cat((out_acc, out_hr, out_gps), dim=1)
        out = self.fc(combined)
        return out

In [None]:
# Hyperparameter settings
# Per-stream feature counts are read from the last axis of each tensor.
input_size_acc = X_acc_tensor.shape[2]
input_size_hr = X_hr_tensor.shape[2]
input_size_gps = X_gps_tensor.shape[2]
hidden_size = 64
num_layers = 2
# Output width equals the number of target columns.
num_classes = y_tensor.shape[1]
dropout_prob = 0.2

# Use the GPU when available. LSTMModel.forward also reads this module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(input_size_acc, input_size_hr, input_size_gps, hidden_size, num_layers, num_classes, dropout_prob).to(device)

print(device)

cuda


In [None]:
# Loss function and optimizer: mean squared error against the scaled daily targets.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
# NOTE(review): in the recorded run the loss bottoms out near 0.58 (epochs 6 and 12)
# and ends at 0.77 after epoch 30. There is no validation split, checkpointing or
# learning-rate schedule here, so the final weights are not the best ones seen —
# consider adding them.
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for x_acc, x_hr, x_gps, labels in train_loader:
        x_acc, x_hr, x_gps, labels = x_acc.to(device), x_hr.to(device), x_gps.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(x_acc, x_hr, x_gps)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight it by batch size so the epoch figure is a per-sample mean.
        running_loss += loss.item() * x_acc.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

Epoch 1/30, Loss: 0.8504
Epoch 2/30, Loss: 0.7226
Epoch 3/30, Loss: 0.6656
Epoch 4/30, Loss: 0.6271
Epoch 5/30, Loss: 0.5957
Epoch 6/30, Loss: 0.5837
Epoch 7/30, Loss: 0.6237
Epoch 8/30, Loss: 0.6338
Epoch 9/30, Loss: 0.6303
Epoch 10/30, Loss: 0.6416
Epoch 11/30, Loss: 0.6398
Epoch 12/30, Loss: 0.5829
Epoch 13/30, Loss: 0.5875
Epoch 14/30, Loss: 0.6902
Epoch 15/30, Loss: 0.6969
Epoch 16/30, Loss: 0.7544
Epoch 17/30, Loss: 0.7592
Epoch 18/30, Loss: 0.7276
Epoch 19/30, Loss: 0.7367
Epoch 20/30, Loss: 0.7529
Epoch 21/30, Loss: 0.7455
Epoch 22/30, Loss: 0.7315
Epoch 23/30, Loss: 0.7428
Epoch 24/30, Loss: 0.7501
Epoch 25/30, Loss: 0.7364
Epoch 26/30, Loss: 0.7290
Epoch 27/30, Loss: 0.7400
Epoch 28/30, Loss: 0.7590
Epoch 29/30, Loss: 0.7535
Epoch 30/30, Loss: 0.7730


In [None]:
# Persist only the learned weights (state_dict) to Drive; these are the weights
# as they stand after the final epoch of the loop.
torch.save(model.state_dict(), '/content/drive/MyDrive/train/lstm_random_model_v0_0_2.pt')