In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


#### Data 불러오기

In [None]:
# Load the merged sensor data mart from Google Drive and preview its first rows.
DATA_MERGE_CSV = "/content/drive/MyDrive/train/data_mart/data_merge.csv"
df = pd.read_csv(DATA_MERGE_CSV)

df.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity
0,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544928,127.05441,34.94725,0.0,3.0
1,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544849,127.054274,61.097727,0.0,3.0
2,2020-08-30 03:59:00,-0.075511,0.217579,9.747272,user01,37.544954,127.054375,39.202667,0.0,3.0
3,2020-08-30 03:59:00,-0.062778,0.186066,9.749598,user01,37.544928,127.05441,34.94725,0.0,3.0
4,2020-08-30 03:59:00,-0.062778,0.186066,9.749598,user01,37.544849,127.054274,61.097727,0.0,3.0


In [None]:
# Columns that get z-scored in place. `accuracy` is not in this list, so it keeps its raw scale.
# NOTE(review): `activity` looks like a categorical code (3.0 in the preview above) -
# confirm that standardizing it is intended.
columns_to_standardize = ['x', 'y', 'z', 'lat', 'lon', 'hr', 'activity']

# `scaler` keeps the fitted per-column mean and scale after this cell has run.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df[columns_to_standardize])
df[columns_to_standardize] = standardized_values

df.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity
0,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207593,0.197913,34.94725,-1.975513,0.137235
1,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207582,0.197907,61.097727,-1.975513,0.137235
2,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207597,0.197911,39.202667,-1.975513,0.137235
3,2020-08-30 03:59:00,-0.080098,-0.162368,0.712477,user01,0.207593,0.197913,34.94725,-1.975513,0.137235
4,2020-08-30 03:59:00,-0.080098,-0.162368,0.712477,user01,0.207582,0.197907,61.097727,-1.975513,0.137235


In [None]:
# Parse the timestamp column, then derive `date` as the same instant truncated to
# midnight (still a datetime64 column, so it can be merged against parsed dates).
# `.dt.normalize()` is vectorised; going through `.dt.date` would create one Python
# `date` object per row only to parse them back into datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.normalize()

#### train_label과 train_user 데이터 합성

In [None]:
# Read train_label.csv and discard its "Unnamed: 0" column.
train_label_path = "/content/drive/MyDrive/train/dataset/train_label.csv"

train_labels = pd.read_csv(train_label_path).drop(columns="Unnamed: 0")

In [None]:
# Subjects whose label rows are kept.
users = ['user01', 'user02', 'user03', 'user04', 'user05', 'user06', 'user11', 'user12']

# Take an explicit copy: this frame is modified further down, and writing to a
# boolean-filtered selection of `train_labels` raises pandas' SettingWithCopyWarning
# (which this notebook silences via warnings.filterwarnings("ignore")).
train_label_users = train_labels[train_labels['subject_id'].isin(users)].copy()

train_label_users.head()

Unnamed: 0,subject_id,date,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-30,1,0,0,1,1,0,0
1,user01,2020-08-31,0,0,0,0,1,1,1
2,user01,2020-09-01,0,0,0,0,1,1,1
3,user01,2020-09-02,1,0,0,1,1,1,1
4,user01,2020-09-03,1,0,0,0,1,1,1


In [None]:
# Parse `date` to datetime64 and rename `subject_id` to `userId` so both key columns
# match the names used for the merge with `df`. A new frame is built instead of
# assigning into / renaming in place on a filtered selection, which avoids
# SettingWithCopy issues and keeps the cell safe to re-run.
train_label_users = (
    train_label_users
    .assign(date=pd.to_datetime(train_label_users['date']))
    .rename(columns={"subject_id": 'userId'})
)

In [None]:
# Attach the label columns to every sensor row; the inner join keeps only
# (userId, date) pairs present in both frames.
df_label = df.merge(train_label_users, how='inner', on=['userId', 'date'])

df_label.head()

Unnamed: 0,timestamp,x,y,z,userId,lat,lon,accuracy,hr,activity,date,Q1,Q2,Q3,S1,S2,S3,S4
0,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207593,0.197913,34.94725,-1.975513,0.137235,2020-08-30,1,0,0,1,1,0,0
1,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207582,0.197907,61.097727,-1.975513,0.137235,2020-08-30,1,0,0,1,1,0,0
2,2020-08-30 03:59:00,-0.083763,-0.154072,0.712045,user01,0.207597,0.197911,39.202667,-1.975513,0.137235,2020-08-30,1,0,0,1,1,0,0
3,2020-08-30 03:59:00,-0.080098,-0.162368,0.712477,user01,0.207593,0.197913,34.94725,-1.975513,0.137235,2020-08-30,1,0,0,1,1,0,0
4,2020-08-30 03:59:00,-0.080098,-0.162368,0.712477,user01,0.207582,0.197907,61.097727,-1.975513,0.137235,2020-08-30,1,0,0,1,1,0,0


In [None]:
# Turn ids such as 'user01' into the integer 1. Casting to str first keeps the cell
# re-runnable: once the column is already integer, a bare `.str` accessor would raise.
df_label['userId'] = (
    df_label['userId']
    .astype(str)
    .str.replace('user', '', regex=False)
    .astype(int)
)

In [None]:
label_cols = ["userId", "date", "Q1", "Q2", "Q3", "S1", "S2", "S3", "S4"]

# One label row per (userId, date), indexed by that pair in that order.
label_data = (
    df_label[label_cols]
    .drop_duplicates(subset=['userId', 'date'])
    .set_index(['userId', 'date'])
)

# Per-row keys, aligned positionally with the rows of `df_label`.
users_id = df_label['userId'].values
dates = df_label['date'].values

#### Sliding Window 나누기

In [None]:
# Sliding-window helper
def sliding_windows(X, dates, user_ids, label_data, window_size, step_size):
    """Cut fixed-length windows out of ``X`` and attach one label row per window.

    Windows are taken purely by row position: window ``k`` covers rows
    ``[k * step_size, k * step_size + window_size)``. No check is made that all rows
    of a window share the same date/user, and the label is looked up from the
    window's FIRST row only.

    Parameters
    ----------
    X : array-like or DataFrame
        Row-ordered feature table; converted to an ndarray once up front.
    dates, user_ids : indexable
        Per-row keys aligned with ``X``. The lookup key for a window is
        ``(dates[start], user_ids[start])`` - in exactly that order - so the index
        levels of ``label_data`` must be ordered to match.
    label_data : DataFrame
        Label table indexed by the two-part key described above.
    window_size : int
        Number of consecutive rows per window.
    step_size : int
        Row offset between the starts of consecutive windows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked windows and the stacked label values, one entry per window.
        Both are empty arrays when ``X`` has fewer than ``window_size`` rows.
    """
    # Slice one ndarray instead of slicing (and later stacking) a DataFrame per window.
    values = np.asarray(X)
    # Many windows share the same key, so each label row is fetched only once.
    label_cache = {}
    X_windows = []
    y_windows = []
    for start in range(0, len(values) - window_size + 1, step_size):
        X_windows.append(values[start:start + window_size])
        key = (dates[start], user_ids[start])
        if key not in label_cache:
            label_cache[key] = label_data.loc[key].values
        y_windows.append(label_cache[key])
    return np.array(X_windows), np.array(y_windows)


In [None]:

# Sliding-window settings. `window_size` is a number of ROWS (the helper slices
# X[start:start + window_size]), advanced one row at a time.
# NOTE(review): the original comment called this a 10-minute window, but the preview of
# `df` shows several rows sharing one timestamp - confirm 10 rows is what is wanted.
window_size = 10
step_size = 1

# NOTE: `sliding_windows` builds its label key as (dates[i], user_ids[i]) while
# `label_data` is indexed by (userId, date). The user ids are therefore passed as
# `dates` and the dates as `user_ids`; swapping them back without also changing the
# index order would break the lookup.
X_acc, y_label = sliding_windows(
    X=df_label[["x", "y", "z"]],
    dates=users_id,
    user_ids=dates,
    label_data=label_data,
    window_size=window_size,
    step_size=step_size,
)

In [None]:
# GPS windows (labels discarded). User ids go in as `dates` and dates as `user_ids`
# because the helper's key order is (dates[i], user_ids[i]) and `label_data` is
# indexed by (userId, date).
X_gps, _ = sliding_windows(
    X=df_label[["lat", "lon"]],
    dates=users_id,
    user_ids=dates,
    label_data=label_data,
    window_size=window_size,
    step_size=step_size,
)

In [None]:
# Heart-rate windows (labels discarded). User ids go in as `dates` and dates as
# `user_ids` because the helper's key order is (dates[i], user_ids[i]) and
# `label_data` is indexed by (userId, date).
X_hr, _ = sliding_windows(
    X=df_label[["hr"]],
    dates=users_id,
    user_ids=dates,
    label_data=label_data,
    window_size=window_size,
    step_size=step_size,
)

In [None]:
# `sliding_windows` already returns ndarrays, so `np.asarray` is enough to guarantee
# the type; `np.array` would deep-copy every (large) window array a second time.
X_acc = np.asarray(X_acc)
X_gps = np.asarray(X_gps)
X_hr = np.asarray(X_hr)
y_labels = np.asarray(y_label)

#### Tensor size 맞추기

In [None]:
from torch.utils.data import Dataset, DataLoader
class SensorDataset(Dataset):
    """Index-aligned accelerometer / heart-rate / GPS windows with their labels.

    Each item is the tuple ``(acc, hr, gps, y)`` for one index, with every element
    converted to a ``float32`` tensor. The dataset length is the length of ``X_acc``.
    """

    def __init__(self, X_acc, X_hr, X_gps, y):
        self.X_acc, self.X_hr, self.X_gps, self.y = X_acc, X_hr, X_gps, y

    def __len__(self):
        # Only the accelerometer container defines the size.
        return len(self.X_acc)

    def __getitem__(self, idx):
        sources = (self.X_acc, self.X_hr, self.X_gps, self.y)
        return tuple(
            torch.tensor(source[idx], dtype=torch.float32) for source in sources
        )

In [None]:
# Convert every window array (and the labels) to float32 tensors.
X_acc_tensor, X_gps_tensor, X_hr_tensor, y_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_acc, X_gps, X_hr, y_labels)
)

In [None]:
# Sanity check: all four tensors should report the same first (window-count) dimension.
print(f"Shape of X_acc_tensor: {X_acc_tensor.shape}")
print(f"Shape of X_gps_tensor: {X_gps_tensor.shape}")
print(f"Shape of X_hr_tensor: {X_hr_tensor.shape}")
print(f"Shape of y_tensor: {y_tensor.shape}")

Shape of X_acc_tensor: torch.Size([866780, 10, 3])
Shape of X_gps_tensor: torch.Size([866780, 10, 2])
Shape of X_hr_tensor: torch.Size([866780, 10, 1])
Shape of y_tensor: torch.Size([866780, 7])


In [None]:
# Build the dataset and a shuffling loader that yields mini-batches of 32 windows.
train_dataset = SensorDataset(
    X_acc=X_acc_tensor.numpy(),
    X_hr=X_hr_tensor.numpy(),
    X_gps=X_gps_tensor.numpy(),
    y=y_tensor.numpy(),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

#### LSTM Model

In [None]:
# Residual block: stacked LSTM + self-attention layers with a skip connection
class ResidualBlock(nn.Module):
    """``num_layers`` LSTM layers, each followed by single-head self-attention, plus a shortcut.

    The shortcut is an empty ``nn.Sequential`` (identity) when ``input_dim`` equals
    ``output_dim``; otherwise it is ``Linear(input_dim, output_dim)`` followed by
    ``LeakyReLU``. Both the LSTMs and the attention modules use ``batch_first=True``.
    """

    def __init__(self, input_dim, output_dim, num_layers):
        super().__init__()

        # Only the first LSTM sees `input_dim`; later ones consume the previous output.
        lstm_stack = []
        for layer_idx in range(num_layers):
            in_features = output_dim if layer_idx else input_dim
            lstm_stack.append(nn.LSTM(in_features, output_dim, batch_first=True))
        self.layers = nn.ModuleList(lstm_stack)

        attention_stack = []
        for _ in range(num_layers):
            attention_stack.append(
                nn.MultiheadAttention(embed_dim=output_dim, num_heads=1, batch_first=True)
            )
        self.attentions = nn.ModuleList(attention_stack)

        if input_dim == output_dim:
            self.shortcut = nn.Sequential()
        else:
            # Project the input so it can be added to the block output.
            self.shortcut = nn.Sequential(
                nn.Linear(input_dim, output_dim),
                nn.LeakyReLU(),
            )

    def forward(self, x):
        skip = self.shortcut(x)
        hidden = x
        for recurrent, self_attention in zip(self.layers, self.attentions):
            hidden, _ = recurrent(hidden)
            # Query, key and value are all the LSTM output.
            hidden, _ = self_attention(hidden, hidden, hidden)
        return hidden + skip

# FeatureNet: per-sensor encoder (Conv1d -> BatchNorm -> LeakyReLU -> Dropout -> ResidualBlock)
class FeatureNet(nn.Module):
    """Encode one sensor stream of shape (batch, seq_length, input_size) into (batch, hidden_dim).

    The convolution (kernel 5, stride 1, padding 2) maps ``input_size`` channels to 32
    and keeps the sequence length. The residual block then works on the 32-channel
    sequence, and only its final time step is returned.
    """

    def __init__(self, input_size, hidden_dim, num_layers, dropout_prob):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels=input_size, out_channels=32, kernel_size=5, stride=1, padding=2
        )
        self.batch_norm = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(dropout_prob)
        self.residual_block = ResidualBlock(
            input_dim=32, output_dim=hidden_dim, num_layers=num_layers
        )

    def forward(self, x):
        # Conv1d wants (batch, channels, seq_length).
        channels_first = x.transpose(1, 2)
        conv_out = self.batch_norm(self.conv(channels_first))
        conv_out = self.dropout(nn.functional.leaky_relu(conv_out))

        # Back to (batch, seq_length, channels) for the batch_first LSTM stack.
        sequence = self.residual_block(conv_out.transpose(1, 2))

        # Keep the last time step of the residual block output (LSTM/attention output
        # plus shortcut), not the raw LSTM state.
        return sequence[:, -1, :]

# MultiFeatureModel: one FeatureNet per sensor, fused by a small fully connected head
class MultiFeatureModel(nn.Module):
    """Multi-label classifier over accelerometer, heart-rate and GPS windows.

    Each sensor has its own ``FeatureNet`` (3, 1 and 2 input channels respectively).
    The three ``hidden_dim`` feature vectors are concatenated and passed through
    Linear -> LeakyReLU -> Dropout -> BatchNorm1d -> Linear -> sigmoid, so the output
    is one value in [0, 1] per class, shape (batch, num_classes).
    """

    def __init__(self, num_classes, hidden_dim, num_layers, dropout_prob):
        super().__init__()
        encoder_kwargs = dict(
            hidden_dim=hidden_dim, num_layers=num_layers, dropout_prob=dropout_prob
        )
        self.acc_net = FeatureNet(input_size=3, **encoder_kwargs)
        self.hr_net = FeatureNet(input_size=1, **encoder_kwargs)
        self.gps_net = FeatureNet(input_size=2, **encoder_kwargs)

        self.fc1 = nn.Linear(hidden_dim * 3, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_acc, x_hr, x_gps):
        # Concatenation order (acc, hr, gps) fixes the layout that fc1 is trained on.
        fused = torch.cat(
            (self.acc_net(x_acc), self.hr_net(x_hr), self.gps_net(x_gps)), dim=1
        )
        hidden = nn.functional.leaky_relu(self.fc1(fused))
        hidden = self.batch_norm(self.dropout(hidden))
        # Sigmoid is applied here, so the model emits probabilities rather than logits.
        return torch.sigmoid(self.fc2(hidden))


In [None]:
# Hyperparameters
SEED = 42
hidden_dim = 64
num_layers = 2
dropout_prob = 0.2
num_classes = y_tensor.shape[1]  # one output per label column

# Seed PyTorch's global RNG before any weights are created, so weight initialisation
# (and anything else that later draws from the global RNG) starts from the same state
# after every Restart & Run All.
torch.manual_seed(SEED)

# Build the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiFeatureModel(num_classes, hidden_dim, num_layers, dropout_prob).to(device)

print(device)

cuda


In [None]:
# Loss and optimiser. The model already applies a sigmoid, hence BCELoss on probabilities.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # Each batch is (acc, hr, gps, labels); move all four to the model's device.
        x_acc, x_hr, x_gps, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(x_acc, x_hr, x_gps)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # `loss` is a batch mean; weight it by batch size so the epoch value is a
        # per-sample mean even when the final batch is smaller.
        running_loss += loss.item() * x_acc.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

Epoch 1/30, Loss: 0.6389
Epoch 2/30, Loss: 0.6300
Epoch 3/30, Loss: 0.6245
Epoch 4/30, Loss: 0.6204
Epoch 5/30, Loss: 0.6185
Epoch 6/30, Loss: 0.6170
Epoch 7/30, Loss: 0.6131
Epoch 8/30, Loss: 0.6087
Epoch 9/30, Loss: 0.6078
Epoch 10/30, Loss: 0.6103
Epoch 11/30, Loss: 0.6100
Epoch 12/30, Loss: 0.6064
Epoch 13/30, Loss: 0.6033
Epoch 14/30, Loss: 0.6031
Epoch 15/30, Loss: 0.6030
Epoch 16/30, Loss: 0.6041
Epoch 17/30, Loss: 0.6027
Epoch 18/30, Loss: 0.5993
Epoch 19/30, Loss: 0.6003
Epoch 20/30, Loss: 0.6025
Epoch 21/30, Loss: 0.6000
Epoch 22/30, Loss: 0.5980
Epoch 23/30, Loss: 0.5986
Epoch 24/30, Loss: 0.5934
Epoch 25/30, Loss: 0.5959
Epoch 26/30, Loss: 0.6036
Epoch 27/30, Loss: 0.5980
Epoch 28/30, Loss: 0.5924
Epoch 29/30, Loss: 0.5917
Epoch 30/30, Loss: 0.5928


In [None]:
# Persist only the learned parameters (state_dict), not the pickled module object.
MODEL_CHECKPOINT_PATH = '/content/drive/MyDrive/train/lstm_model_v0_0_7.pt'
torch.save(model.state_dict(), MODEL_CHECKPOINT_PATH)