# User Log 기반 LSTM Churn Model

> `user_logs_v2.csv` 원시 로그에서 2017-03-01 ~ 2017-03-31(31일) 구간의 일별 시퀀스를 만들고, LSTM으로 이탈 여부를 예측하는 노트북입니다.
> 
> - 입력: (N, T, F) 시퀀스 (T=관측 일수, F=로그 피처 수)
> - 타겟: `is_churn` (train_v2.csv 기준)
> - 모델: PyTorch LSTM + MLP 헤드


In [None]:
import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# One seed for both RNG backends so repeated runs of the notebook are reproducible.
RANDOM_STATE = 719
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Notebook display: never truncate columns, show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100


## 1. 라벨 및 로그 데이터 로드


In [None]:
# 1) Load the labels and keep only the user id and the churn target.
label_columns = ["msno", "is_churn"]
train = pd.read_csv("../data/train_v2.csv")  # assumed to contain msno and is_churn
train = train.loc[:, label_columns]

print("Train shape:", train.shape)
train.head()


In [None]:
# 2) Load the raw per-user listening logs.
logs = pd.read_csv(filepath_or_buffer="../data/user_logs_v2.csv")
print("Logs shape:", logs.shape)
logs.head(5)


In [None]:
# Parse the log dates and keep only rows inside the observation window.
#
# NOTE(review): user_logs_v2.csv is expected to store `date` as a YYYYMMDD
# integer (e.g. 20170301) -- confirm against the file. A bare pd.to_datetime()
# reads integers as nanoseconds since the Unix epoch, which would place every
# row in 1970 and make the window filter below drop all of them. Parsing with
# an explicit format avoids that silent failure and raises if the column has
# a different layout.
# The dtype guard keeps this cell safe to re-run once the column is parsed.
if not pd.api.types.is_datetime64_any_dtype(logs["date"]):
    logs["date"] = pd.to_datetime(logs["date"].astype(str), format="%Y%m%d")

# Day right after the window; not referenced by the visible cells below.
T = pd.Timestamp("2017-04-01")
start_date = pd.Timestamp("2017-03-01")
end_date   = pd.Timestamp("2017-03-31")

# Series.between is inclusive on both ends by default, matching >= / <=.
logs = logs[logs["date"].between(start_date, end_date)]
print("Filtered logs:", logs.shape)
logs.head()


## 2. User별 (T, F) 시퀀스 텐서 생성


In [None]:
# Log features that form the F dimension of every time step.
seq_features = ["num_25", "num_50", "num_75", "num_985", "num_100", "total_secs"]

# Collapse the logs to one row per (msno, date) by summing each feature.
daily = logs.groupby(["msno", "date"])[seq_features].sum().reset_index()

print("Daily shape:", daily.shape)
daily.head()


In [None]:
# Users come from the label table; the calendar is the inclusive observation window.
all_users = train["msno"].unique()
all_dates = pd.date_range(start_date, end_date, freq="D")

T_len = len(all_dates)
F_dim = len(seq_features)
print("T_len (window length):", T_len)
print("F_dim (feature dim):", F_dim)

# One (T_len, F_dim) slab per row of `train`, in `train` order.
# Cells stay 0 for days on which a user has no log record.
y_arr = train["is_churn"].to_numpy(dtype=np.int64)
X_seq = np.zeros((len(train), T_len, F_dim), dtype=np.float32)  # (N, T, F)

# Resolve every daily record to (row position in `train`, day offset) with a
# single merge instead of probing a MultiIndex once per (user, day) pair in
# Python; the per-key lookups dominate runtime on large label tables.
row_lookup = pd.DataFrame(
    {"msno": train["msno"].to_numpy(), "row_idx": np.arange(len(train))}
)
hits = daily[["msno", "date", *seq_features]].merge(row_lookup, on="msno", how="inner")

# get_indexer returns -1 for timestamps that are not exactly one of the
# window's midnights; such rows are skipped, as an exact-key lookup would.
day_idx = all_dates.get_indexer(pd.DatetimeIndex(hits["date"]))
in_window = day_idx >= 0
X_seq[hits["row_idx"].to_numpy()[in_window], day_idx[in_window]] = (
    hits.loc[in_window, seq_features].to_numpy(dtype=np.float32)
)

print("X_seq shape:", X_seq.shape)
print("y_arr shape:", y_arr.shape)


## 3. PyTorch Dataset / DataLoader 구성


In [None]:
class UserLogDataset(Dataset):
    """In-memory dataset pairing each user's log sequence with its churn label.

    Both inputs are copied into float32 tensors at construction time; item
    ``i`` is the pair ``(X[i], y[i])``.
    """

    def __init__(self, X, y):
        # X is expected to be (N, T, F) and y (N,); labels are stored as
        # float32 too, the same dtype as the features.
        self.X, self.y = (torch.tensor(arr, dtype=torch.float32) for arr in (X, y))

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (sequence, label) pair stored at position ``idx``."""
        sequence = self.X[idx]
        label = self.y[idx]
        return sequence, label

# train/valid/test split: hold out 30% of the samples, then cut that hold-out
# in half, giving 70/15/15. Both cuts are stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_seq, y_arr, test_size=0.3, stratify=y_arr, random_state=RANDOM_STATE
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE
)

train_ds, valid_ds, test_ds = (
    UserLogDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=256, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=256, shuffle=False)

len(train_ds), len(valid_ds), len(test_ds)


## 4. LSTM 모델 정의


In [None]:
class UserLogLSTM(nn.Module):
    """(Bi)LSTM encoder over daily log sequences followed by an MLP churn head.

    Args:
        input_dim: number of features per time step (F).
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: also run the LSTM backwards in time when True.
        dropout: dropout applied between stacked LSTM layers. Forced to 0.0
            for a single layer, where there is no inter-layer connection to
            apply it to.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2, bidirectional=True, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        out_dim = hidden_dim * (2 if bidirectional else 1)
        # The head ends in a Sigmoid, so forward() returns probabilities, not logits.
        self.head = nn.Sequential(
            nn.Linear(out_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return churn probabilities of shape (B,) for a batch ``x`` of shape (B, T, F)."""
        # h_n: (num_layers * num_directions, B, H), ordered layer by layer with
        # the forward direction before the backward one.
        _, (h_n, _) = self.lstm(x)
        if self.lstm.bidirectional:
            # Summarise the sequence with the last layer's *final* state of
            # each direction. Slicing the per-step output at t = T-1 would
            # pick the backward direction after it has processed only one
            # time step, discarding nearly all of its context.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)  # (B, 2H)
        else:
            # Equal to the per-step output at t = T-1 for a one-directional LSTM.
            summary = h_n[-1]  # (B, H)
        prob = self.head(summary).squeeze(-1)  # (B,)
        return prob


# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = UserLogLSTM(input_dim=F_dim)
model = model.to(device)
model


## 5. 학습 루프 및 평가


In [None]:
from torch.optim import Adam
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, precision_score, f1_score

optimizer = Adam(params=model.parameters(), lr=1e-3)

# Negative-to-positive ratio of the training labels, intended as a pos_weight
# for class-imbalance correction. It can be used by switching the loss to
# BCEWithLogitsLoss and removing the sigmoid from the model, if desired.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
pos_weight = torch.tensor(
    negative_count / positive_count, dtype=torch.float32, device=device
)


def train_one_epoch(model, dataloader):
    """Run one optimisation pass over ``dataloader`` and return the mean per-sample loss.

    Uses the module-level ``optimizer`` and ``device``. The model output is
    fed directly to binary cross-entropy, so it must already be a probability.
    """
    model.train()
    loss_sum = 0.0
    for features, targets in dataloader:
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        probs = model(features)  # (B,), expected to lie in [0, 1]
        batch_loss = F.binary_cross_entropy(probs, targets)
        batch_loss.backward()
        optimizer.step()

        # batch_loss is a batch mean; weight it by the batch size so the final
        # average is per sample even when the last batch is smaller.
        loss_sum += batch_loss.item() * len(targets)

    n_samples = len(dataloader.dataset)
    return loss_sum / n_samples


def evaluate(model, dataloader):
    """Score ``dataloader`` without tracking gradients.

    Uses the module-level ``device``.

    Returns:
        A tuple ``(mean per-sample BCE loss, labels, predicted probabilities)``
        where the last two are NumPy arrays concatenated over all batches.
    """
    model.eval()
    loss_sum = 0.0
    label_chunks, prob_chunks = [], []

    with torch.no_grad():
        for features, targets in dataloader:
            features, targets = features.to(device), targets.to(device)

            probs = model(features)
            batch_loss = F.binary_cross_entropy(probs, targets)
            # Re-weight the batch mean by batch size (see the final division).
            loss_sum += batch_loss.item() * len(targets)

            label_chunks.append(targets.cpu().numpy())
            prob_chunks.append(probs.cpu().numpy())

    labels = np.concatenate(label_chunks)
    probabilities = np.concatenate(prob_chunks)
    return loss_sum / len(dataloader.dataset), labels, probabilities


def compute_metrics(y_true, y_proba, thr=0.5):
    """Compute ranking and thresholded classification metrics for binary predictions.

    Args:
        y_true: ground-truth binary labels (array-like).
        y_proba: predicted positive-class scores (array-like).
        thr: scores greater than or equal to this value count as positive.

    Returns:
        dict with keys ``roc_auc``, ``pr_auc``, ``recall``, ``precision``, ``f1``.
    """
    # Accept plain lists as well as arrays; the comparison below needs an ndarray.
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)
    y_pred = (y_proba >= thr).astype(int)
    # zero_division=0 makes the 0.0 fallback explicit for epochs in which no
    # sample crosses the threshold (or no positive label is present), instead
    # of emitting an UndefinedMetricWarning on each call.
    return {
        "roc_auc": roc_auc_score(y_true, y_proba),
        "pr_auc": average_precision_score(y_true, y_proba),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }


# Train for a fixed number of epochs and report validation metrics after each one.
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    train_loss = train_one_epoch(model, train_dl)
    valid_loss, y_val, p_val = evaluate(model, valid_dl)
    metrics = compute_metrics(y_val, p_val, thr=0.5)

    report_fields = (
        f"Epoch {epoch:02d}",
        f"train_loss={train_loss:.4f}",
        f"valid_loss={valid_loss:.4f}",
        f"ROC-AUC={metrics['roc_auc']:.4f}",
        f"PR-AUC={metrics['pr_auc']:.4f}",
        f"Recall={metrics['recall']:.4f}",
    )
    print(" | ".join(report_fields))
