In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr  # 피어슨 상관계수 계산
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt

In [2]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.loss")
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# 하이퍼파라미터
num_of_try = 23
num_epochs = 30
learning_rate = 0.001
num_folds = 5
dropout = 0.3

In [None]:
draw_based_lottery = pd.read_csv('/Users/parksung-cheol/Desktop/논문/도박논문(국내)/gambling_data/draw_based_lottery/draw_based_lottery.csv')
instant_lottery = pd.read_csv('/Users/parksung-cheol/Desktop/논문/도박논문(국내)/gambling_data/instant_lottery/instant_lottery.csv')
online_lottery = pd.read_csv('/Users/parksung-cheol/Desktop/gambling_data/online_lottery/online_lottery.csv')
pension_lottery = pd.read_csv('/Users/parksung-cheol/Desktop/gambling_data/pension_lottery/pension_lottery.csv')

dataframes = [draw_based_lottery, instant_lottery, online_lottery,pension_lottery]

lottery_all = pd.concat(dataframes, ignore_index=True)

lottery_all.sort_values(by=['User_ID'], inplace=True,ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/parksung-cheol/Desktop/gambling_data/draw_based_lottery/draw_based_lottery.csv'

In [9]:
lottery_all

Unnamed: 0,User_ID,Date,Time,Price
0,0000,71,20,1000
1,0000,66,23,1000
2,0000,57,13,1000
3,00000,12,12,5000
4,00000,18,15,5000
...,...,...,...,...
23009210,zzzzzzzzzzl,32,21,49000
23009211,zzzzzzzzzzl,33,9,19000
23009212,zzzzzzzzzzl,47,2,75000
23009213,zzzzzzzzzzl,32,21,2000


In [None]:
# 데이터 불러오기
merged_df = pd.read_csv('./final_all/final_all_lottery.csv')

In [11]:
# Total_Amount 컬럼 생성
merged_df['Total_Amount'] = (
    merged_df['F2'] + merged_df['F4'] + merged_df['F6'] + merged_df['F8']
)

# Total_Amount 및 구매 금액 Feature에 로그 변환 적용
for col in ['F2', 'F4', 'F6', 'F8', 'Total_Amount']:
    merged_df[col] = np.log1p(merged_df[col])  # log(1 + x)

# Min-Max Scaling 적용
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

merged_df[['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']] = feature_scaler.fit_transform(
    merged_df[['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']]
)
merged_df[['Total_Amount']] = target_scaler.fit_transform(merged_df[['Total_Amount']])

KeyError: 'F2'

In [6]:
def calculate_accuracy(y_true, y_pred):
    """
    정규화된 MAE를 기반으로 Accuracy 계산
    """
    mae = np.mean(np.abs(y_true - y_pred))  # 평균 절대 오차
    mean_true = np.mean(y_true)  # 실제 값의 평균  
    normalized_mae = mae / (mean_true + 1e-8)  # 정규화 (0으로 나누기 방지)
    accuracy = 1 - normalized_mae
    return max(0, accuracy) * 100 # Accuracy 하한을 0으로 제한

# LSTM 모델 (Hidden layer 2개, Dropout 추가)
class LSTMModel(nn.Module):
    def __init__(self, input_dim=8, hidden_dim=50, output_dim=1, num_layers=2, dropout=dropout):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)  # out: (batch_size, seq_len, hidden_dim)
        out = out[:, -1, :]  # 마지막 타임스텝의 출력
        out = self.fc(out)
        return out
    
# Sliding Window: 레이블을 7일 합계로 구함
class LotteryDataset(Dataset):
    def __init__(self, df, input_len=28, output_len=7):
        self.data = []
        self.labels = []
        user_groups = df.groupby('User_ID')
        for _, group in user_groups:
            group = group.sort_values(by='Date')
            feature_data = group[['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']].values
            total_amount = group['Total_Amount'].values

            for i in range(len(total_amount) - input_len - output_len + 1):
                self.data.append(feature_data[i:i+input_len])
                self.labels.append(total_amount[i+input_len:i+input_len+output_len].sum())  # 7일 합계

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.data[idx], dtype=torch.float32),
            torch.tensor([self.labels[idx]], dtype=torch.float32)
        )

In [7]:
# 평가 함수
def evaluate_model(model, test_loader, device, target_scaler):
    model.eval()
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(outputs.cpu().numpy())

    y_true = np.array(all_targets)
    y_pred = np.array(all_predictions)

    # 역정규화
    y_true_original = target_scaler.inverse_transform(y_true.reshape(-1, 1)).flatten()
    y_pred_original = target_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

    # 평가 지표
    mae = np.mean(np.abs(y_true_original - y_pred_original))
    mse = mean_squared_error(y_true_original, y_pred_original)
    r2 = r2_score(y_true_original, y_pred_original)
    accuracy = calculate_accuracy(y_true_original, y_pred_original)
    
    # 피어슨 상관계수
    pearson_corr, _ = pearsonr(y_true_original, y_pred_original)

    return mae, mse, r2, accuracy, pearson_corr, y_true_original, y_pred_original

In [8]:
os.makedirs(f'./final_all/LSTM/{num_of_try}', exist_ok=True)

In [9]:
# 시각화 함수
def plot_loss_per_epoch(train_losses, val_losses, filename=f"./final_all/LSTM/{num_of_try}/loss_per_epoch.png"):
    plt.figure(figsize=(13, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, marker='o', color='b', label='Train Loss')
    plt.plot(range(1, len(val_losses) + 1), val_losses, marker='o', color='r', label='Validation Loss')
    plt.title('Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.legend()
    plt.savefig(filename, dpi=600)
    plt.close()

def plot_predictions_vs_actual(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/predictions_vs_actual.png"):
    plt.figure(figsize=(10, 6))
    plt.plot(y_true, color='blue', label='Actual', alpha=0.7)
    plt.plot(y_pred, color='red', label='Predicted', alpha=0.6)
    plt.title('Actual vs Predicted')
    plt.xlabel('Index')
    plt.ylabel('Total Amount')
    plt.legend()
    plt.savefig(filename, dpi=600)
    plt.close()

def plot_scatter_actual_vs_predicted(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/scatter_actual_vs_predicted.png"):
    plt.figure(figsize=(15, 15))
    plt.scatter(y_true, y_pred, alpha=0.8, color='black')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', linestyle='--', linewidth=3)
    plt.title('Scatter Plot: Actual vs Predicted', fontsize=25)
    plt.xlabel('Normalized Total Amount', fontsize=17)
    plt.ylabel('Predicted Total Amount', fontsize=17)
    plt.grid(True)
    plt.savefig(filename, bbox_inches='tight', dpi=600)
    plt.close()

def plot_scatter_actual_vs_predicted2(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/scatter_actual_vs_predicted.png"):
    plt.figure(figsize=(15, 15))
    plt.scatter(y_true, y_pred, alpha=0.8, color='purple')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='black', linestyle='--', linewidth=4)
    plt.title('Scatter Plot: Actual vs Predicted', fontsize=25)
    plt.xlabel('Normalized Total Amount', fontsize=17)
    plt.ylabel('Predicted Total Amount', fontsize=17)
    plt.grid(True)
    plt.savefig(filename, bbox_inches='tight', dpi=600)
    plt.close()

def plot_scatter_actual_vs_predicted3(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/scatter_actual_vs_predicted.png"):
    plt.figure(figsize=(15, 15))
    plt.scatter(y_true, y_pred, alpha=1, color='purple')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='green', linestyle='-', linewidth=3)
    plt.title('Scatter Plot: Actual vs Predicted', fontsize=25)
    plt.xlabel('Normalized Total Amount', fontsize=20)
    plt.ylabel('Predicted Total Amount', fontsize=20)
    plt.grid(True)
    plt.savefig(filename, bbox_inches='tight', dpi=600)
    plt.close()

def plot_scatter_actual_vs_predicted4(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/scatter_actual_vs_predicted.png"):
    plt.figure(figsize=(15, 15))
    plt.scatter(y_true, y_pred, alpha=1, color='purple')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='black', linestyle='-', linewidth=3)
    plt.title('Scatter Plot: Actual vs Predicted', fontsize=25)
    plt.xlabel('Normalized Total Amount', fontsize=20)
    plt.ylabel('Predicted Total Amount', fontsize=20)
    plt.grid(True)
    plt.savefig(filename, bbox_inches='tight', dpi=600)
    plt.close()

def plot_scatter_actual_vs_predicted5(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/scatter_actual_vs_predicted.png"):
    plt.figure(figsize=(15, 15))
    plt.scatter(y_true, y_pred, alpha=1, color='orange')  # 점 색상을 밝은 주황색으로 변경
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='black', linestyle='-', linewidth=3)
    plt.title('Scatter Plot: Actual vs Predicted', fontsize=25)
    plt.xlabel('Normalized Total Amount', fontsize=20)
    plt.ylabel('Predicted Total Amount', fontsize=20)
    plt.grid(True)
    plt.savefig(filename, bbox_inches='tight', dpi=600)
    plt.close()

In [10]:
def save_predictions_to_txt(y_true, y_pred, filename=f"./final_all/LSTM/{num_of_try}/predictions.txt"):
    # Stack y_true and y_pred as columns
    data = np.column_stack((y_true, y_pred))
    # Save as a text file
    np.savetxt(filename, data, fmt="%.6f", delimiter="\t", header="y_true\ty_pred", comments="")
    print(f"File saved to {filename}")

In [11]:
# 데이터 분리
user_ids = merged_df['User_ID'].unique()
train_user_ids, test_user_ids = train_test_split(user_ids, test_size=0.1, random_state=42)

train_df = merged_df[merged_df['User_ID'].isin(train_user_ids)]
test_df = merged_df[merged_df['User_ID'].isin(test_user_ids)]

# Dataset 생성
train_dataset = LotteryDataset(train_df)
test_dataset = LotteryDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [12]:
# 모델 학습 및 평가 (K-Fold)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.MSELoss()

kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
# best_r2 = float('-inf')
best_p = float('-inf')
best_model_state = None

## 학습

In [13]:
fold_results=[]

fold = 1
for train_index, val_index in kf.split(train_user_ids):
    print(f"Fold {fold}/{num_folds}")
    fold_train_ids = train_user_ids[train_index]
    fold_val_ids = train_user_ids[val_index]

    fold_train_df = merged_df[merged_df['User_ID'].isin(fold_train_ids)]
    fold_val_df = merged_df[merged_df['User_ID'].isin(fold_val_ids)]

    fold_train_dataset = LotteryDataset(fold_train_df)
    fold_val_dataset = LotteryDataset(fold_val_df)

    fold_train_loader = DataLoader(fold_train_dataset, batch_size=32, shuffle=True)
    fold_val_loader = DataLoader(fold_val_dataset, batch_size=32, shuffle=False)

    model = LSTMModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    epoch_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        train_true = []
        train_pred = []

        # Training loop
        for inputs, targets in fold_train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            epoch_loss += loss.item()

            # Collect predictions and true values for train accuracy
            train_true.extend(targets.cpu().numpy())
            train_pred.extend(outputs.cpu().detach().numpy())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_loss = epoch_loss / len(fold_train_loader)
        epoch_losses.append(avg_loss)

        # Calculate train accuracy
        train_true_original = target_scaler.inverse_transform(np.array(train_true).reshape(-1, 1)).flatten()
        train_pred_original = target_scaler.inverse_transform(np.array(train_pred).reshape(-1, 1)).flatten()
        train_accuracy = calculate_accuracy(train_true_original, train_pred_original)

        # Validation loop
        model.eval()
        val_loss = 0
        val_true = []
        val_pred = []

        with torch.no_grad():
            for inputs, targets in fold_val_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

                # Collect predictions and true values for validation accuracy
                val_true.extend(targets.cpu().numpy())
                val_pred.extend(outputs.cpu().detach().numpy())

        avg_val_loss = val_loss / len(fold_val_loader)
        val_losses.append(avg_val_loss)

        # Calculate validation accuracy
        val_true_original = target_scaler.inverse_transform(np.array(val_true).reshape(-1, 1)).flatten()
        val_pred_original = target_scaler.inverse_transform(np.array(val_pred).reshape(-1, 1)).flatten()
        val_accuracy = calculate_accuracy(val_true_original, val_pred_original)

        # Print metrics for the current epoch
        print(
            f"Epoch {epoch+1}/{num_epochs} - "
            f"Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
            f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%"
        )

    # Validation evaluation after all epochs
    mae, mse, r2, accuracy, pearson_corr, _, _ = evaluate_model(model, fold_val_loader, device, target_scaler)
    print(f"Validation Results - MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}, Accuracy: {accuracy:.2f}%, p score: {pearson_corr:.4f}")

    # Fold 결과 저장
    fold_results.append({
        "fold": fold,
        "mae": mae,
        "mse": mse,
        "r2": r2,
        "accuracy": accuracy,
        "pearson_corr": pearson_corr
    })

    if pearson_corr > best_p:
        best_p = pearson_corr
        best_model_state = model.state_dict()

    fold += 1

Fold 1/5
Epoch 1/30 - Train Loss: 0.8404, Train Accuracy: 87.55%, Val Loss: 0.4337, Val Accuracy: 90.89%
Epoch 2/30 - Train Loss: 0.4348, Train Accuracy: 90.81%, Val Loss: 0.4042, Val Accuracy: 91.35%
Epoch 3/30 - Train Loss: 0.4140, Train Accuracy: 91.07%, Val Loss: 0.3789, Val Accuracy: 91.74%
Epoch 4/30 - Train Loss: 0.4048, Train Accuracy: 91.18%, Val Loss: 0.3740, Val Accuracy: 91.61%
Epoch 5/30 - Train Loss: 0.3970, Train Accuracy: 91.28%, Val Loss: 0.3892, Val Accuracy: 91.18%
Epoch 6/30 - Train Loss: 0.3918, Train Accuracy: 91.35%, Val Loss: 0.3847, Val Accuracy: 91.49%
Epoch 7/30 - Train Loss: 0.3863, Train Accuracy: 91.42%, Val Loss: 0.3741, Val Accuracy: 91.55%
Epoch 8/30 - Train Loss: 0.3837, Train Accuracy: 91.46%, Val Loss: 0.3692, Val Accuracy: 91.51%
Epoch 9/30 - Train Loss: 0.3793, Train Accuracy: 91.52%, Val Loss: 0.3550, Val Accuracy: 91.98%
Epoch 10/30 - Train Loss: 0.3776, Train Accuracy: 91.54%, Val Loss: 0.3677, Val Accuracy: 91.61%
Epoch 11/30 - Train Loss: 0.37

In [14]:
# 전체 fold 결과 출력
print("\n=== Cross-Validation Results ===")
for result in fold_results:
    print(
        f"Fold {result['fold']} - "
        f"MAE: {result['mae']:.4f}, MSE: {result['mse']:.4f}, "
        f"R2: {result['r2']:.4f}, Accuracy: {result['accuracy']:.2f}%, "
        f"p score: {result['pearson_corr']:.4f}"
    )


=== Cross-Validation Results ===
Fold 1 - MAE: 2.5534, MSE: 11.3829, R2: 0.8032, Accuracy: 91.98%, p score: 0.8970
Fold 2 - MAE: 2.5876, MSE: 11.8607, R2: 0.7841, Accuracy: 91.80%, p score: 0.8857
Fold 3 - MAE: 2.6873, MSE: 13.1109, R2: 0.7730, Accuracy: 91.49%, p score: 0.8916
Fold 4 - MAE: 2.5693, MSE: 11.7327, R2: 0.7806, Accuracy: 91.87%, p score: 0.8851
Fold 5 - MAE: 2.6804, MSE: 12.2047, R2: 0.7820, Accuracy: 91.48%, p score: 0.8846


In [15]:
# 최적 모델 저장
best_model_path = f'./final_all/LSTM/models/model_{num_of_try}_best.pth'
torch.save(best_model_state, best_model_path)
print(f'Best Model saved to {best_model_path} with p score: {best_p:.4f}')

Best Model saved to ./final_all/LSTM/models/model_23_best.pth with p score: 0.8970


In [16]:
# 최적 모델 로드 후 테스트 평가
best_model = LSTMModel().to(device)
best_model.load_state_dict(torch.load(best_model_path))
mae, mse, r2, accuracy, pearson_corr, y_true_original, y_pred_original = evaluate_model(
    best_model, test_loader, device, target_scaler
)
print(f'Test Results - MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}, Accuracy: {accuracy:.2f}%, p score: {pearson_corr:.4f}')

Test Results - MAE: 2.5765, MSE: 11.6240, R2: 0.7986, Accuracy: 91.87%, p score: 0.8941


In [17]:
# Loss 시각화 저장
plot_loss_per_epoch(epoch_losses, val_losses)

# 예측값과 실제값 비교 시각화 저장
plot_predictions_vs_actual(y_true_original, y_pred_original, filename=f"./final_all/LSTM/{num_of_try}/predictions_vs_actual_test.png")

In [18]:
# 함수와 파일명을 리스트로 정의
plot_functions = [
    plot_scatter_actual_vs_predicted,
    plot_scatter_actual_vs_predicted2,
    plot_scatter_actual_vs_predicted3,
    plot_scatter_actual_vs_predicted4,
    plot_scatter_actual_vs_predicted5
]

# 반복문으로 각 함수 호출
for i, plot_func in enumerate(plot_functions, start=1):
    filename = f"./final_all/LSTM/{num_of_try}/scatter_plot_test{i}.png"
    plot_func(y_true_original, y_pred_original, filename=filename)
    print(f"{i}번째 plot 저장되었습니다.")

1번째 plot 저장되었습니다.
2번째 plot 저장되었습니다.
3번째 plot 저장되었습니다.
4번째 plot 저장되었습니다.
5번째 plot 저장되었습니다.


In [19]:
save_predictions_to_txt(y_true_original, y_pred_original, filename=f"./final_all/LSTM/{num_of_try}/predictions.txt")

File saved to ./final_all/LSTM/23/predictions.txt
