In [28]:
import pandas as pd
import numpy as np

# 1. 데이터 로드
sb = pd.read_csv('csv/scoreboard_2024_06_2025.csv')
bt = pd.read_csv('csv/batter_2024_06_2025.csv')
pt = pd.read_csv('csv/pitcher_2024_06_2025.csv')

# 날짜순 정렬 및 승리 여부 이진화
sb = sb.sort_values(['year', 'month', 'day', 'starttime'])
sb['win_binary'] = (sb['result'] == 1).astype(int)

# 2. 타자 데이터를 경기(idx)별 팀 합계로 집계
team_batting = bt.groupby('idx').agg({
    'hit': 'sum',
    'bat_num': 'sum'
}).reset_index()

# 기본 경기 정보와 결합
base_df = pd.merge(sb[['idx', 'team', 'year', 'month', 'day', 'home', 'away', 'r', 'win_binary', 'dbheader']], 
                   team_batting, on='idx', how='left')

# 3. 상대 팀 득점(실점) 정보 매칭 (더블헤더 dbheader 포함)
# game_id에 dbheader를 추가하여 더블헤더 경기를 구분합니다.
base_df['game_id'] = (base_df['year'].astype(str) + 
                     base_df['month'].astype(str).str.zfill(2) + 
                     base_df['day'].astype(str).str.zfill(2) + "_" + 
                     base_df['home'] + "_" + base_df['away'] + "_" + 
                     base_df['dbheader'].astype(str))

opp_scores = base_df[['game_id', 'team', 'r']].rename(columns={'team': 'opp_team', 'r': 'runs_allowed'})
base_df = pd.merge(base_df, opp_scores, on='game_id')
base_df = base_df[base_df['team'] != base_df['opp_team']].copy()

# 4. [특성 1, 2, 3] 최근 30경기 이동 평균 (transform 사용으로 컬럼 유지)
# ※ 주의: 7월 데이터만 사용할 경우 30경기 미만이면 NaN이 발생하므로 min_periods를 설정합니다.
base_df = base_df.sort_values(['team', 'year', 'month', 'day', 'game_id'])

# 평균 득점 및 실점
base_df['f1_avg_runs_scored_30'] = base_df.groupby('team')['r'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).mean())
base_df['f2_avg_runs_allowed_30'] = base_df.groupby('team')['runs_allowed'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).mean())

# 팀 타율 (누적 합계를 구한 뒤 나누는 방식이 더 정확합니다)
rolling_hits = base_df.groupby('team')['hit'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).sum())
rolling_ab = base_df.groupby('team')['bat_num'].transform(lambda x: x.shift(1).rolling(window=30, min_periods=1).sum())
base_df['f3_team_batting_avg_30'] = rolling_hits / rolling_ab

# 5. [특성 4] 선발 투수 시즌 평균 실점
starters = pt[pt['mound'] == 1][['idx', 'name', 'losescore']].copy()
starters = pd.merge(starters, sb[['idx', 'year', 'month', 'day']], on='idx')
starters = starters.sort_values(['name', 'year', 'month', 'day'])
starters['f4_pitcher_runs_avg'] = starters.groupby('name')['losescore'].transform(lambda x: x.shift(1).expanding().mean())

base_df = pd.merge(base_df, starters[['idx', 'f4_pitcher_runs_avg']], on='idx', how='left')

# 6. [특성 5] 팀 전체 승률
base_df['f5_total_win_pct'] = base_df.groupby('team')['win_binary'].transform(lambda x: x.shift(1).expanding().mean())

# 7. [특성 6] 홈/원정 승률
def calc_ha_win_pct(df):
    df = df.copy()
    df['is_home'] = (df['team'] == df['home'])
    # 홈일 때와 원정일 때를 각각 그룹화하여 승률 계산
    df['f6_ha_win_pct'] = df.groupby(['team', 'is_home'])['win_binary'].transform(lambda x: x.shift(1).expanding().mean())
    return df

base_df = calc_ha_win_pct(base_df)

# 8. 최종 MLP 데이터셋 구성
feature_cols = ['f1_avg_runs_scored_30', 'f2_avg_runs_allowed_30', 'f3_team_batting_avg_30', 
                'f4_pitcher_runs_avg', 'f5_total_win_pct', 'f6_ha_win_pct']

# 홈팀 데이터와 원정팀 데이터 분리 후 결합
home_df = base_df[base_df['team'] == base_df['home']][['game_id', 'win_binary'] + feature_cols]
home_df.columns = ['game_id', 'home_win'] + ['h_' + c for c in feature_cols]

away_df = base_df[base_df['team'] == base_df['away']][['game_id'] + feature_cols]
away_df.columns = ['game_id'] + ['a_' + c for c in feature_cols]

final_dataset = pd.merge(home_df, away_df, on='game_id').dropna()

# Target: 홈팀 승리 시 0, 원정팀 승리 시 1
final_dataset['target'] = (final_dataset['home_win'] == 0).astype(int)
final_dataset = final_dataset.drop(columns=['home_win'])

# 저장
final_dataset.to_csv('kbo_mlp_training_data.csv', index=False)
print(f"학습용 데이터셋 생성 완료: {len(final_dataset)} 경기 데이터 포함")

학습용 데이터셋 생성 완료: 1085 경기 데이터 포함


### skit-learn

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# 1. 전처리된 데이터 로드
df = pd.read_csv('kbo_mlp_training_data.csv')

In [30]:

# 2. 특징(X)과 정답(y) 분리
# game_id는 식별용이므로 제외, target(0:홈승, 1:원정승)을 예측
X = df.drop(columns=['game_id', 'target'])
y = df['target']

In [31]:

# 3. 데이터 분할 (훈련 75%, 테스트 25%) [cite: 20]
# shuffle=False는 시간 순서대로 테스트하기 위함입니다 (자료 기준) [cite: 76]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 4. 데이터 스케일링 (StandardScaler 사용) [cite: 75, 77]
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)


In [32]:

# 5. 세 가지 최적화 알고리즘(Solver) 비교 [cite: 64, 71]
solvers = ['sgd', 'adam', 'lbfgs']
results = {}

print("--- 모델 학습 결과 ---")
for s in solvers:
    # 자료의 설정값 반영: 은닉층 (3,), 활성화 함수 relu, 최대 반복 1000 [cite: 47, 83]
    clf = MLPClassifier(
        hidden_layer_sizes=(12, 6), 
        activation='tanh', 
        solver=s, 
        max_iter=2000, 
        random_state=0
    )
    
    # 모델 학습
    clf.fit(X_train_scaled, y_train)
    
    # 예측 및 정확도 계산 [cite: 85, 88]
    y_pred = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[s] = acc
    print(f"Algorithm: {s.upper()} | Accuracy: {acc:.1%}")

--- 모델 학습 결과 ---
Algorithm: SGD | Accuracy: 52.5%




Algorithm: ADAM | Accuracy: 51.2%
Algorithm: LBFGS | Accuracy: 51.6%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [33]:

# 가장 높은 정확도를 보인 알고리즘 확인
best_solver = max(results, key=results.get)
print(f"\n추천 알고리즘: {best_solver.upper()} (정확도 {results[best_solver]:.1%})")


추천 알고리즘: SGD (정확도 52.5%)


In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# 1. 데이터 스케일링 (신경망에서 가장 중요!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. 테스트할 파라미터 조합 설정
param_grid = {
    'hidden_layer_sizes': [(3,), (6,), (12,), (6, 3), (12, 6)], # 노드 수를 늘려보거나 층을 쌓아봄
    'solver': ['sgd', 'adam', 'lbfgs'],
    'activation': ['relu', 'tanh'],
    'max_iter': [2000] # 충분히 학습하도록 반복 횟수 증가
}

# 3. 그리드 서치 실행 (모든 조합을 다 해보고 최고를 찾음)
mlp = MLPClassifier(random_state=1)
grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 4. 결과 출력
print(f"최고 정확도: {grid_search.best_score_:.1%}")
print(f"최적의 파라미터: {grid_search.best_params_}")

# 5. 최적의 모델로 테스트 데이터 평가
best_model = grid_search.best_estimator_
test_acc = best_model.score(X_test_scaled, y_test)
print(f"최종 테스트 결과: {test_acc:.1%}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of it

최고 정확도: 53.7%
최적의 파라미터: {'activation': 'tanh', 'hidden_layer_sizes': (6, 3), 'max_iter': 2000, 'solver': 'adam'}
최종 테스트 결과: 48.8%


In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1. 데이터 로드 및 전처리
df = pd.read_csv('kbo_mlp_training_data.csv')
X = df.drop(columns=['game_id', 'target']).values
y = df['target'].values.reshape(-1, 1)

# 데이터 분할 (셔플 없이 시간 순서대로)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 텐서 변환
X_train_t = torch.FloatTensor(X_train_scaled)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test_scaled)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=16, shuffle=True)


# 2. MLP 모델 클래스 정의
class KBOPredictor(nn.Module):
    def __init__(self):
        super(KBOPredictor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(12, 12),  # 입력층 12 -> 은닉층 12
            nn.Tanh(),
            nn.Linear(12, 6),   # 은닉층 12 -> 은닉층 6
            nn.Tanh(),
            nn.Linear(6, 1),    # 은닉층 6 -> 출력층 1
            nn.Sigmoid()        # 0~1 사이의 확률로 출력
        )

    def forward(self, x):
        return self.model(x)

# 모델, 손실함수, 최적화기 설정
model = KBOPredictor()
criterion = nn.BCELoss()
# weight_decay는 가중치가 너무 커지지 않게 규제함 (L2 규제)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-2)

# 3. 학습 루프
epochs = 200
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # 20회마다 평가 결과 출력
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_t)
            train_acc = ((train_outputs > 0.5).float() == y_train_t).float().mean()
            
            test_outputs = model(X_test_t)
            test_acc = ((test_outputs > 0.5).float() == y_test_t).float().mean()
            
            print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f} | Train Acc: {train_acc:.1%} | Test Acc: {test_acc:.1%}")

# 4. 최종 결과 확인
model.eval()
with torch.no_grad():
    final_pred = (model(X_test_t) > 0.5).float()
    print(f"\n[최종 테스트 정확도]: {accuracy_score(y_test, final_pred):.2%}")

Epoch [20/200] Loss: 0.7076 | Train Acc: 54.5% | Test Acc: 53.0%
Epoch [40/200] Loss: 0.6962 | Train Acc: 54.3% | Test Acc: 54.4%
Epoch [60/200] Loss: 0.6699 | Train Acc: 54.1% | Test Acc: 56.2%
Epoch [80/200] Loss: 0.6980 | Train Acc: 55.1% | Test Acc: 55.3%
Epoch [100/200] Loss: 0.6763 | Train Acc: 54.6% | Test Acc: 56.2%
Epoch [120/200] Loss: 0.6990 | Train Acc: 55.0% | Test Acc: 55.3%
Epoch [140/200] Loss: 0.7009 | Train Acc: 54.1% | Test Acc: 57.1%
Epoch [160/200] Loss: 0.7230 | Train Acc: 55.1% | Test Acc: 55.8%
Epoch [180/200] Loss: 0.6925 | Train Acc: 54.6% | Test Acc: 56.7%
Epoch [200/200] Loss: 0.7214 | Train Acc: 53.5% | Test Acc: 56.7%

[최종 테스트 정확도]: 56.68%


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. 데이터 로드 및 전처리
df = pd.read_csv('kbo_mlp_training_data.csv')
X = df.drop(columns=['game_id', 'target']).values
y = df['target'].values.reshape(-1, 1)

# 데이터 분할 (셔플 없이 시간 순서대로)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 텐서 변환
X_train_t = torch.FloatTensor(X_train_scaled)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test_scaled)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=16, shuffle=True)

# 2. 모델 설계 (Dropout 추가로 과적합 방지)
class AdvancedKBOPredictor(nn.Module):
    def __init__(self):
        super(AdvancedKBOPredictor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(12, 8),
            nn.Tanh(),
            nn.Dropout(0.2),
            nn.Linear(8, 4),
            nn.Tanh(),
            nn.Linear(4, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

# 모델, 손실함수, 최적화기 설정
model = AdvancedKBOPredictor()
criterion = nn.BCELoss()
# weight_decay는 가중치가 너무 커지지 않게 규제함 (L2 규제)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-2)

# 3. 학습 루프
epochs = 200
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # 20회마다 평가 결과 출력
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_t)
            train_acc = ((train_outputs > 0.5).float() == y_train_t).float().mean()
            
            test_outputs = model(X_test_t)
            test_acc = ((test_outputs > 0.5).float() == y_test_t).float().mean()
            
            print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f} | Train Acc: {train_acc:.1%} | Test Acc: {test_acc:.1%}")

# 4. 최종 결과 확인
model.eval()
with torch.no_grad():
    final_pred = (model(X_test_t) > 0.5).float()
    print(f"\n[최종 테스트 정확도]: {accuracy_score(y_test, final_pred):.2%}")

Epoch [20/200] Loss: 0.6894 | Train Acc: 55.3% | Test Acc: 58.1%
Epoch [40/200] Loss: 0.6759 | Train Acc: 54.5% | Test Acc: 54.8%
Epoch [60/200] Loss: 0.6757 | Train Acc: 54.1% | Test Acc: 56.2%
Epoch [80/200] Loss: 0.6524 | Train Acc: 54.7% | Test Acc: 56.7%
Epoch [100/200] Loss: 0.7317 | Train Acc: 54.7% | Test Acc: 55.3%
Epoch [120/200] Loss: 0.6924 | Train Acc: 54.7% | Test Acc: 55.8%
Epoch [140/200] Loss: 0.7032 | Train Acc: 54.5% | Test Acc: 55.8%
Epoch [160/200] Loss: 0.6845 | Train Acc: 55.0% | Test Acc: 55.8%
Epoch [180/200] Loss: 0.6884 | Train Acc: 54.6% | Test Acc: 56.7%
Epoch [200/200] Loss: 0.7507 | Train Acc: 54.0% | Test Acc: 56.2%

[최종 테스트 정확도]: 56.22%
