In [10]:
import pandas as pd
import numpy as np

sb = pd.read_csv('csv/scoreboard_2024_09_2025.csv')
bt = pd.read_csv('csv/batter_2024_09_2025.csv')
pt = pd.read_csv('csv/pitcher_2024_09_2025.csv')


In [11]:
# 날짜순 정렬
sb = sb.sort_values(['year', 'month', 'day'])

# [수정] 승률 계산을 위한 이진화 (1: 승리, 0: 패배/무승부)
sb['win_binary'] = (sb['result'] == 1).astype(int)


# 2. 타자 데이터를 경기(idx)별 팀 합계로 집계
team_batting = bt.groupby('idx').agg({
    'hit': 'sum',
    'bat_num': 'sum'
}).reset_index()

# 기본 경기 정보와 결합
base_df = pd.merge(sb[['idx', 'team', 'year', 'month', 'day', 'dbheader', 'home', 'away', 'r', 'result', 'win_binary']], 
                   team_batting, on='idx', how='left')

In [12]:

# 3. 상대 팀 득점(실점) 정보 매칭 (zfill 에러 수정)
base_df['game_id'] = base_df['year'].astype(str) + \
                     base_df['month'].astype(str).str.zfill(2) + \
                     base_df['day'].astype(str).str.zfill(2) + "_" + \
                     base_df['dbheader'].astype(str) + "_" + \
                     base_df['home'] + "_" + base_df['away']


opp_scores = base_df[['game_id', 'team', 'r']].rename(columns={'team': 'opp_team', 'r': 'runs_allowed'})
base_df = pd.merge(base_df, opp_scores, on='game_id')
base_df = base_df[base_df['team'] != base_df['opp_team']]


In [13]:

# 4. [특성 1, 2, 3] 최근 30경기 이동 평균 (Rolling 30)
base_df = base_df.sort_values(['team', 'year', 'month', 'day'])

def get_rolling_features(group):
    # ① 평균 득점, ② 평균 실점 (최근 30경기) [cite: 29, 30, 39]
    group['f1_avg_runs_scored_30'] = group['r'].shift(1).rolling(window=30).mean()
    group['f2_avg_runs_allowed_30'] = group['runs_allowed'].shift(1).rolling(window=30).mean()
    
    # ③ 팀 타율 (최근 30경기) [cite: 32, 39]
    rolling_hits = group['hit'].shift(1).rolling(window=30).sum()
    rolling_ab = group['bat_num'].shift(1).rolling(window=30).sum()
    group['f3_team_batting_avg_30'] = rolling_hits / rolling_ab
    return group

base_df = base_df.groupby('team', group_keys=False).apply(get_rolling_features)


In [14]:

# 5. [특성 4] 선발 투수 시즌 평균 실점 [cite: 34]
starters = pt[pt['mound'] == 1][['idx', 'name', 'losescore']]
starters = pd.merge(starters, sb[['idx', 'year', 'month', 'day']], on='idx')
starters = starters.sort_values(['name', 'year', 'month', 'day'])
starters['f4_pitcher_runs_avg'] = starters.groupby('name')['losescore'].transform(lambda x: x.shift(1).expanding().mean())

base_df = pd.merge(base_df, starters[['idx', 'f4_pitcher_runs_avg']], on='idx', how='left')


In [15]:
# 6. [수정] 특성 5: 팀 전체 승률 (win_binary 사용) [cite: 36]
base_df['f5_total_win_pct'] = base_df.groupby('team')['win_binary'].transform(lambda x: x.shift(1).expanding().mean())

KeyError: 'team'

In [None]:

# 7. [수정] 특성 6: 홈/원정 승률 (win_binary 사용) [cite: 38]
def get_ha_win_pct(group):
    is_home = (group['team'] == group['home'])
    group.loc[is_home, 'f6_ha_win_pct'] = group.loc[is_home, 'win_binary'].shift(1).expanding().mean()
    
    is_away = (group['team'] == group['away'])
    group.loc[is_away, 'f6_ha_win_pct'] = group.loc[is_away, 'win_binary'].shift(1).expanding().mean()
    return group

base_df = base_df.groupby('team', group_keys=False).apply(get_ha_win_pct)


In [None]:

# 8. 최종 MLP 데이터셋 구성 (홈 6개 + 원정 6개 = 12개 입력값) [cite: 46]
feature_cols = ['f1_avg_runs_scored_30', 'f2_avg_runs_allowed_30', 'f3_team_batting_avg_30', 
                'f4_pitcher_runs_avg', 'f5_total_win_pct', 'f6_ha_win_pct']

home_data = base_df[base_df['team'] == base_df['home']][['game_id', 'result', 'win_binary'] + feature_cols]
home_data.columns = ['game_id', 'raw_result', 'win_binary'] + ['h_' + c for c in feature_cols]

away_data = base_df[base_df['team'] == base_df['away']][['game_id'] + feature_cols]
away_data.columns = ['game_id'] + ['a_' + c for c in feature_cols]

KeyError: 'team'

In [None]:

# 결측치 제거 후 병합
final_dataset = pd.merge(home_data, away_data, on='game_id').dropna()

# 무승부(0) 제거: 승/패만 학습에 사용
final_dataset = final_dataset[final_dataset['raw_result'].isin([1, -1])].copy()

# Target: 홈승이면 0, 원정승이면 1 (즉 홈이 졌으면 원정승)
final_dataset['target'] = (final_dataset['raw_result'] == -1).astype(int)

final_dataset = final_dataset.drop(columns=['raw_result', 'win_binary'])


# 저장
final_dataset.to_csv('kbo_mlp_training_data.csv', index=False)
print("학습용 데이터셋 생성 완료: kbo_mlp_training_data.csv")

학습용 데이터셋 생성 완료: kbo_mlp_training_data.csv


### skit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# 1. 전처리된 데이터 로드
df = pd.read_csv('kbo_mlp_training_data.csv')

In [None]:

# 2. 특징(X)과 정답(y) 분리
# game_id는 식별용이므로 제외, target(0:홈승, 1:원정승)을 예측
X = df.drop(columns=['game_id', 'target'])
y = df['target']

In [None]:

# 3. 데이터 분할 (훈련 75%, 테스트 25%) [cite: 20]
# shuffle=False는 시간 순서대로 테스트하기 위함입니다 (자료 기준) [cite: 76]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 4. 데이터 스케일링 (StandardScaler 사용) [cite: 75, 77]
sc_X = StandardScaler()

X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)

[[ 0.82923389 -0.13608825  2.3896831  ...  1.62582348  1.19067157
   1.16990639]
 [ 1.31203772 -0.18055125  2.80207175 ...  0.46479099 -0.77519319
  -0.42405794]
 [ 1.31203772 -0.5362552   2.94324749 ...  0.64070501 -0.93202873
  -0.68336314]
 ...
 [ 1.02235542 -0.80303316  0.30121306 ... -1.82209118  0.95869219
   1.0445071 ]
 [-1.53650486  0.79763462 -1.7183722  ...  2.04801712  0.35658323
   1.06694698]
 [-1.48822448  1.24226456 -1.64698259 ...  0.02500596  0.33030606
   1.30032166]]


In [None]:

# 5. 세 가지 최적화 알고리즘(Solver) 비교 [cite: 64, 71]
solvers = ['sgd', 'adam', 'lbfgs']
results = {}

print("--- 모델 학습 결과 ---")
for s in solvers:
    # 자료의 설정값 반영: 은닉층 (3,), 활성화 함수 relu, 최대 반복 1000 [cite: 47, 83]
    clf = MLPClassifier(
        hidden_layer_sizes=(12, 6), 
        activation='tanh', 
        solver=s, 
        max_iter=2000, 
        random_state=0
    )
    
    # 모델 학습
    clf.fit(X_train_scaled, y_train)
    
    # 예측 및 정확도 계산 [cite: 85, 88]
    y_pred = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results[s] = acc
    print(f"Algorithm: {s.upper()} | Accuracy: {acc:.1%}")

--- 모델 학습 결과 ---
Algorithm: SGD | Accuracy: 51.7%




Algorithm: ADAM | Accuracy: 47.7%
Algorithm: LBFGS | Accuracy: 47.7%


In [None]:

# 가장 높은 정확도를 보인 알고리즘 확인
best_solver = max(results, key=results.get)
print(f"\n추천 알고리즘: {best_solver.upper()} (정확도 {results[best_solver]:.1%})")


추천 알고리즘: SGD (정확도 51.7%)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# 1. 데이터 스케일링 (신경망에서 가장 중요!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. 테스트할 파라미터 조합 설정
param_grid = {
    'hidden_layer_sizes': [(3,), (6,), (12,), (6, 3), (12, 6)], # 노드 수를 늘려보거나 층을 쌓아봄
    'solver': ['sgd', 'adam', 'lbfgs'],
    'activation': ['relu', 'tanh'],
    'max_iter': [2000] # 충분히 학습하도록 반복 횟수 증가
}

# 3. 그리드 서치 실행 (모든 조합을 다 해보고 최고를 찾음)
mlp = MLPClassifier(random_state=1)
grid_search = GridSearchCV(mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 4. 결과 출력
print(f"최고 정확도: {grid_search.best_score_:.1%}")
print(f"최적의 파라미터: {grid_search.best_params_}")

# 5. 최적의 모델로 테스트 데이터 평가
best_model = grid_search.best_estimator_
test_acc = best_model.score(X_test_scaled, y_test)
print(f"최종 테스트 결과: {test_acc:.1%}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


최고 정확도: 55.3%
최적의 파라미터: {'activation': 'tanh', 'hidden_layer_sizes': (12, 6), 'max_iter': 2000, 'solver': 'adam'}
최종 테스트 결과: 45.6%




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1. 데이터 로드 및 전처리
df = pd.read_csv('kbo_mlp_training_data.csv')
X = df.drop(columns=['game_id', 'target']).values
y = df['target'].values.reshape(-1, 1)

# 데이터 분할 (셔플 없이 시간 순서대로)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 텐서 변환
X_train_t = torch.FloatTensor(X_train_scaled)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test_scaled)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=16, shuffle=True)


# 2. MLP 모델 클래스 정의
class KBOPredictor(nn.Module):
    def __init__(self):
        super(KBOPredictor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(12, 12),  # 입력층 12 -> 은닉층 12
            nn.Tanh(),
            nn.Linear(12, 6),   # 은닉층 12 -> 은닉층 6
            nn.Tanh(),
            nn.Linear(6, 1),    # 은닉층 6 -> 출력층 1
            nn.Sigmoid()        # 0~1 사이의 확률로 출력
        )

    def forward(self, x):
        return self.model(x)

# 모델, 손실함수, 최적화기 설정
model = KBOPredictor()
criterion = nn.BCELoss()
# weight_decay는 가중치가 너무 커지지 않게 규제함 (L2 규제)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-2)

# 3. 학습 루프
epochs = 200
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # 20회마다 평가 결과 출력
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_t)
            train_acc = ((train_outputs > 0.5).float() == y_train_t).float().mean()
            
            test_outputs = model(X_test_t)
            test_acc = ((test_outputs > 0.5).float() == y_test_t).float().mean()
            
            print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f} | Train Acc: {train_acc:.1%} | Test Acc: {test_acc:.1%}")

# 4. 최종 결과 확인
model.eval()
with torch.no_grad():
    final_pred = (model(X_test_t) > 0.5).float()
    print(f"\n[최종 테스트 정확도]: {accuracy_score(y_test, final_pred):.2%}")

Epoch [20/200] Loss: 0.6884 | Train Acc: 55.6% | Test Acc: 56.5%
Epoch [40/200] Loss: 0.7183 | Train Acc: 56.5% | Test Acc: 51.6%
Epoch [60/200] Loss: 0.6718 | Train Acc: 55.9% | Test Acc: 48.4%
Epoch [80/200] Loss: 0.6661 | Train Acc: 55.8% | Test Acc: 54.8%
Epoch [100/200] Loss: 0.6578 | Train Acc: 55.4% | Test Acc: 53.2%
Epoch [120/200] Loss: 0.7007 | Train Acc: 55.8% | Test Acc: 58.1%
Epoch [140/200] Loss: 0.7167 | Train Acc: 55.6% | Test Acc: 55.9%
Epoch [160/200] Loss: 0.7078 | Train Acc: 55.0% | Test Acc: 58.1%
Epoch [180/200] Loss: 0.7008 | Train Acc: 56.1% | Test Acc: 57.5%
Epoch [200/200] Loss: 0.6688 | Train Acc: 55.8% | Test Acc: 58.6%

[최종 테스트 정확도]: 58.60%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 1. 데이터 로드 및 전처리
df = pd.read_csv('kbo_mlp_training_data.csv')
X = df.drop(columns=['game_id', 'target']).values
y = df['target'].values.reshape(-1, 1)

# 데이터 분할 (셔플 없이 시간 순서대로)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 텐서 변환
X_train_t = torch.FloatTensor(X_train_scaled)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test_scaled)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=16, shuffle=True)

# 2. 모델 설계 (Dropout 추가로 과적합 방지)
class AdvancedKBOPredictor(nn.Module):
    def __init__(self):
        super(AdvancedKBOPredictor, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(12, 8),
            nn.Tanh(),
            nn.Dropout(0.5),
            nn.Linear(8, 4),
            nn.Tanh(),
            nn.Linear(4, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

# 모델, 손실함수, 최적화기 설정
model = AdvancedKBOPredictor()
criterion = nn.BCELoss()
# weight_decay는 가중치가 너무 커지지 않게 규제함 (L2 규제)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-2) # 1e-4 -> 1e-2

# 3. 학습 루프
epochs = 200
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # 20회마다 평가 결과 출력
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            train_outputs = model(X_train_t)
            train_acc = ((train_outputs > 0.5).float() == y_train_t).float().mean()
            
            test_outputs = model(X_test_t)
            test_acc = ((test_outputs > 0.5).float() == y_test_t).float().mean()
            
            print(f"Epoch [{epoch+1}/{epochs}] Loss: {loss.item():.4f} | Train Acc: {train_acc:.1%} | Test Acc: {test_acc:.1%}")

# 4. 최종 결과 확인
model.eval()
with torch.no_grad():
    final_pred = (model(X_test_t) > 0.5).float()
    print(f"\n[최종 테스트 정확도]: {accuracy_score(y_test, final_pred):.2%}")

Epoch [20/200] Loss: 0.6216 | Train Acc: 53.5% | Test Acc: 47.7%
Epoch [40/200] Loss: 0.7189 | Train Acc: 55.8% | Test Acc: 51.7%
Epoch [60/200] Loss: 0.6734 | Train Acc: 54.1% | Test Acc: 51.7%
Epoch [80/200] Loss: 0.7967 | Train Acc: 55.8% | Test Acc: 50.3%
Epoch [100/200] Loss: 0.6995 | Train Acc: 55.6% | Test Acc: 47.0%
Epoch [120/200] Loss: 0.5840 | Train Acc: 57.2% | Test Acc: 51.0%
Epoch [140/200] Loss: 0.8945 | Train Acc: 56.3% | Test Acc: 51.0%
Epoch [160/200] Loss: 0.6400 | Train Acc: 57.0% | Test Acc: 51.0%
Epoch [180/200] Loss: 0.6716 | Train Acc: 56.0% | Test Acc: 54.4%
Epoch [200/200] Loss: 0.6401 | Train Acc: 56.3% | Test Acc: 59.1%

[최종 테스트 정확도]: 59.06%
