In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
df_active = pd.read_csv("raw_data/kbo_hitters_active.csv")
df_retired = pd.read_csv("raw_data/kbo_hitters_retired.csv")

In [5]:
df_active.to_csv('raw_data/kbo_hitters_active_enc.csv', index=False, encoding='utf-8-sig')
df_retired.to_csv('raw_data/kbo_hitters_retired_enc.csv', index=False, encoding='utf-8-sig')

In [19]:
df_active.to_csv("kbo_hitters_active.csv", index=False, encoding='utf-8-sig')

In [20]:
df_retired.to_csv("kbo_hitters_retired.csv", index=False, encoding='utf-8-sig')

In [42]:
# 현역
df_active = pd.read_csv("raw_data/kbo_hitters_active.csv")
df_active.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1718 entries, 0 to 1717
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     1718 non-null   object
 1   pic_url  1718 non-null   object
 2   birth    1718 non-null   int64 
 3   season   1718 non-null   int64 
 4   team     1718 non-null   object
 5   AVG      1718 non-null   object
 6   G        1718 non-null   int64 
 7   PA       1718 non-null   int64 
 8   AB       1718 non-null   int64 
 9   R        1718 non-null   int64 
 10  H        1718 non-null   int64 
 11  2B       1718 non-null   int64 
 12  3B       1718 non-null   int64 
 13  HR       1718 non-null   int64 
 14  TB       1718 non-null   int64 
 15  RBI      1718 non-null   int64 
 16  SB       1718 non-null   int64 
 17  CS       1718 non-null   int64 
 18  BB       1718 non-null   int64 
 19  HBP      1718 non-null   int64 
 20  SO       1718 non-null   int64 
 21  GDP      1718 non-null   int64 
 22  

In [None]:
df_active = df_active.drop(columns=["AVG", "SLG", "OBP"])
df_active

KeyError: "['AVG', 'SLG', 'OBP'] not found in axis"

In [36]:
# 1. 그룹 기준
group_cols = ['name', 'birth']

# 2. 합산 대상 컬럼 정의 (문자열 컬럼 제외)
exclude_cols = ['name', 'pic_url', 'birth', 'season', 'team']
sum_cols = [col for col in df_active.columns if col not in exclude_cols]

# 동일 인물 기준 그룹화
years = df_active.groupby(['name', 'birth'])['season'].agg(first_year='min', last_year='max').reset_index()

# 원본 df에 병합 (left join)
df_active = df_active.merge(years, on=['name', 'birth'], how='left')

# 3. 그룹별 합산
sum_df = df_active.groupby(group_cols)[sum_cols].sum().reset_index()

# 4. active_year 추가 (시즌 수)
active_year_df = df_active.groupby(group_cols).size().reset_index(name='active_year')

# 5. 메타데이터: name, birth, team, pic_url, 만 유지
meta_df = df_active.groupby(group_cols)[['team', 'pic_url']].first().reset_index()

# 6. 모두 합치기
result = sum_df.merge(active_year_df, on=group_cols)
result = result.merge(meta_df, on=group_cols)
result = result.merge(years, on=group_cols, how='left')

# 결과 확인
result.head()

Unnamed: 0,name,birth,G,PA,AB,R,H,2B,3B,HR,...,BB,HBP,SO,GDP,E,active_year,team,pic_url,first_year,last_year
0,강민성,1999,23,44,36,2,5,1,0,0,...,7,0,15,0,2,3,KT,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2023,2025
1,강민호,1985,2408,8858,7752,978,2147,383,12,340,...,817,163,1550,247,149,22,롯데,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2004,2025
2,강백호,1999,834,3580,3133,511,953,192,6,127,...,407,15,641,47,46,8,KT,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2018,2025
3,강성우,2005,3,3,2,0,1,0,0,0,...,0,0,0,0,0,2,롯데,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2024,2025
4,강승호,1994,741,2582,2343,320,600,133,20,54,...,153,31,650,47,85,9,LG,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2016,2025


In [38]:
# 1루타 (1B) 계산
result['1B'] = result['H'] - result['2B'] - result['3B'] - result['HR']

# AVG 계산
result['AVG'] = result['H'] / result['AB']

# OBP 계산 (간략화: HBP, SF 등 없음)
result['OBP'] = (result['H'] + result['BB']) / (result['AB'] + result['BB'])

# SLG 계산
result['SLG'] = (
    result['1B'] + 2 * result['2B'] + 3 * result['3B'] + 4 * result['HR']
) / result['AB']

# OPS 계산
result['OPS'] = result['OBP'] + result['SLG']

# 소수점 정리 (선택)
result[['AVG', 'OBP', 'SLG', 'OPS']] = result[['AVG', 'OBP', 'SLG', 'OPS']].round(3)

# 1B는 계산용이므로 제거 (선택)
result.drop(columns=['1B'], inplace=True)

# first_year : 기록중에 제일 숫자가 작은 년도
# last_year : 기록중에 제일 숫자가 큰 년도
# active_year : 행수
result

Unnamed: 0,name,birth,G,PA,AB,R,H,2B,3B,HR,...,E,active_year,team,pic_url,first_year,last_year,AVG,OBP,SLG,OPS
0,강민성,1999,23,44,36,2,5,1,0,0,...,2,3,KT,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2023,2025,0.139,0.279,0.167,0.446
1,강민호,1985,2408,8858,7752,978,2147,383,12,340,...,149,22,롯데,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2004,2025,0.277,0.346,0.461,0.807
2,강백호,1999,834,3580,3133,511,953,192,6,127,...,46,8,KT,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2018,2025,0.304,0.384,0.491,0.875
3,강성우,2005,3,3,2,0,1,0,0,0,...,0,2,롯데,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2024,2025,0.500,0.500,0.500,1.000
4,강승호,1994,741,2582,2343,320,600,133,20,54,...,85,9,LG,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2016,2025,0.256,0.302,0.399,0.701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,홍현빈,1997,241,258,216,44,44,4,1,0,...,2,8,KT,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2017,2025,0.204,0.309,0.231,0.541
273,황동하,2002,4,0,0,0,0,0,0,0,...,0,3,KIA,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2023,2025,,,,
274,황성빈,1997,329,1071,967,196,283,35,15,5,...,14,4,롯데,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2022,2025,0.293,0.342,0.375,0.717
275,황영묵,1999,158,514,460,72,133,15,4,4,...,14,2,한화,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2024,2025,0.289,0.351,0.365,0.716


In [10]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 27 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         277 non-null    object 
 1   birth        277 non-null    int64  
 2   G            277 non-null    int64  
 3   PA           277 non-null    int64  
 4   AB           277 non-null    int64  
 5   R            277 non-null    int64  
 6   H            277 non-null    int64  
 7   2B           277 non-null    int64  
 8   3B           277 non-null    int64  
 9   HR           277 non-null    int64  
 10  TB           277 non-null    int64  
 11  RBI          277 non-null    int64  
 12  SB           277 non-null    int64  
 13  CS           277 non-null    int64  
 14  BB           277 non-null    int64  
 15  HBP          277 non-null    int64  
 16  SO           277 non-null    int64  
 17  GDP          277 non-null    int64  
 18  E            277 non-null    int64  
 19  active_y

In [13]:
result = result[result['AB'] >= 30]

In [40]:
result.to_csv("data/kbo_active_total_final.csv", index=False, encoding='utf-8-sig')

In [43]:
# 은퇴
df_retired = pd.read_csv("raw_data/kbo_hitters_retired.csv")
df_retired.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9793 entries, 0 to 9792
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     9793 non-null   object
 1   pic_url  9793 non-null   object
 2   birth    9793 non-null   int64 
 3   season   9793 non-null   int64 
 4   team     9793 non-null   object
 5   AVG      9793 non-null   object
 6   G        9793 non-null   int64 
 7   PA       9793 non-null   int64 
 8   AB       9793 non-null   int64 
 9   R        9793 non-null   int64 
 10  H        9793 non-null   int64 
 11  2B       9793 non-null   int64 
 12  3B       9793 non-null   int64 
 13  HR       9793 non-null   int64 
 14  TB       9793 non-null   int64 
 15  RBI      9793 non-null   int64 
 16  SB       9793 non-null   int64 
 17  CS       9793 non-null   int64 
 18  BB       9793 non-null   int64 
 19  HBP      9793 non-null   int64 
 20  SO       9793 non-null   int64 
 21  GDP      9793 non-null   int64 
 22  

In [26]:
df_retired = df_retired.drop(columns=["AVG", "SLG", "OBP"])
df_retired
df_retired.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9793 entries, 0 to 9792
Data columns (total 22 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     9793 non-null   object
 1   pic_url  9793 non-null   object
 2   birth    9793 non-null   int64 
 3   season   9793 non-null   int64 
 4   team     9793 non-null   object
 5   G        9793 non-null   int64 
 6   PA       9793 non-null   int64 
 7   AB       9793 non-null   int64 
 8   R        9793 non-null   int64 
 9   H        9793 non-null   int64 
 10  2B       9793 non-null   int64 
 11  3B       9793 non-null   int64 
 12  HR       9793 non-null   int64 
 13  TB       9793 non-null   int64 
 14  RBI      9793 non-null   int64 
 15  SB       9793 non-null   int64 
 16  CS       9793 non-null   int64 
 17  BB       9793 non-null   int64 
 18  HBP      9793 non-null   int64 
 19  SO       9793 non-null   int64 
 20  GDP      9793 non-null   int64 
 21  E        9793 non-null   int64 
dtype

In [28]:
# 1. 그룹 기준
group_cols = ['name', 'birth']

# 2. 합산 대상 컬럼 정의 (문자열 컬럼 제외)
exclude_cols = ['name', 'pic_url', 'birth', 'season', 'team']
sum_cols = [col for col in df_retired.columns if col not in exclude_cols]

# 동일 인물 기준 그룹화
years = df_retired.groupby(['name', 'birth'])['season'].agg(first_year='min', last_year='max').reset_index()

# 3. 그룹별 합산
sum_df = df_retired.groupby(group_cols)[sum_cols].sum().reset_index()

# 4. active_year 추가 (시즌 수)
active_year_df = df_retired.groupby(group_cols).size().reset_index(name='active_year')

# 5. 메타데이터: name, birth, pic_url, team만 유지
meta_df = df_retired.groupby(group_cols)[['team','pic_url']].first().reset_index()

# 6. 모두 합치기
result = sum_df.merge(active_year_df, on=group_cols)
result = result.merge(meta_df, on=group_cols)
result = result.merge(years, on=group_cols, how='left')

# 결과 확인
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         2282 non-null   object
 1   birth        2282 non-null   int64 
 2   G            2282 non-null   int64 
 3   PA           2282 non-null   int64 
 4   AB           2282 non-null   int64 
 5   R            2282 non-null   int64 
 6   H            2282 non-null   int64 
 7   2B           2282 non-null   int64 
 8   3B           2282 non-null   int64 
 9   HR           2282 non-null   int64 
 10  TB           2282 non-null   int64 
 11  RBI          2282 non-null   int64 
 12  SB           2282 non-null   int64 
 13  CS           2282 non-null   int64 
 14  BB           2282 non-null   int64 
 15  HBP          2282 non-null   int64 
 16  SO           2282 non-null   int64 
 17  GDP          2282 non-null   int64 
 18  E            2282 non-null   int64 
 19  active_year  2282 non-null 

In [29]:
# 1루타 (1B) 계산
result['1B'] = result['H'] - result['2B'] - result['3B'] - result['HR']

# AVG 계산
result['AVG'] = result['H'] / result['AB']

# OBP 계산 (간략화: HBP, SF 등 없음)
result['OBP'] = (result['H'] + result['BB']) / (result['AB'] + result['BB'])

# SLG 계산
result['SLG'] = (
    result['1B'] + 2 * result['2B'] + 3 * result['3B'] + 4 * result['HR']
) / result['AB']

# OPS 계산
result['OPS'] = result['OBP'] + result['SLG']

# 소수점 정리 (선택)
result[['AVG', 'OBP', 'SLG', 'OPS']] = result[['AVG', 'OBP', 'SLG', 'OPS']].round(3)

# 1B는 계산용이므로 제거 (선택)
result.drop(columns=['1B'], inplace=True)

# first_year : 기록중에 제일 숫자가 작은 년도
# last_year : 기록중에 제일 숫자가 큰 년도
# active_year : 행수

In [30]:
result = result[result['AB'] >= 30]

In [31]:
result.head()


Unnamed: 0,name,birth,G,PA,AB,R,H,2B,3B,HR,...,E,active_year,team,pic_url,first_year,last_year,AVG,OBP,SLG,OPS
1,가르시아,1975,469,1906,1697,257,447,85,5,105,...,28,5,한화,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2002,2011,0.263,0.337,0.505,0.842
2,가르시아,1985,50,206,183,27,62,9,0,8,...,9,1,LG,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2018,2018,0.339,0.37,0.519,0.889
3,가르시아,1993,39,156,136,21,28,6,1,4,...,6,1,LG,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2022,2022,0.206,0.303,0.353,0.656
4,가코,1981,58,220,189,21,46,8,0,1,...,1,1,삼성,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2011,2011,0.243,0.322,0.302,0.624
6,강경학,1992,516,1390,1167,185,277,36,12,13,...,47,9,한화,https://6ptotvmi5753.edge.naverncp.com/KBO_IMA...,2011,2021,0.237,0.33,0.322,0.653


In [32]:
result.to_csv("data/kbo_hitters_retired_total.csv", index=False, encoding='utf-8-sig')