# SteamDB 기반 접속자 수 수집

## 조건
- 입력: 02-3. steam_data_real.csv (컬럼: appid)
- 수집 대상: SteamCharts 각 게임 페이지의 월별 평균 동접, 월별 피크
- 2023년 8월 이후의 월만 수집

## 절차
- AppID로 순회하면서 파싱
- 기간 필터링 & 가공
- 행 구성 & 정규화

In [None]:
import time
import os
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime

# 설정
SAVE_PATH = 'steamcharts_temp.csv'
base_date = datetime.strptime('August 2023', '%B %Y')

# 크롬 옵션 설정
options = Options()
options.add_argument('--start-maximized')
options.add_experimental_option('detach', True)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
)

# AppID 목록 불러오기
df = pd.read_csv('02-3. steam_data_real.csv')

# 기존에 저장된 데이터 불러오기
if os.path.exists(SAVE_PATH):
    saved_df = pd.read_csv(SAVE_PATH)
    processed_ids = set(saved_df['id'].astype(int))
    print(f"이미 처리된 ID 수: {len(processed_ids)}")
else:
    saved_df = pd.DataFrame(columns=['id'])
    processed_ids = set()

# 크롬 드라이버 실행
driver = webdriver.Chrome(options=options)
all_columns = set(saved_df.columns) if not saved_df.empty else {'id'}

# 크롤링 시작
for gid in df['appid']:
    if gid in processed_ids:
        print(f"[{gid}] 이미 처리됨 - 건너뜀")
        continue

    success = False
    retry_count = 0

    while not success and retry_count < 2:
        try:
            url = f'https://steamcharts.com/app/{gid}'
            driver.get(url)
            time.sleep(2) 

            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            if "Internal Server Error" in html or "Maybe try again in a few minutes" in html:
                print(f"[{gid}] Internal Server Error 감지됨 - 건너뜀")
                retry_count += 1
                break  # 다음 게임으로 넘어감

            data = soup.find('table', class_='common-table')
            if not data:
                print(f"[{gid}] 데이터 없음 - 건너뜀")
                break

            raw_date = [i.text.strip() for i in data.find_all('td', class_='month-cell left')]
            peak = [i.text.strip() for i in data.find_all('td', class_='right num')]
            avg = [i.text.strip() for i in data.find_all('td', class_='right num-f')]

            filtered_date, filtered_peak, filtered_avg = [], [], []
            for d, p, a in zip(raw_date, peak, avg):
                try:
                    date_obj = datetime.strptime(d, '%B %Y')
                    if date_obj >= base_date:
                        filtered_date.append(d)
                        filtered_peak.append(p)
                        filtered_avg.append(a)
                except:
                    continue

            peak_int = [int(x.replace(',', '')) for x in filtered_peak]
            avg_float = [float(x.replace(',', '')) for x in filtered_avg]

            col_peak = [f"{d.split(' ')[1]} {d.split(' ')[0]} peak" for d in filtered_date]
            col_avg = [f"{d.split(' ')[1]} {d.split(' ')[0]} avg" for d in filtered_date]

            peak_series = pd.Series(peak_int, index=col_peak)
            avg_series = pd.Series(avg_float, index=col_avg)
            df_new = pd.concat([peak_series, avg_series]).to_frame().T
            df_new.insert(0, 'id', gid)

            # 누락 컬럼 보완
            all_columns.update(df_new.columns)
            for col in all_columns:
                if col not in df_new.columns:
                    df_new[col] = 0
            df_new = df_new[list(all_columns)]

            saved_df = pd.concat([saved_df, df_new], ignore_index=True)
            saved_df.to_csv(SAVE_PATH, index=False)
            print(f"[{gid}] 저장 완료")
            success = True
            processed_ids.add(gid)

        except Exception as e:
            print(f"[{gid}] 에러 발생: {e} - 다음 게임으로 넘어감")
            break  # 에러 발생 시 드라이버 유지, 다음 게임으로

# 마지막 저장
saved_df.to_csv(SAVE_PATH, index=False)
print("크롤링 완료.")

이미 처리된 ID 수: 2712
[1384160] 이미 처리됨 - 건너뜀
[2141910] 이미 처리됨 - 건너뜀
[1477590] 이미 처리됨 - 건너뜀
[1326470] 이미 처리됨 - 건너뜀
[550] 이미 처리됨 - 건너뜀
[477160] 이미 처리됨 - 건너뜀
[2290180] 이미 처리됨 - 건너뜀
[739630] 이미 처리됨 - 건너뜀
[1677280] 이미 처리됨 - 건너뜀
[2157560] 이미 처리됨 - 건너뜀
[2617700] 이미 처리됨 - 건너뜀
[570] 이미 처리됨 - 건너뜀
[2737070] 이미 처리됨 - 건너뜀
[2141730] 이미 처리됨 - 건너뜀
[1361210] 이미 처리됨 - 건너뜀
[1407200] 이미 처리됨 - 건너뜀
[526870] 이미 처리됨 - 건너뜀
[1016920] 이미 처리됨 - 건너뜀
[582010] 이미 처리됨 - 건너뜀
[1203220] 이미 처리됨 - 건너뜀
[3097560] 이미 처리됨 - 건너뜀
[1222700] 이미 처리됨 - 건너뜀
[1426300] Internal Server Error 감지됨 - 건너뜀
[2235200] 이미 처리됨 - 건너뜀
[281990] 이미 처리됨 - 건너뜀
[3176060] 이미 처리됨 - 건너뜀
[899770] 이미 처리됨 - 건너뜀
[2399830] 이미 처리됨 - 건너뜀
[1238810] 이미 처리됨 - 건너뜀
[594650] 이미 처리됨 - 건너뜀
[1260320] 이미 처리됨 - 건너뜀
[427410] 이미 처리됨 - 건너뜀
[1151340] 이미 처리됨 - 건너뜀
[1635450] 이미 처리됨 - 건너뜀
[1846380] 이미 처리됨 - 건너뜀
[1568590] 이미 처리됨 - 건너뜀
[892970] 이미 처리됨 - 건너뜀
[246900] 이미 처리됨 - 건너뜀
[2350270] 저장 완료
[1933390] Internal Server Error 감지됨 - 건너뜀
[2943650] 이미 처리됨 - 건너뜀
[251570] 이미 처리됨 - 건너뜀
[310

In [2]:
df = pd.read_csv('steamcharts_temp.csv')
df

Unnamed: 0,2023 November peak,2024 February peak,2025 February avg,2024 December avg,2024 August peak,2024 January peak,2025 January peak,2025 May peak,2024 March avg,2025 April peak,...,2024 July avg,2025 January avg,2024 July peak,2024 November avg,2025 March avg,2024 February avg,2023 September avg,2025 February peak,2025 April avg,2024 April peak
0,2747.0,2503.0,1523.3,1848.9,4231.0,3493.0,2494.0,10191.0,2650.9,2890.0,...,2871.5,1567.2,6794.0,3299.6,1714.5,1632.2,2183.7,2417.0,1836.7,4181.0
1,7357.0,7573.0,6148.8,5387.2,9627.0,5450.0,7679.0,9083.0,4159.9,12119.0,...,5098.8,5368.5,12058.0,6499.9,5850.9,4319.1,3762.1,11048.0,6627.6,8543.0
2,637.0,325.0,93.5,89.0,419.0,272.0,731.0,646.0,107.5,211.0,...,131.4,115.2,1143.0,99.1,95.6,98.9,94.8,242.0,78.2,324.0
3,17207.0,53964.0,9338.0,16985.8,13890.0,17076.0,44610.0,13846.0,18698.6,39476.0,...,8765.7,17323.0,18357.0,13790.0,11098.6,11038.0,5583.1,20587.0,12970.8,20005.0
4,71345.0,35794.0,28121.3,27355.1,34348.0,58686.0,41486.0,50899.0,24424.3,36296.0,...,27361.6,26582.6,48914.0,30372.5,24513.3,22900.6,26449.6,51703.0,20885.2,35358.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5556,17.0,12.0,3.9,6.9,20.0,14.0,16.0,21.0,19.0,29.0,...,6.7,5.4,24.0,4.4,19.3,3.3,3.5,13.0,8.5,38.0
5557,18.0,15.0,4.0,4.6,8.0,15.0,18.0,7.0,5.1,9.0,...,3.5,5.2,10.0,7.5,3.2,5.2,9.2,15.0,3.5,35.0
5558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5561 entries, 0 to 5560
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   2023 November peak   5561 non-null   float64
 1   2024 February peak   5561 non-null   float64
 2   2025 February avg    5561 non-null   float64
 3   2024 December avg    5561 non-null   float64
 4   2024 August peak     5561 non-null   float64
 5   2024 January peak    5561 non-null   float64
 6   2025 January peak    5561 non-null   float64
 7   2025 May peak        5561 non-null   float64
 8   2024 March avg       5561 non-null   float64
 9   2025 April peak      5561 non-null   float64
 10  2025 June peak       5561 non-null   float64
 11  2025 March peak      5561 non-null   float64
 12  2023 October avg     5561 non-null   float64
 13  2024 May peak        5561 non-null   float64
 14  2024 October avg     5561 non-null   float64
 15  2025 July peak       5561 non-null   f

In [4]:
df.columns

Index(['2023 November peak', '2024 February peak', '2025 February avg',
       '2024 December avg', '2024 August peak', '2024 January peak',
       '2025 January peak', '2025 May peak', '2024 March avg',
       '2025 April peak', '2025 June peak', '2025 March peak',
       '2023 October avg', '2024 May peak', '2024 October avg',
       '2025 July peak', '2023 December peak', '2024 September peak',
       '2023 December avg', '2023 October peak', '2024 November peak',
       '2024 March peak', '2023 November avg', '2024 January avg',
       '2024 April avg', '2024 October peak', '2023 August peak',
       '2024 December peak', '2023 August avg', '2024 June avg',
       '2024 August avg', '2024 June peak', '2023 September peak',
       '2024 September avg', '2024 May avg', '2025 June avg', 'id',
       '2025 July avg', '2025 May avg', '2024 July avg', '2025 January avg',
       '2024 July peak', '2024 November avg', '2025 March avg',
       '2024 February avg', '2023 September avg', '202

In [7]:
df = df[['id','2023 November peak', '2024 February peak', '2025 February avg',
       '2024 December avg', '2024 August peak', '2024 January peak',
       '2025 January peak', '2025 May peak', '2024 March avg',
       '2025 April peak', '2025 June peak', '2025 March peak',
       '2023 October avg', '2024 May peak', '2024 October avg',
       '2025 July peak', '2023 December peak', '2024 September peak',
       '2023 December avg', '2023 October peak', '2024 November peak',
       '2024 March peak', '2023 November avg', '2024 January avg',
       '2024 April avg', '2024 October peak', '2023 August peak',
       '2024 December peak', '2023 August avg', '2024 June avg',
       '2024 August avg', '2024 June peak', '2023 September peak',
       '2024 September avg', '2024 May avg', '2025 June avg',
       '2025 July avg', '2025 May avg', '2024 July avg', '2025 January avg',
       '2024 July peak', '2024 November avg', '2025 March avg',
       '2024 February avg', '2023 September avg', '2025 February peak',
       '2025 April avg', '2024 April peak']]

In [13]:
df.columns

Index(['id', '2023 November peak', '2024 February peak', '2025 February avg',
       '2024 December avg', '2024 August peak', '2024 January peak',
       '2025 January peak', '2025 May peak', '2024 March avg',
       '2025 April peak', '2025 June peak', '2025 March peak',
       '2023 October avg', '2024 May peak', '2024 October avg',
       '2025 July peak', '2023 December peak', '2024 September peak',
       '2023 December avg', '2023 October peak', '2024 November peak',
       '2024 March peak', '2023 November avg', '2024 January avg',
       '2024 April avg', '2024 October peak', '2023 August peak',
       '2024 December peak', '2023 August avg', '2024 June avg',
       '2024 August avg', '2024 June peak', '2023 September peak',
       '2024 September avg', '2024 May avg', '2025 June avg', '2025 July avg',
       '2025 May avg', '2024 July avg', '2025 January avg', '2024 July peak',
       '2024 November avg', '2025 March avg', '2024 February avg',
       '2023 September avg', '202

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5561 entries, 0 to 5560
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5561 non-null   int64  
 1   2023 November peak   5561 non-null   float64
 2   2024 February peak   5561 non-null   float64
 3   2025 February avg    5561 non-null   float64
 4   2024 December avg    5561 non-null   float64
 5   2024 August peak     5561 non-null   float64
 6   2024 January peak    5561 non-null   float64
 7   2025 January peak    5561 non-null   float64
 8   2025 May peak        5561 non-null   float64
 9   2024 March avg       5561 non-null   float64
 10  2025 April peak      5561 non-null   float64
 11  2025 June peak       5561 non-null   float64
 12  2025 March peak      5561 non-null   float64
 13  2023 October avg     5561 non-null   float64
 14  2024 May peak        5561 non-null   float64
 15  2024 October avg     5561 non-null   f

In [17]:
df.to_csv('게임접속자수1.csv')