# WRDS CRSP 데이터 수집
논문 "Re-Imaging Price Trends" 구현용 데이터 생성

In [1]:
!pip install wrds



In [2]:
import wrds
import pandas as pd
import numpy as np

## 1. WRDS 연결

In [4]:
print("WRDS 연결 중...")
db = wrds.Connection()
print("연결 성공!")

WRDS 연결 중...
Enter your WRDS username [root]:nsj020816
Enter your password:··········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
연결 성공!


## 2. CRSP 데이터 쿼리

In [5]:
query = """
    SELECT
        a.permno AS code,
        a.date,
        a.prc AS close,
        a.vol AS volume,
        a.ret,          -- 수익률
        a.openprc AS open,
        a.askhi AS high,
        a.bidlo AS low,
        b.ticker
    FROM
        crsp.dsf AS a
    LEFT JOIN
        crsp.dsenames AS b
        ON a.permno = b.permno AND b.namedt <= a.date AND a.date <= b.nameendt
    WHERE
        a.date BETWEEN '1993-01-01' AND '2019-12-31'
        AND b.shrcd IN (10, 11)
        AND b.exchcd IN (1, 2, 3)
"""

print("데이터 다운로드 중... (시간 소요)")
df = db.raw_sql(query, date_cols=['date'])
db.close()

print(f"다운로드 완료: {len(df):,}개 레코드")
print(f"종목 수: {df['code'].nunique():,}")
print(f"기간: {df['date'].min()} ~ {df['date'].max()}")

데이터 다운로드 중... (시간 소요)
다운로드 완료: 34,267,180개 레코드
종목 수: 15,642
기간: 1993-01-04 00:00:00 ~ 2019-12-31 00:00:00


2.데이터 확인


In [6]:
df.head()

Unnamed: 0,code,date,close,volume,ret,open,high,low,ticker
0,10026,2019-07-09,162.92999,74826.0,-0.002388,163.22,164.0,162.64999,JJSF
1,10026,2019-07-10,163.64999,62022.0,0.004419,163.14,164.73,162.22,JJSF
2,10026,2019-07-11,163.24001,42782.0,-0.002505,164.10001,164.28999,162.42999,JJSF
3,10026,2019-07-12,163.89,60461.0,0.003982,163.63,164.63,162.0197,JJSF
4,10026,2019-07-15,162.95,73890.0,-0.005736,163.83,164.63,162.63,JJSF


In [7]:
print("\n데이터 통계:")
print(df.describe())

print("\n결측치 확인:")
print(df.isnull().sum())

print("\n데이터 타입 확인:")
print(df.info())


데이터 통계:
               code                           date        close  \
count    34267180.0                       34267180   34254100.0   
mean   64654.149036  2004-08-01 01:22:32.269596800    44.303095   
min         10001.0            1993-01-04 00:00:00     -18800.0   
25%         44813.0            1997-12-17 00:00:00       4.2599   
50%         78064.0            2003-05-09 00:00:00        12.92   
75%         85271.0            2010-10-13 00:00:00       27.125   
max         93436.0            2019-12-31 00:00:00     340380.0   
std     27757.81781                            NaN  2018.918731   

               volume         ret        open         high           low  
count      34254155.0  34243846.0  32417427.0   34254100.0    34254100.0  
mean    662043.377724    0.000881   47.346165    46.120687     45.101454  
min               0.0   -0.971698      0.0001       0.0014        0.0001  
25%            8800.0   -0.014975        5.35         5.31           5.0  
50%         

In [8]:
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=['code', 'date'])

In [9]:
df_no_open = df[df['open'].isnull()]
negative_close_no_open = df_no_open[df_no_open['close'] < 0]

if not negative_close_no_open.empty:
    print(f"'open' 값이 없는 {len(negative_close_no_open):,}개의 레코드에서 'close' 값이 음수입니다.")
    print("예시 레코드:")
    print(negative_close_no_open.head())
else:
    print("'open' 값이 없는 레코드 중 'close' 값이 음수인 경우는 없습니다.")


'open' 값이 없는 1,787,742개의 레코드에서 'close' 값이 음수입니다.
예시 레코드:
      code       date  close  volume       ret  open   high    low ticker
125  10001 1993-01-05 -14.25     0.0 -0.017241  <NA>   14.5   14.0   GFGC
126  10001 1993-01-06 -14.25     0.0       0.0  <NA>   14.5   14.0   GFGC
130  10001 1993-01-12 -14.25     0.0 -0.017241  <NA>   14.5   14.0   GFGC
133  10001 1993-01-15  -14.0     0.0  0.018182  <NA>  14.25  13.75   GFGC
134  10001 1993-01-18  -14.0     0.0       0.0  <NA>  14.25  13.75   GFGC


## 4. 기본 전처리

In [10]:
# CRSP prc는 음수일 수 있음 (bid/ask 평균 표시)
df['close'] = df['close'].abs()

## 5. 미래 수익률 계산

In [11]:
print("미래 수익률 계산 중")

# 수익률 컬럼 초기화
df['ret5'] = np.nan
df['ret20'] = np.nan
df['ret60'] = np.nan

# 종목별 처리
df['ret5'] = df.groupby('code')['close'].pct_change(5).shift(-5) * 100
df['ret20'] = df.groupby('code')['close'].pct_change(20).shift(-20) * 100
df['ret60'] = df.groupby('code')['close'].pct_change(60).shift(-60) * 100

#이진 분류

df['label_5'] = np.nan
df['label_20'] = np.nan
df['label_60'] = np.nan

not_na_mask5 = df['ret5'].notna()
not_na_mask20 = df['ret20'].notna()
not_na_mas60 = df['ret60'].notna()

df.loc[not_na_mask5, 'label_5'] = np.where(df.loc[not_na_mask5, 'ret5'] > 0, 1, 0)
df.loc[not_na_mask20, 'label_20'] = np.where(df.loc[not_na_mask20, 'ret5'] > 0, 1, 0)
df.loc[not_na_mas60, 'label_60'] = np.where(df.loc[not_na_mas60, 'ret5'] > 0, 1, 0)


print(df[['code', 'date', 'close', 'ret5', 'ret20', 'ret60', 'label_5', 'label_20', 'label_60']].head(10))

미래 수익률 계산 중


  df['ret5'] = df.groupby('code')['close'].pct_change(5).shift(-5) * 100
  df['ret20'] = df.groupby('code')['close'].pct_change(20).shift(-20) * 100
  df['ret60'] = df.groupby('code')['close'].pct_change(60).shift(-60) * 100


      code       date  close      ret5     ret20     ret60  label_5  label_20  \
124  10001 1993-01-04   14.5       0.0 -5.172414 -2.586207      0.0       0.0   
125  10001 1993-01-05  14.25       0.0 -3.508772       0.0      0.0       0.0   
126  10001 1993-01-06  14.25 -3.508772 -1.754386 -0.877193      0.0       0.0   
127  10001 1993-01-07   14.5 -5.172414 -3.448276 -2.586207      0.0       0.0   
128  10001 1993-01-08   14.5 -3.448276 -3.448276       0.0      0.0       0.0   
129  10001 1993-01-11   14.5 -3.448276 -5.172414 -2.586207      0.0       0.0   
130  10001 1993-01-12  14.25 -3.508772       0.0 -0.877193      0.0       0.0   
131  10001 1993-01-13  13.75  3.636364       0.0       0.0      1.0       1.0   
132  10001 1993-01-14  13.75  3.636364  1.818182  2.727273      1.0       1.0   
133  10001 1993-01-15   14.0 -1.785714  1.785714  3.571429      0.0       0.0   

     label_60  
124       0.0  
125       0.0  
126       0.0  
127       0.0  
128       0.0  
129       0.

## 6. 최종 데이터 확인

In [12]:
print("최종 데이터 현황:")
print(f"전체 레코드: {len(df):,}")
print(f"종목 수: {df['code'].nunique():,}")
print(f"기간: {df['date'].min()} ~ {df['date'].max()}")
print(f"미래수익률 유효 레코드: {df['ret5'].notna().sum():,}")

# 샘플 확인
print("\n최종 데이터 샘플:")
df.head(10)

최종 데이터 현황:
전체 레코드: 34,267,180
종목 수: 15,642
기간: 1993-01-04 00:00:00 ~ 2019-12-31 00:00:00
미래수익률 유효 레코드: 34,179,908

최종 데이터 샘플:


Unnamed: 0,code,date,close,volume,ret,open,high,low,ticker,ret5,ret20,ret60,label_5,label_20,label_60
124,10001,1993-01-04,14.5,150.0,0.035714,14.5,14.5,14.5,GFGC,0.0,-5.172414,-2.586207,0.0,0.0,0.0
125,10001,1993-01-05,14.25,0.0,-0.017241,,14.5,14.0,GFGC,0.0,-3.508772,0.0,0.0,0.0,0.0
126,10001,1993-01-06,14.25,0.0,0.0,,14.5,14.0,GFGC,-3.508772,-1.754386,-0.877193,0.0,0.0,0.0
127,10001,1993-01-07,14.5,228.0,0.017544,14.5,14.5,14.5,GFGC,-5.172414,-3.448276,-2.586207,0.0,0.0,0.0
128,10001,1993-01-08,14.5,1375.0,0.0,14.25,14.5,14.25,GFGC,-3.448276,-3.448276,0.0,0.0,0.0,0.0
129,10001,1993-01-11,14.5,200.0,0.0,14.5,14.5,14.5,GFGC,-3.448276,-5.172414,-2.586207,0.0,0.0,0.0
130,10001,1993-01-12,14.25,0.0,-0.017241,,14.5,14.0,GFGC,-3.508772,0.0,-0.877193,0.0,0.0,0.0
131,10001,1993-01-13,13.75,700.0,-0.035088,14.0,14.0,13.75,GFGC,3.636364,0.0,0.0,1.0,1.0,1.0
132,10001,1993-01-14,13.75,400.0,0.0,13.75,13.75,13.75,GFGC,3.636364,1.818182,2.727273,1.0,1.0,1.0
133,10001,1993-01-15,14.0,0.0,0.018182,,14.25,13.75,GFGC,-1.785714,1.785714,3.571429,0.0,0.0,0.0


## 7. 저장

In [13]:
TRAIN_VAL_FILENAME = 'data_1993_2000_train_val.parquet'
TEST_FILENAME = 'data_2001_2019_test.parquet'


# --- 훈련/검증 데이터 (1993-2000) ---
print("훈련/검증 데이터 (1993-2000) 분할 및 저장 중...")
train_val_df = df[df['date'].dt.year <= 2000].copy()
train_val_df.to_parquet(TRAIN_VAL_FILENAME, index=False)
print(f"저장 완료: {TRAIN_VAL_FILENAME} ({len(train_val_df):,} 레코드)")

# --- 테스트 데이터 (2001-2019) ---
print("테스트 데이터 (2001-2019) 분할 및 저장 중...")
test_df = df[df['date'].dt.year > 2000].copy()
test_df.to_parquet(TEST_FILENAME, index=False)
print(f"저장 완료: {TEST_FILENAME} ({len(test_df):,} 레코드)")

훈련/검증 데이터 (1993-2000) 분할 및 저장 중...
저장 완료: data_1993_2000_train_val.parquet (13,832,356 레코드)
테스트 데이터 (2001-2019) 분할 및 저장 중...
저장 완료: data_2001_2019_test.parquet (20,434,824 레코드)
