# price 전처리
- 숫자 추출
- 외국 화폐의 경우, 원화(KRW) 기준으로 처리
- 그 외 전처리

In [66]:
import pandas as pd

In [67]:
# Load the hot-deal listings CSV export into a DataFrame.
data = pd.read_csv("./hotdeal-info_utf-8-encoded_2024-05-13 18_50_00.322657.csv")

In [68]:
# Year filter: keep only listings created in 2021 or later.
data["year"] = pd.to_datetime(data["created_at"]).dt.year
# .copy() detaches the filtered frame from the original one, so the column
# assignments made in later cells operate on an independent DataFrame instead
# of a slice (which is what makes pandas emit SettingWithCopyWarning).
data = data[data['year'] >= 2021].copy()

In [69]:
# Preview the first rows of the filtered data.
data.head()

Unnamed: 0,title,created_at,price,views,votes,year
0,[지마켓] SF-1200F14XP LEADEX VII PLATINUM PRO ATX...,2024-05-13,"￦276,680(KRW)",216,0,2024
1,"[오늘의집] 삼성 브랜드위크 S32CG550 외 7건 (273,773/무배)",2024-05-13,"￦273,773(KRW)",521,0,2024
2,[위메프] ASUS ROG STRIX B650-A GAMING WIFI 대원씨티에스...,2024-05-13,"￦299,700(KRW)",1600,0,2024
3,[알리] XrayDisk M.2 NVMe 512GB Pro ($13.20),2024-05-13,$13.2(USD),2700,6,2024
4,[지마켓] LIAN LI UNI FAN SL-INF 120 RGB 화이트 3팩,2024-05-13,"￦110,500(KRW)",1500,2,2024


In [70]:
import re

### 숫자, 화폐 단위 추출

In [71]:
# Extract the currency unit of a listing.
def identify_currency(row):
    """Return the currency code inferred from the raw ``row["price"]`` text.

    Args:
        row: A mapping (e.g. a DataFrame row) with a ``"price"`` entry such as
            ``"￦276,680(KRW)"`` or ``"$13.2(USD)"``.

    Returns:
        One of ``'KRW'``, ``'USD'``, ``'EUR'``, ``'JPY'``, ``'CNY'``, ``'GBP'``.
        ``'KRW'`` is the fallback when nothing is recognised or when the price
        is not a string (e.g. a missing value).
    """
    price = row["price"]
    # Missing prices are not strings; fall back to the default currency
    # instead of failing on ``.startswith``.
    if not isinstance(price, str):
        return 'KRW'

    if price.startswith('￦') or '원' in price:
        return 'KRW'
    elif price.startswith('$'):
        return 'USD'
    elif price.startswith('€'):
        return 'EUR'
    elif 'JPY' in price:
        # The explicit code is tested before the '¥' symbol because that
        # symbol is shared by the yen and the yuan; otherwise "¥...(JPY)"
        # would be labelled CNY.
        return 'JPY'
    elif price.startswith('¥') or 'CNY' in price:
        return 'CNY'
    elif price.startswith('£'):
        return 'GBP'
    else:
        return 'KRW'  # default is KRW
    
# Run identify_currency on every row (axis=1) and store the result in a new column.
data["currency"] = data.apply(identify_currency, axis=1)

In [72]:
# Extract only the numeric amount from the price text.
def clean_price(row):
    """Return the digits-and-dots part of ``row["price"]`` as a string.

    Every character other than ``0-9`` and ``.`` is removed, so
    ``"￦276,680(KRW)"`` becomes ``"276680"`` and ``"$13.2(USD)"`` becomes
    ``"13.2"``.

    Args:
        row: A mapping (e.g. a DataFrame row) with a ``"price"`` entry.

    Returns:
        The cleaned string, or ``None`` when the price is not a string or
        contains no digit at all (empty, ``"."``, ``".."``, ...).
    """
    price = row["price"]
    # Missing prices are not strings and cannot be passed to re.sub.
    if not isinstance(price, str):
        return None

    cleaned_price = re.sub(r'[^0-9.]', '', price)
    # Only digits and dots remain, so a value that is empty once the dots are
    # stripped holds no number; returning it would break float conversion.
    if not cleaned_price.strip('.'):
        return None
    return cleaned_price
    
# Run clean_price on every row (axis=1) and store the result in a new column.
data["numeric_price"] = data.apply(clean_price, axis=1)

확인하기

In [75]:
# Show the raw price next to the two derived columns for a visual check.
data[["price", "currency", "numeric_price"]]

Unnamed: 0,price,currency,numeric_price
0,"￦276,680(KRW)",KRW,276680
1,"￦273,773(KRW)",KRW,273773
2,"￦299,700(KRW)",KRW,299700
3,$13.2(USD),USD,13.2
4,"￦110,500(KRW)",KRW,110500
...,...,...,...
27555,"￦95,475(KRW)",KRW,95475
27556,"￦64,600(KRW)",KRW,64600
27557,"￦657,390(KRW)",KRW,657390
27558,"￦371,061(KRW)",KRW,371061


### 추가 전처리

1. `,` 대신 `.`을 쓴 경우

In [76]:
def update_numeric_price(numeric_price):
    """Remove dots that were used as thousands separators instead of commas.

    A value that starts with at least three digit groups joined by dots
    (e.g. ``"1.234.567"``) cannot be a decimal number, so every non-digit
    character is dropped from it. Any other value is returned unchanged.

    Args:
        numeric_price: The cleaned price string, or a missing value.

    Returns:
        The corrected string, or the input as-is when it does not match the
        pattern or is not a string (``None``/NaN).
    """
    # Missing values cannot be pattern-matched; pass them through untouched
    # rather than raising TypeError.
    if not isinstance(numeric_price, str):
        return numeric_price

    # Update only values matching the dot-separated number pattern.
    if re.match(r'\d+\.\d+\.\d+', numeric_price):
        return re.sub('[^0-9]', '', numeric_price)
    return numeric_price

# Drop null values before the pattern-based fix-up (they were removed because
# of matching problems).
# NOTE(review): dropna() is called without `subset`, so a row with a null in
# ANY column is removed, not only rows lacking numeric_price - confirm intended.
data.dropna(inplace=True)
data["numeric_price"] = data["numeric_price"].apply(update_numeric_price)

2. USD인데 KRW라고 올린 경우

In [80]:
# Listings that were posted as KRW although their amount is in USD:
# relabel all of them in one pass using a single title mask.
data.loc[
    data["title"].isin([
        '[뉴에그] WD BLACK SN850 1TB ($174.99/미국내무료)',
        '[Amazon] SK hynix Gold P31 2TB ($195.99/?)',
        '(아마존/뉴에그)WD_Black 1TB SN850 NVMe 최저가 관세내',
        '[AKG.com] n5005 199불!',
    ]),
    "currency",
] = "USD"

3. KRW가 맞는데 오타가 난 경우

In [81]:
# Listings whose currency (KRW) is right but whose price contains a typo:
# overwrite numeric_price with the hand-corrected amount, matched by title.
for _listing_title, _fixed_price in (
    ('[옥션] 삼성 ssd 980 pro 1TB MZ-V8P1T0BW(122,960/무료)', '122960'),
    ('[알리]삼성980 pro 1tb', '78080'),
):
    data.loc[data["title"] == _listing_title, "numeric_price"] = _fixed_price

4. 맨 끝에 `.`이 있는 경우 (81.99.)

In [82]:
# Remove trailing dots (e.g. "81.99." -> "81.99") from the price strings.
data['numeric_price'] = data['numeric_price'].str.rstrip('.')

price 실수형으로 변경

In [83]:
# Convert the cleaned price strings to floating-point numbers.
data['numeric_price'] = data['numeric_price'].astype('float')

### 환율 계산

In [84]:
# List the distinct currency codes present in the data.
data["currency"].unique()

array(['KRW', 'USD', 'EUR', 'CNY', 'GBP'], dtype=object)

In [91]:
def calculate_exchange_rate(row, rates=None):
    """Convert a row's numeric price into KRW.

    Args:
        row: A mapping (e.g. a DataFrame row) with ``"numeric_price"`` and
            ``"currency"`` entries.
        rates: Optional mapping of currency code to the KRW amount for one
            unit of that currency. When omitted, the built-in fixed rates
            for GBP, USD, CNY and EUR are used.

    Returns:
        The price multiplied by the currency's rate. A price whose currency
        has no entry in ``rates`` (KRW in particular) is returned unchanged.
    """
    if rates is None:
        # Fixed snapshot rates (KRW per unit of foreign currency).
        # NOTE(review): identify_currency can return 'JPY', but no JPY rate
        # is defined here, so such prices pass through unconverted unless a
        # rate is supplied via `rates`.
        rates = {
            "GBP": 1719.29,
            "USD": 1365.00,
            "CNY": 189.00,
            "EUR": 1477.48,
        }

    price = row["numeric_price"]
    rate = rates.get(row["currency"])
    if rate is None:
        return price
    return price * rate

In [92]:
# Overwrite the raw price text column with the amount computed by
# calculate_exchange_rate for each row (axis=1).
data["price"] = data.apply(calculate_exchange_rate, axis=1)

In [93]:
# Check: inspect the rows whose currency is USD.
data[data["currency"]=="USD"]

Unnamed: 0,title,created_at,price,views,votes,year,currency,numeric_price
3,[알리] XrayDisk M.2 NVMe 512GB Pro ($13.20),2024-05-13,18018.00,2700,6,2024,USD,13.20
37,[Amazon]wavlink 썬더볼트 독($49/무료),2024-05-12,66885.00,6800,8,2024,USD,49.00
39,[뉴에그] MSI 4060ti (+패드) 몬스터헌터 콜라보 에디션 (백오더),2024-05-12,682486.35,13100,2,2024,USD,499.99
42,[알리] Rocoren 60W 0.5M 0.39달러 (외 100W/240W 및 다양...,2024-05-12,532.35,6300,2,2024,USD,0.39
43,[알리]XrayDisk M.2 SSD 1TB,2024-05-12,50136.45,10100,6,2024,USD,36.73
...,...,...,...,...,...,...,...,...
27472,[아마존]Seagate Expansion Desktop 10TB 189.99,2021-01-06,259336.35,3500,3,2021,USD,189.99
27502,"[newegg,amazon] seagate expansion 12tb 적출용하드",2021-01-05,271635.00,5900,9,2021,USD,199.00
27508,[아마존]razer nari wireless 헤드셋 (89.99달러/40%세일),2021-01-04,122836.35,3400,1,2021,USD,89.99
27548,[뉴에그] MX500 500GB 48.99$,2021-01-02,66871.35,3600,5,2021,USD,48.99


필요한 컬럼만 남겨 저장

In [95]:
# Keep only the columns to be saved; the helper columns (year, currency,
# numeric_price) are left out.
res_data = data[["title", "created_at", "price", "views", "votes"]]

In [96]:
# Display the final frame before saving.
res_data

Unnamed: 0,title,created_at,price,views,votes
0,[지마켓] SF-1200F14XP LEADEX VII PLATINUM PRO ATX...,2024-05-13,276680.0,216,0
1,"[오늘의집] 삼성 브랜드위크 S32CG550 외 7건 (273,773/무배)",2024-05-13,273773.0,521,0
2,[위메프] ASUS ROG STRIX B650-A GAMING WIFI 대원씨티에스...,2024-05-13,299700.0,1600,0
3,[알리] XrayDisk M.2 NVMe 512GB Pro ($13.20),2024-05-13,18018.0,2700,6
4,[지마켓] LIAN LI UNI FAN SL-INF 120 RGB 화이트 3팩,2024-05-13,110500.0,1500,2
...,...,...,...,...,...
27555,"[SSG] BRAVOTEC 1140M 타이탄 글래스 케이스 / 95,475원",2021-01-01,95475.0,7600,8
27556,[옥션] 샌디스크 ULTRA M.2 NVMe 500GB (64600/0),2021-01-01,64600.0,5400,8
27557,[11번가] 시놀로지 DS920+ 할인 NAS 스토리지 4베이 +정품+,2021-01-01,657390.0,7900,5
27558,[인터파크] AMD 5600X (멀티팩) 국민카드 청구할인,2021-01-01,371061.0,10700,7


In [97]:
# Write the result to CSV as UTF-8, without the index column.
res_data.to_csv("hotdeal-info_price.csv", encoding="UTF-8", index=False)