# Web Crawling
 : 웹페이지 데이터 수집

## 크롤링 방법
 - requests package
    - json 문자열 파싱 (동적 크롤링에 주로 사용)
    - html 문자열 파싱 (정적 크롤링에 주로 사용)

In [47]:
import requests
import pandas as pd

##### 1. 실습 ( 웹서비스 분석 : 크롬 개발자 도구 url )
 - 코스피 url : https://m.stock.naver.com/api/index/KOSPI/price?pageSize=10&page=3

In [None]:
# Request page 3 of the KOSPI price API with a page size of 10.
url = 'https://m.stock.naver.com/api/index/KOSPI/price?pageSize=10&page=3'
response = requests.get(url)

# Printing the Response object shows the HTTP status of the request.
print(response) # <Response [200]> : response status

In [None]:
# The body of the response is available as a string through response.text.
print(type(response.text)) # string
print()
print(response.text)
print()
# Only the first 200 characters of the body.
print(response.text[:200])

In [None]:
# Parse the JSON body of the response into Python objects (stored in data).
data = response.json()
print(type(data)) # list
print()
print(data)

In [None]:
# Build a DataFrame from the parsed data and keep only the
# 'localTradedAt' and 'closePrice' columns.
df = pd.DataFrame(data)[['localTradedAt', 'closePrice']]
print(df.dtypes)
print()
print(df)

In [None]:
# Define a reusable function for the stock index request
def stock_crawling(pagesize, page, code='KOSPI', timeout=10):
    """Fetch one page of prices for a Naver stock index.

    Args:
        pagesize (int): value sent as the ``pageSize`` query parameter.
        page (int): value sent as the ``page`` query parameter.
        code (str, optional): index code placed in the URL path,
            e.g. 'KOSPI' or 'KOSDAQ'. Defaults to 'KOSPI'.
        timeout (float, optional): seconds to wait for the server before
            giving up, so a stalled connection cannot hang the notebook.
            Defaults to 10.

    Returns:
        DataFrame: the 'localTradedAt' and 'closePrice' columns of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = f'https://m.stock.naver.com/api/index/{code}/price'
    # Let requests build (and URL-encode) the query string.
    query = {'pageSize': pagesize, 'page': page}
    response = requests.get(url, params=query, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    return pd.DataFrame(response.json())[['localTradedAt', 'closePrice']]


 - 코스닥 url : https://m.stock.naver.com/api/index/KOSDAQ/price?pageSize=10&page=2

In [None]:
# Same request/parse steps written out by hand for KOSDAQ (page 2, page size 10).
url = 'https://m.stock.naver.com/api/index/KOSDAQ/price?pageSize=10&page=2'
response = requests.get(url)
data = response.json()
df = pd.DataFrame(data)[["localTradedAt", "closePrice"]]

print(df)

# The helper function requests the same URL in a single call.
print(stock_crawling(10,2,'KOSDAQ'))

 - 환율 ( 달러 ) url : https://api.stock.naver.com/marketindex/exchange/FX_USDKRW/prices?page=3&pageSize=10

In [None]:
# Request page 3 of the USD/KRW exchange price API with a page size of 10,
# then keep only the 'localTradedAt' and 'closePrice' columns.
url = 'https://api.stock.naver.com/marketindex/exchange/FX_USDKRW/prices?page=3&pageSize=10'
response = requests.get(url)
data = response.json()
df = pd.DataFrame(data)[["localTradedAt", "closePrice"]]

print(df)

def exchange_crawling(pagesize, page, code='FX_USDKRW', timeout=10):
    """Fetch one page of prices for a Naver exchange-rate series.

    Args:
        pagesize (int): value sent as the ``pageSize`` query parameter.
        page (int): value sent as the ``page`` query parameter.
        code (str, optional): exchange code placed in the URL path, e.g.
            'FX_USDKRW', 'FX_EURKRW', 'FX_JPYKRW' or 'FX_CNYKRW'.
            Defaults to 'FX_USDKRW'.
        timeout (float, optional): seconds to wait for the server before
            giving up, so a stalled connection cannot hang the notebook.
            Defaults to 10.

    Returns:
        DataFrame: the 'localTradedAt' and 'closePrice' columns of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = f'https://api.stock.naver.com/marketindex/exchange/{code}/prices'
    # Let requests build (and URL-encode) the query string.
    query = {'page': page, 'pageSize': pagesize}
    response = requests.get(url, params=query, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    return pd.DataFrame(response.json())[['localTradedAt', 'closePrice']]

# Page 1 with a page size of 10, using the default code 'FX_USDKRW'.
print(exchange_crawling(10,1))

### 데이터 분석 : 상관관계
 - 피어슨 상관계수
    - 1과 가까울수록 강한 양의 상관관계를 갖는다.
    - -1과 가까울수록 강한 음의 상관관계를 갖는다.
    - 0과 가까울수록 관계가 없다.
    - pandas 함수 df.corr()

ex) 달러 환율과 주가 지수의 관계

In [None]:
# First page (60 rows) of each series.
kospi = stock_crawling(60, 1)
kosdaq = stock_crawling(60, 1, 'KOSDAQ')
usd = exchange_crawling(60, 1)

# Put the three closing-price columns side by side. Rows are matched by index
# position, which presumes the three responses list the same dates in the
# same order -- TODO confirm.
df = kospi.rename(columns={'closePrice': 'kospi'})
df['kosdaq'] = kosdaq["closePrice"]
df['usd'] = usd["closePrice"]

print(df.dtypes)
print()

# Convert the price columns from comma-formatted strings to float.
price_columns = ['kospi', 'kosdaq', 'usd']
for column in price_columns:
    df[column] = df[column].map(lambda text: float(text.replace(',', '')))
print(df.dtypes)
print()

# Pairwise correlation between the three price series.
print(df[price_columns].corr())



In [None]:
# Series.apply(func): call func on every value of a column
df = pd.DataFrame({"age": [23, 36, 27]})


def change_ages(age):
    """Round an age down to the start of its decade (e.g. 23 -> 20)."""
    decade, _ = divmod(age, 10)
    return decade * 10


# Add the age-group column
df["ages"] = df["age"].apply(change_ages)
print(df, end="\n\n")

In [None]:
### lambda : a single-use (anonymous) function
###     - why use it : saves memory, suits simple functions
###       (take the parameters and return right away)

# Using three named functions (three objects kept in memory)
def plus(n1, n2):
    """Return n1 + n2."""
    total = n1 + n2
    return total


def minus(n1, n2):
    """Return n1 - n2."""
    difference = n1 - n2
    return difference


def calc(func, n1, n2):
    """Call the two-argument callable func with n1 and n2 and return its result."""
    result = func(n1, n2)
    return result


print('def : plus, minus, calc')
for operation in (plus, minus):
    print(calc(operation, 1, 2))
print()

# lambda parameters : return expression
print('lambda : calc')
print(calc(lambda n1, n2: n1 + n2, 1, 2))
print(calc(lambda n1, n2: n1 - n2, 1, 2))


### API 데이터 수집
 - Application Programming Interface
 - API를 사용한 데이터 수집은 서비스가 데이터를 제공하는 공식적인 방법으로 데이터를 수집하는 것이다.

ex) naver api( 파파고 )

In [67]:
import json
import os

import pandas as pd
import requests

# 1. Register an app at https://developers.naver.com to get the API credentials.
# SECURITY: credentials are read from the environment so they are never stored
# in the notebook. Any key pair that has been committed in plain text should be
# revoked and reissued in the Naver developer console.
CLIENT_ID = os.environ["NAVER_CLIENT_ID"]
CLIENT_SECRET = os.environ["NAVER_CLIENT_SECRET"]

# 2. Check the Naver API document for the endpoint URL.
# https://openapi.naver.com/v1/papago/n2mt
url = 'https://openapi.naver.com/v1/papago/n2mt'
txt = '안녕하세요. 반갑습니다. 감사합니다.'
params = {
    'source': 'ko',
    'target': 'en',
    'text': txt,
}
headers = {
    "Content-Type": "application/json",
    "X-Naver-Client-Id": CLIENT_ID,
    "X-Naver-Client-Secret": CLIENT_SECRET,
}

# 3. Send the request: the parameters travel in the body as a JSON string and
# the credentials in the headers. The timeout keeps a stalled connection from
# hanging the notebook.
ppg_response = requests.post(url, json.dumps(params), headers=headers, timeout=10)
print(ppg_response)
print()

# 4. Parse the JSON response and pull out the translated text.
txt_en = ppg_response.json()["message"]["result"]["translatedText"]
print(txt_en)

<Response [200]>

Hello. Nice to meet you. Thank you.


- selenium

In [68]:
# Papago translation function
def ppg_translate(id, pw, txt, fromNa='ko', toNa='en', timeout=10):
    """Translate text with the Naver Papago API.

    Args:
        id (str): Naver API client id (sent as ``X-Naver-Client-Id``).
        pw (str): Naver API client secret (sent as ``X-Naver-Client-Secret``).
        txt (str): text to translate.
        fromNa (str, optional): source language code. Defaults to 'ko'.
        toNa (str, optional): target language code. Defaults to 'en'.
        timeout (float, optional): seconds to wait for the server before
            giving up. Defaults to 10.

    Returns:
        str: the ``translatedText`` field of the API response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # NOTE: the parameter name `id` shadows the builtin; it is kept because it
    # is part of the function's public signature.
    payload = {
        'source': fromNa,
        'target': toNa,
        'text': txt,
    }
    headers = {
        "Content-Type": "application/json",
        "X-Naver-Client-Id": id,
        "X-Naver-Client-Secret": pw,
    }
    # json= serializes the payload into the request body as JSON.
    response = requests.post(
        'https://openapi.naver.com/v1/papago/n2mt',
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    # Surface authentication/quota errors as an HTTP error rather than a KeyError below.
    response.raise_for_status()

    return response.json()["message"]["result"]["translatedText"]

# Translate a short phrase using the default direction (ko -> en).
print(ppg_translate(CLIENT_ID, CLIENT_SECRET, '크롤링 공부'))

Crawling study


In [72]:
### Translation helper for the covid data
def ppg_translate_txt(txt):
    """Translate Korean text to English with the Naver Papago API.

    Single-argument wrapper around ``ppg_translate`` so it can be handed
    directly to pandas ``apply``/``map``. It uses the module-level
    ``CLIENT_ID`` / ``CLIENT_SECRET`` and the fixed ko -> en direction.

    Args:
        txt (str): text to translate.

    Returns:
        str: the translated text.
    """
    return ppg_translate(CLIENT_ID, CLIENT_SECRET, txt, fromNa='ko', toNa='en')

# Reading .xlsx files requires the optional 'openpyxl' package
# (pip install openpyxl).
covid = pd.read_excel('../excel/covid.xlsx')[['category', 'title']]
# DataFrame.apply hands each whole column (a Series) to the function, so the
# translator is mapped over the individual cells of every column instead.
# Missing cells are left untouched rather than sent to the API.
covid = covid.apply(
    lambda column: column.map(ppg_translate_txt, na_action='ignore')
)
covid

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.