## 김포 제주 노선 가격 정보 읽어오기
- 대상 사이트 : 대한항공 국내선 예매 사이트

### 대한항공 사이트 웹 크롤링
    - 대한항공 국내선 항공권예매 페이지를 이용한 크롤링
    - 세션을 생성하여 세션을 이용해 RESTFull json API 페이지 호출 하여 정보 획득
    - 필요 데이터 추출 하여 Pandas 의 DataFrame 형태로 생성
    - 생성된 데이터 엑셀 파일로 저장

In [95]:
from IPython.display import display
import requests
import pandas as pd
import numpy as np
from pandas import DataFrame
from bs4 import BeautifulSoup
import time
from datetime import datetime
from datetime import timedelta
from common.crawling_util import session_crawling
from common.parsing_util import stat_fare

def crawling_KE_data(dpt,arr,dpt_date):
    ##출발지, 도착지, 출발일을 기준으로 국내선(국제선) 편도 가격 읽어오기
    print('Crawling koreanair homepage schedule site')
    session_url = "https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange"
    session_head = {
        'Referer':'https://kr.koreanair.com/korea/ko.html',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    
    ## Rest Full 방식 https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-25-2017-0000
    url = "https://www.koreanair.com/api/fly/revenue/from/{dpt}/to/{arr}/on/{mm}-{dd}-{yyyy}-0000"
    url_param = {
        'dpt':dpt,    'arr':arr,
        'yyyy':dpt_date[:4],
        'mm':dpt_date[4:6],
        'dd':dpt_date[6:]
    }
    url = url.format(**url_param)
    head = {
        'page-id':'/booking/dow.html', ## 필수 항목
        'uidd':'83^51%8638461@384712', ## 필수 항목
        'Referer':'https://www.koreanair.com/korea/ko/booking/dow.html',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    param ={## 파라미터는 고정, _ 부분만 조회 시점 타임스탬프 적용
        'flexDays':'2',
        'scheduleDriven':'false',
        'purchaseThirdPerson':'',
        'domestic':'true',
        'isUpgradeableCabin':'false',
        'adults':'1',    'children':'0',    'infants':'0',
        'cabinClass':'ECONOMY',
        'adultDiscounts':'',    'adultInboundDiscounts':'',
        'childDiscounts':'',    'childInboundDiscounts':'',
        'infantDiscounts':'',   'infantInboundDiscounts':'',
        '_':str(int(datetime.now().timestamp())),
    }

    return session_crawling(session_url,url,param,session_head=session_head,head=head,method='get',json=True)

## 해당 아웃바운드 스케줄의 가격에 대한 키 정보 읽어오기
def get_fare_key(fare_key,fare_class,raw_data):
    mapper = raw_data['tripFareMapper']
    fare_key_list = []
    for book_cls in fare_class:
        fare_key_list.extend(mapper[fare_key+'-'+book_cls[:-1]])
    return set(fare_key_list)
## 해당 아웃바운드 스케줄의 가격 정보 읽어 오기
def get_fares(fare_key,fare_class,raw_data):
    fare_keys = get_fare_key(fare_key,fare_class,raw_data)
    fare_list = []
    for key in fare_keys:
        fare = raw_data['fares'][key]['fares'][0]
        fare_list.append([fare['amount'],fare['fuelSurcharge'],fare['tax']])
    return sorted(fare_list)
## 국내선용 파싱 정보
data_heads = ['date','flt','dpt','arr','fare1','fare2','fare3','tax1','tax2','seat']
## raw 데이터로 부터 DataFrame 생성
def raw_to_df(raw_data):
    ## 읽어온 데이터의 json['Result']['code'] 값이 0000 이 아닌 경우 오류
    sch_list = []
    for schedule in raw_data['outbound']:
        flt = schedule['flights'][0]
        sch = [flt['departure'][:4]+flt['departure'][5:7]+flt['departure'][8:10],
               flt['flightNumber'],flt['departure'][11:16],flt['arrival'][11:16]]
        fares = get_fares(schedule['key'],schedule['remainingSeatsByBookingClass'],raw_data)
        sch.extend([str(int(fare[0])) for fare in fares[::-1]])
        sch.extend(['0' for _ in range(3-len(fares))])
        if len(fares) > 0:
            sch.extend([str(int(fare)) for fare in fares[0][1:]])
        else:
            sch.extend(['0','0'])
        ## ECONOMYY 클래스에 대한 좌석 수 추가 - 노멀 클래스
        sch.append(schedule['remainingSeatsByBookingClass'].get('ECONOMYY',0))
        sch_list.append(sch)
    return pd.DataFrame(sch_list,columns=data_heads)

## 하루 기준 가격정보, 텍스정보, 최소값, 최대값, 평균 DataFrame 생성
def read_KE_1day_fare(dpt,arr,dpt_date):
    ## 데이터 읽어오기
    raw_data = crawling_KE_data(dpt,arr,dpt_date)
    df=raw_to_df(raw_data)
    df = df[df['date'] == dpt_date].reset_index()[data_heads]
    if df is None or len(df) == 0: ## 읽는 도중 에러가 나거나 익셉션 발생으로 문제가 있을 경우 처리
        print('********** No Data **********')
        return None
    ## 최소값 최대값 평균 계산
    stat = stat_fare(df,columns=['fare1','fare2','fare3'])
    ## 요약정보 추가
    df.ix[len(df)] = [dpt_date,'min','max','mean',str(int(stat[0])),str(int(stat[1])),str(int(stat[2])),'','','']
    return df

## 정해진 기간의 데이터 읽어오기, 기본 30일
def read_KE_date_range_fare(dpt,arr,start=0,end=31):
    # 기본 30일간 데이터 읽어서 파일로 저장
    date_range = [ (datetime.today()+timedelta(1)*i).strftime('%Y%m%d') for i in range(start,end)]
    fare_list = []
    for d in date_range:
        try:
            fare_df = read_KE_1day_fare(dpt,arr,d)
            if fare_df is not None:
                fare_list.append(fare_df)
        except Exception as e:
            print('****** Error occured : ',e)
    result = pd.concat(fare_list,ignore_index=True)
    print('++++++++++Total : ', len(result))
    ## 파일 저장
    result.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','KE',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result

In [96]:
## 하루치 데이터 읽어오기
dpt, arr, dpt_date = 'GMP','CJU','20170615'
read_KE_1day_fare(dpt,arr,dpt_date)

Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/06-15-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494900374 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infantDiscounts: , adultInboundDiscounts: , childDiscounts: , infantInboundDiscounts: , infants:0 , scheduleDriven:false , children:0 , adultDiscounts: , 
End Session crawling


Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170615,KE1201,07:05,08:10,142000,82000,74000,2200.0,4000.0,9.0
1,20170615,KE1203,07:20,08:30,142000,82000,74000,2200.0,4000.0,9.0
2,20170615,KE1205,08:00,09:10,142000,82000,0,2200.0,4000.0,9.0
3,20170615,KE1209,09:00,10:10,142000,82000,0,2200.0,4000.0,9.0
4,20170615,KE1211,09:15,10:25,142000,82000,0,2200.0,4000.0,9.0
5,20170615,KE1215,10:00,11:10,0,0,0,0.0,0.0,0.0
6,20170615,KE1217,10:40,11:50,142000,82000,0,2200.0,4000.0,9.0
7,20170615,KE1219,11:10,12:20,142000,82000,0,2200.0,4000.0,9.0
8,20170615,KE1221,11:45,12:55,142000,82000,0,2200.0,4000.0,9.0
9,20170615,KE1223,12:20,13:30,142000,82000,66000,2200.0,4000.0,9.0


In [92]:
## 정해진 기간의 데이터 읽어오기
dpt, arr = 'GMP','CJU'
start,end =0, 31 ## 읽어올 데이터 기간
read_KE_date_range_fare(dpt,arr,start,end)

Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-16-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898269 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infantDiscounts: , adultInboundDiscounts: , childDiscounts: , infantInboundDiscounts: , infants:0 , scheduleDriven:false , children:0 , adultDiscounts: , 
End Session crawling
****** Error occured :  10 columns passed, passed data had 11 columns
Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/05-17-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898272 , cabinClass:ECONOMY , 

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170526,KE1201,07:05,08:10,167000,107000,0,2200,4000,1
1,20170526,KE1203,07:20,08:30,0,0,0,0,0,0
2,20170526,KE1205,08:00,09:10,107000,0,0,2200,4000,1
3,20170526,KE1207,08:30,09:40,0,0,0,0,0,0
4,20170526,KE1209,09:00,10:10,0,0,0,0,0,0
5,20170526,KE1211,09:15,10:25,0,0,0,0,0,0
6,20170526,KE1215,10:00,11:10,0,0,0,0,0,0
7,20170526,KE1217,10:40,11:50,167000,107000,0,2200,4000,1
8,20170526,KE1219,11:10,12:20,107000,0,0,2200,4000,1
9,20170526,KE1269,12:00,13:10,0,0,0,0,0,0


In [93]:
## 정해진 기간의 데이터 읽어오기
dpt, arr = 'GMP','CJU'
start,end =31, 45 ## 읽어올 데이터 기간
read_KE_date_range_fare(dpt,arr,start,end)

Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/06-16-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898564 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infantDiscounts: , adultInboundDiscounts: , childDiscounts: , infantInboundDiscounts: , infants:0 , scheduleDriven:false , children:0 , adultDiscounts: , 
End Session crawling
Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/06-17-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898567 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infa

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170616,KE1201,07:05,08:10,107000,0,0,2200,4000,9
1,20170616,KE1203,07:20,08:30,167000,107000,0,2200,4000,9
2,20170616,KE1205,08:00,09:10,167000,107000,0,2200,4000,9
3,20170616,KE1207,08:30,09:40,107000,0,0,2200,4000,9
4,20170616,KE1209,09:00,10:10,167000,0,0,2200,4000,0
5,20170616,KE1211,09:15,10:25,167000,107000,0,2200,4000,9
6,20170616,KE1215,10:00,11:10,0,0,0,0,0,0
7,20170616,KE1217,10:40,11:50,167000,0,0,2200,4000,0
8,20170616,KE1219,11:10,12:20,107000,0,0,2200,4000,9
9,20170616,KE1269,12:00,13:10,167000,107000,0,2200,4000,9


In [94]:
dpt, arr = 'GMP','CJU'
start,end =45, 90 ## 읽어올 데이터 기간
read_KE_date_range_fare(dpt,arr,start,end)

Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/06-30-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898647 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infantDiscounts: , adultInboundDiscounts: , childDiscounts: , infantInboundDiscounts: , infants:0 , scheduleDriven:false , children:0 , adultDiscounts: , 
End Session crawling
Crawling koreanair homepage schedule site
Start Session crawling
make session :  https://www.koreanair.com/korea/ko/booking/booking-gate.html#bookingChange
crawling :  https://www.koreanair.com/api/fly/revenue/from/GMP/to/CJU/on/07-01-2017-0000
>> Parameters
adults:1 , childInboundDiscounts: , flexDays:2 , _:1494898649 , cabinClass:ECONOMY , purchaseThirdPerson: , domestic:true , isUpgradeableCabin:false , infa

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170630,KE1201,07:05,08:10,107000,0,0,2200,4000,9
1,20170630,KE1203,07:20,08:30,107000,0,0,2200,4000,9
2,20170630,KE1205,08:00,09:10,167000,107000,0,2200,4000,9
3,20170630,KE1207,08:30,09:40,167000,107000,0,2200,4000,9
4,20170630,KE1209,09:00,10:10,167000,107000,0,2200,4000,9
5,20170630,KE1211,09:15,10:25,167000,107000,0,2200,4000,9
6,20170630,KE1215,10:00,11:10,167000,107000,0,2200,4000,9
7,20170630,KE1217,10:40,11:50,167000,107000,0,2200,4000,9
8,20170630,KE1219,11:10,12:20,167000,107000,0,2200,4000,9
9,20170630,KE1269,12:00,13:10,167000,107000,0,2200,4000,9
