## 김포 제주 노선 가격 정보 읽어오기
- 대상 사이트 : 제주항공 웹사이트, 인터파크 국내선 실시간 예매사이트

### 제주항공 사이트 웹 크롤링
- 제주항공 항공권 예매 페이지를 이용한 크롤링
- 세션을 생성한 뒤 해당 세션으로 JSON API 페이지를 호출하여 정보 획득
- 필요한 데이터를 추출하여 Pandas의 DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [28]:
from IPython.display import display
import requests
import pandas as pd
import numpy as np
from pandas import DataFrame
from bs4 import BeautifulSoup
import time
from datetime import datetime
from datetime import timedelta
from common.crawling_util import simple_crawling, session_crawling
from common.parsing_util import parsing_json_data_to_dict

dom_int='D' ## Route type sent to the site as 'RouteType': domestic (D) or international (I)
def crawling_7C_data(dpt,arr,dpt_date):
    """Fetch the one-way availability/fare response for a route and date.

    The booking page (``availInit.do``) is used to open a session, and the
    search endpoint (``searchAvail.do``) is then POSTed through
    ``session_crawling``. The domestic/international switch comes from the
    module-level ``dom_int``.

    Args:
        dpt: Departure station code, sent as ``DepStn`` (the notebook uses 'GMP').
        arr: Arrival station code, sent as ``ArrStn`` (the notebook uses 'CJU').
        dpt_date: Departure date, sent as ``DepDate`` (the notebook passes
            'YYYYMMDD' strings).

    Returns:
        Whatever ``session_crawling`` returns when called with ``json=True``
        (presumably the decoded JSON body -- confirm in common.crawling_util).
    """
    print('Crawling jejuair homepage schedule site')

    init_page = "https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do"
    search_endpoint = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do'

    # Form fields of the availability search: one adult, one-way, departure leg.
    form = dict(
        AdultPaxCnt='1',
        ChildPaxCnt='0',
        InfantPaxCnt='0',
        RouteType=dom_int,  # domestic 'D' / international 'I'
        SystemType='IBE',
        Language='KR',
        DepStn=dpt,
        ArrStn=arr,
        SegType='DEP',
        TripType='OW',
        DepDate=dpt_date,
        Index='1',  # used for international searches
    )

    return session_crawling(
        init_page,
        search_endpoint,
        form,
        session_head={'Referer': 'http://www.jejuair.net/jejuair/main.jsp'},
        # The search request names the booking page as its referer.
        head={'Referer': init_page},
        method='post',
        json=True,
    )

## Parsing spec for the domestic-route availability response
## loop_field is the path of the list inside the response; each parse_info
## value is a path template whose `{}` is the position in that list
## (presumably filled in by parsing_json_data_to_dict -- confirm in common.parsing_util).
loop_field = "['Result']['data']"

# (output column name, key read from each entry of the response list)
_FIELD_MAP = (
    ('date', 'depDate'),
    ('flt', 'fltNo'),
    ('dpt', 'depTime'),
    ('arr', 'arrTime'),
    ('fare1', 'specialEquivFare'),
    ('fare2', 'discountEquivFare'),
    ('fare3', 'fare'),
    ('rbd', 'normalRBD'),
    ('fare_basis', 'fareBasis'),
    ('seat', 'normalSeatCount'),
)

# e.g. 'date' -> "['Result']['data'][{}]['depDate']"
parse_info = {
    column: "{}[{{}}]['{}']".format(loop_field, key)
    for column, key in _FIELD_MAP
}

# Column names in their declared order.
data_heads = [column for column, _ in _FIELD_MAP]
## Build a DataFrame from the raw availability response
def raw_to_df(raw_data):
    """Convert the raw availability response into a DataFrame.

    Args:
        raw_data: Response of the availability search; expected to be a
            mapping carrying a status string at ``['Result']['code']``.

    Returns:
        A ``pandas.DataFrame`` built from
        ``parsing_json_data_to_dict(raw_data, loop_field, parse_info)`` when
        the status code is ``'0000'``; ``None`` when the code differs or the
        payload is missing or malformed.
    """
    # A failed crawl may hand over None or a payload without the expected
    # keys; report that as "no data" (None) -- which callers already check
    # for -- instead of raising TypeError/KeyError.
    try:
        status = raw_data['Result']['code']
    except (KeyError, TypeError):
        return None

    # Any status other than '0000' is treated as an error response.
    if status != '0000':
        return None

    return pd.DataFrame(parsing_json_data_to_dict(raw_data, loop_field, parse_info))

## Fetch the fuel surcharge and airport tax
## Assumes they are the same for a given route and date regardless of fare basis
def crawling_7C_tax(dpt,arr,flt,dpt_date,dpt_time,arr_time,fare,rbd,basis):
    """Query the fare/tax endpoint for one flight and return its two tax amounts.

    Args:
        dpt: Departure station code (``DepStn``).
        arr: Arrival station code (``ArrStn``).
        flt: Flight number (``FltNo``).
        dpt_date: Departure date; concatenated with the times below.
        dpt_time: Departure time, appended to ``dpt_date`` for ``DepDate``.
        arr_time: Arrival time, appended to ``dpt_date`` for ``ArrDate``.
        fare: Fare amount (``EquivFare``).
        rbd: Booking class (``RBD``).
        basis: Fare basis code (``FareBasis``).

    Returns:
        A 2-tuple with the ``taxAmount`` of the first and second entries of
        ``farePriceTaxDataBeans`` in the first result row (presumably fuel
        surcharge and airport tax, in that order -- verify against the site).
    """
    endpoint = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do'
    referer = {'Referer': 'https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do'}

    # Only the fields below are sent. The original author left these other
    # fields disabled: SystemType, depDesc, arrDesc, ReqType, AdultPaxCnt,
    # ChildPaxCnt, InfantPaxCnt, Language.
    form = {
        'DepDate': dpt_date + dpt_time,
        # NOTE(review): the arrival timestamp reuses the departure date, i.e.
        # it assumes the flight lands on the same day.
        'ArrDate': dpt_date + arr_time,
        'DepStn': dpt,
        'ArrStn': arr,
        'RBD': rbd,
        'FareBasis': basis,
        'EquivFare': fare,
        'FltNo': flt,
        'FareTypeNo': '3',  # priced against the normal fare
        'TripType': 'OW',
        'RouteType': dom_int,  # domestic / international switch
    }

    response = simple_crawling(endpoint, form, head=referer, method='post', json=True)
    tax_entries = response['Result']['data'][0]['farePriceTaxDataBeans']
    first_tax = tax_entries[0]['taxAmount']
    second_tax = tax_entries[1]['taxAmount']
    return (first_tax, second_tax)



## Build one day's DataFrame: fares, taxes and a min/max/mean summary row
def read_7C_1day_fare(dpt,arr,dpt_date):
    """Crawl one day of fares for a route and append taxes and a summary row.

    Steps:
      1. Fetch and parse the availability data (``crawling_7C_data`` and
         ``raw_to_df``).
      2. Look up the two tax amounts once, using the first flight whose
         ``seat`` value is not ``'0'``, and copy them to every row as
         ``tax1``/``tax2``. If no flight has seats, both are set to 0.
      3. Append a summary row holding the min, max and mean of the distinct
         positive fares (``fare1``..``fare3``) of flights that have seats.

    Args:
        dpt: Departure station code.
        arr: Arrival station code.
        dpt_date: Departure date string passed through to the crawlers.

    Returns:
        The DataFrame with ``tax1``/``tax2`` columns and one extra summary
        row, or ``None`` when no flight data could be read.
    """
    ## read the data
    raw_data = crawling_7C_data(dpt,arr,dpt_date)
    df = raw_to_df(raw_data)
    if df is None or len(df) == 0:  ## error response or nothing parsed
        print('********** No Data Type 1 **********')
        return None

    ## read the taxes
    has_seat = df['seat'] != '0'
    if not has_seat.any():  ## none of the flights has a seat left
        print('********** No Data Type 2 **********')
        df['tax1'],df['tax2'] = 0,0
    else:
        # Taxes are assumed identical for every flight of the day, so a single
        # lookup with the first bookable flight is enough.
        first = df[has_seat].iloc[0]
        df['tax1'],df['tax2'] = crawling_7C_tax(
            dpt,arr,first['flt'],dpt_date,first['dpt'],first['arr'],
            first['fare3'],first['rbd'],first['fare_basis'])

    ## min / max / mean of the fares
    # Convert to numbers *before* filtering: sorting the raw strings and
    # dropping one element (as done previously) left a '0' behind whenever
    # '' was also present. Blank/unparseable fares become NaN and are dropped
    # together with the 0 placeholders.
    raw_fares = df.loc[has_seat, ['fare1','fare2','fare3']].values.ravel()
    numeric = pd.to_numeric(pd.Series(raw_fares), errors='coerce').dropna()
    fares = np.unique(numeric[numeric > 0].astype('int64'))  # distinct fares only

    if fares.size:
        stats = (str(fares.min()), str(fares.max()), str(fares.mean()))
    else:
        # No bookable fare at all: leave the statistics blank rather than
        # failing on an empty array.
        stats = ('', '', '')

    ## append the summary row
    # Assign by column name so the row stays aligned whatever column order
    # the DataFrame was built with (the former positional list matched the
    # date, flt, dpt, arr, fare1, fare2, fare3, ... order).
    summary = dict.fromkeys(df.columns, '')
    summary.update({
        'date': dpt_date,
        'flt': 'min', 'dpt': 'max', 'arr': 'mean',  # labels for the three values
        'fare1': stats[0], 'fare2': stats[1], 'fare3': stats[2],
    })
    # .loc replaces the removed .ix indexer; a new label appends the row.
    df.loc[len(df)] = [summary[col] for col in df.columns]
    return df

## Read the fares for a range of days and save them to an Excel file
def read_7C_date_range_fare(dpt,arr,start=0,end=31):
    """Crawl the daily fares for a range of dates and save them to Excel.

    Dates are ``today + i days`` for ``i`` in ``range(start, end)``, so
    ``start`` is inclusive and ``end`` exclusive; the defaults cover 31 dates
    (today through today + 30). Days that fail or yield no data are skipped.

    Args:
        dpt: Departure station code.
        arr: Arrival station code.
        start: Offset in days from today of the first date to read.
        end: Offset in days from today at which to stop (exclusive).

    Returns:
        The concatenated DataFrame of all days that were read. When no day
        produced data an empty DataFrame is returned and no file is written.
    """
    import os

    # Take "today" once so every offset is relative to the same day even if
    # the run crosses midnight.
    today = datetime.today()
    date_range = [(today + timedelta(days=i)).strftime('%Y%m%d') for i in range(start,end)]

    fare_list = []
    for d in date_range:
        # Best effort: one failing day must not abort the whole range.
        try:
            fare_df = read_7C_1day_fare(dpt,arr,d)
        except Exception as e:
            print('****** Error occured : ',e)
            continue
        if fare_df is not None:
            fare_list.append(fare_df)

    if not fare_list:
        # pd.concat raises on an empty list; report "nothing read" instead.
        print('++++++++++Total : ', 0)
        return pd.DataFrame()

    result = pd.concat(fare_list,ignore_index=True)
    print('++++++++++Total : ', len(result))

    ## save to file
    out_dir = 'excel'
    os.makedirs(out_dir, exist_ok=True)  # do not lose the crawl to a missing folder
    # %M is minutes; the former %m repeated the month in the timestamp.
    stamp = datetime.today().strftime('%Y%m%d%H%M')
    # NOTE(review): the .xls extension needs a pandas version that still
    # ships an .xls writer -- confirm for the target environment.
    result.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format(out_dir,'7C',dpt,arr,start,end,stamp))
    return result

In [30]:
## Read a single day of fares (GMP -> CJU)
dpt = 'GMP'
arr = 'CJU'
dpt_date = '20170514'
read_7C_1day_fare(dpt, arr, dpt_date)

Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemType:IBE , Index:1 , AdultPaxCnt:1 , DepDate:20170514 , InfantPaxCnt:0 , SegType:DEP , ChildPaxCnt:0 , DepStn:GMP , 
End Session crawling
Start Simple crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do
>> Parameters
ArrStn:CJU , FareTypeNo:3 , DepDate:201705140625 , DepStn:GMP , FltNo:151  , EquivFare:80000 , RBD:Y , RouteType:D , TripType:OW , FareBasis:YW , ArrDate:201705140730 , 
End Simple crawling


Unnamed: 0,arr,date,dpt,fare1,fare2,fare3,fare_basis,flt,rbd,seat,tax1,tax2
0,730,20170514,0625,0,51900,80000,YW,151.0,Y,9.0,2200.0,4000.0
1,740,20170514,0630,0,51900,80000,YW,101.0,Y,9.0,2200.0,4000.0
2,810,20170514,0700,0,68900,80000,YW,103.0,Y,9.0,2200.0,4000.0
3,900,20170514,0750,0,68900,80000,YW,105.0,Y,9.0,2200.0,4000.0
4,1010,20170514,0900,0,68900,80000,YW,107.0,Y,9.0,2200.0,4000.0
5,1055,20170514,0945,0,68900,80000,YW,141.0,Y,9.0,2200.0,4000.0
6,1115,20170514,1005,0,75900,80000,YW,111.0,Y,9.0,2200.0,4000.0
7,1335,20170514,1225,0,56900,80000,YW,115.0,Y,9.0,2200.0,4000.0
8,1350,20170514,1240,0,56900,80000,YW,117.0,Y,9.0,2200.0,4000.0
9,1425,20170514,1315,0,51900,80000,YW,143.0,Y,9.0,2200.0,4000.0


In [32]:
## Read the fares for a range of days
dpt = 'GMP'
arr = 'CJU'
## day offsets from today: first offset and stop offset (exclusive)
start = 0
end = 31
read_7C_date_range_fare(dpt, arr, start, end)

Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemType:IBE , Index:1 , AdultPaxCnt:1 , DepDate:20170424 , InfantPaxCnt:0 , SegType:DEP , ChildPaxCnt:0 , DepStn:GMP , 
End Session crawling
Start Simple crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do
>> Parameters
ArrStn:CJU , FareTypeNo:3 , DepDate:201704241440 , DepStn:GMP , FltNo:121  , EquivFare:65600 , RBD:Y , RouteType:D , TripType:OW , FareBasis:YX , ArrDate:201704241550 , 
End Simple crawling
Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemTyp

Unnamed: 0,arr,date,dpt,fare1,fare2,fare3,fare_basis,flt,rbd,seat,tax1,tax2
0,1335,20170424,1225,0,0,65600,YX,115,Y,0,2200,4000
1,1350,20170424,1240,0,0,65600,YX,117,Y,0,2200,4000
2,1425,20170424,1315,0,0,65600,YX,143,Y,0,2200,4000
3,1500,20170424,1350,0,0,65600,YX,119,Y,0,2200,4000
4,1550,20170424,1440,0,41900,65600,YX,121,Y,3,2200,4000
5,1640,20170424,1525,0,36900,65600,YX,155,Y,9,2200,4000
6,1655,20170424,1545,0,31900,65600,YX,123,Y,9,2200,4000
7,1715,20170424,1605,0,31900,65600,YX,125,Y,9,2200,4000
8,1755,20170424,1645,0,28900,65600,YX,145,Y,9,2200,4000
9,1925,20170424,1815,0,25900,65600,YX,127,Y,9,2200,4000


In [33]:
## Read the fares for a range of days
dpt = 'GMP'
arr = 'CJU'
## day offsets from today: first offset and stop offset (exclusive)
start = 31
end = 45
read_7C_date_range_fare(dpt, arr, start, end)

Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemType:IBE , Index:1 , AdultPaxCnt:1 , DepDate:20170525 , InfantPaxCnt:0 , SegType:DEP , ChildPaxCnt:0 , DepStn:GMP , 
End Session crawling
Start Simple crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do
>> Parameters
ArrStn:CJU , FareTypeNo:3 , DepDate:201705250625 , DepStn:GMP , FltNo:151  , EquivFare:65600 , RBD:Y , RouteType:D , TripType:OW , FareBasis:YX , ArrDate:201705250730 , 
End Simple crawling
Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemTyp

Unnamed: 0,arr,date,dpt,fare1,fare2,fare3,fare_basis,flt,rbd,seat,tax1,tax2
0,0730,20170525,0625,0,51900,65600,YX,151,Y,9,2200,4000
1,0740,20170525,0630,0,51900,65600,YX,101,Y,9,2200,4000
2,0810,20170525,0700,0,61900,65600,YX,103,Y,9,2200,4000
3,0900,20170525,0750,0,0,65600,YX,105,Y,9,2200,4000
4,1010,20170525,0900,0,0,65600,YX,107,Y,9,2200,4000
5,1055,20170525,0945,0,0,65600,YX,141,Y,9,2200,4000
6,1115,20170525,1005,0,0,65600,YX,111,Y,9,2200,4000
7,1335,20170525,1225,0,61900,65600,YX,115,Y,9,2200,4000
8,1350,20170525,1240,0,0,65600,YX,117,Y,9,2200,4000
9,1425,20170525,1315,0,61900,65600,YX,143,Y,9,2200,4000


In [34]:
## Read the fares for a range of days
dpt = 'GMP'
arr = 'CJU'
## day offsets from today: first offset and stop offset (exclusive)
start = 45
end = 90
read_7C_date_range_fare(dpt, arr, start, end)

Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemType:IBE , Index:1 , AdultPaxCnt:1 , DepDate:20170608 , InfantPaxCnt:0 , SegType:DEP , ChildPaxCnt:0 , DepStn:GMP , 
End Session crawling
Start Simple crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do
>> Parameters
ArrStn:CJU , FareTypeNo:3 , DepDate:201706080625 , DepStn:GMP , FltNo:151  , EquivFare:65600 , RBD:Y , RouteType:D , TripType:OW , FareBasis:YX , ArrDate:201706080730 , 
End Simple crawling
Crawling jejuair homepage schedule site
Start Session crawling
make session :  https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do
crawling :  https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do
>> Parameters
ArrStn:CJU , RouteType:D , Language:KR , TripType:OW , SystemTyp

Unnamed: 0,arr,date,dpt,fare1,fare2,fare3,fare_basis,flt,rbd,seat,tax1,tax2
0,0730,20170608,0625,0,46900,65600,YX,151,Y,9,2200,4000
1,0740,20170608,0630,0,46900,65600,YX,101,Y,9,2200,4000
2,0810,20170608,0700,0,61900,65600,YX,103,Y,9,2200,4000
3,0900,20170608,0750,0,0,65600,YX,105,Y,9,2200,4000
4,1010,20170608,0900,0,0,65600,YX,107,Y,9,2200,4000
5,1055,20170608,0945,0,0,65600,YX,141,Y,9,2200,4000
6,1230,20170608,1120,0,61900,65600,YX,113,Y,9,2200,4000
7,1335,20170608,1225,0,51900,65600,YX,115,Y,9,2200,4000
8,1350,20170608,1240,0,51900,65600,YX,117,Y,9,2200,4000
9,1425,20170608,1315,0,56900,65600,YX,143,Y,9,2200,4000
