## 김포 제주(인천 나리타) 노선 가격 정보 읽어오기 1 - 1
- 대상 사이트 : 제주항공 웹사이트, 인터파크 국내선 실시간 예매사이트

### 제주항공 사이트 웹 크롤링
- 제주항공 항공권 예매 페이지를 이용한 크롤링
- 세션을 생성한 뒤 해당 세션으로 JSON API 페이지를 호출하여 정보 획득
- 필요한 데이터를 추출하여 Pandas의 DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [12]:
from IPython.display import display
import requests
import pandas as pd
import numpy as np
from pandas import DataFrame
from bs4 import BeautifulSoup
import time
from datetime import datetime
from datetime import timedelta
from common.crawling_util import session_crawling

def crawling_7C_data(dpt,arr,dpt_date,dom_int='I'):
    """Fetch the raw one-way availability/fare JSON from the Jeju Air (7C) site.

    The availability page is requested first so that the session picks up
    its cookies; the search endpoint is then POSTed through the same session.

    Args:
        dpt: Departure station code, sent as ``DepStn`` (e.g. 'GMP').
        arr: Arrival station code, sent as ``ArrStn`` (e.g. 'CJU').
        dpt_date: Departure date string, sent as ``DepDate`` (e.g. '20170514').
        dom_int: Route type, sent as ``RouteType``: 'D' for domestic, 'I' for
            international. Defaults to 'I', the value that used to be
            hard-coded, so three-argument calls keep sending the same request.

    Returns:
        The decoded JSON body of the search response.
    """
    print('Start crawling info from 7C......')
    init_url = "https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do"
    init_head = {
        'Referer':'http://www.jejuair.net/jejuair/main.jsp',
    }
    print('>> make session info')
    sess = requests.Session()
    # The response itself is not needed; the request only establishes the session.
    sess.get(init_url,headers=init_head)
    ## Uncomment to inspect the session cookies
    #print(sess.cookies,sess.cookies.get_dict())
    time.sleep(1) ## pause 1 second to ride out server-side processing delays

    url = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchAvail.do'
    head = {
        'Referer':'https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do',
    }
    param ={
        'AdultPaxCnt':'1',
        'ChildPaxCnt':'0',
        'InfantPaxCnt':'0',
        'RouteType':dom_int,  ## domestic 'D', international 'I'
        'SystemType':'IBE',
        'Language':'KR',
        'DepStn':dpt,
        'ArrStn':arr,
        'SegType':'DEP',
        'TripType':'OW',
        'DepDate':dpt_date,
        'Index':'1' # for international routes
    }

    print('>> crawl one way fare info : Departure - {}, Arrival - {}, Date - {}, Type - {}'.format(dpt,arr,dpt_date,dom_int))
    req = sess.post(url,param,headers=head)
    time.sleep(1) ## pause 1 second to ride out server-side processing delays
    print('......End crawling info from 7C')
    return req.json()

## Nine raw-record field names and nine short column labels.
## NOTE(review): nothing in the visible code references these two lists.
data_fields = [
    'fltNo',
    'depTime',
    'arrTime',
    'specialEquivFare',
    'specialEquivFareBasis',
    'discountEquivFare',
    'discountEquivFareBasis',
    'fare',
    'fareBasis',
]
data_heads = [
    'flt',
    'dpt',
    'arr',
    'fare1',
    'type1',
    'fare2',
    'type2',
    'fare3',
    'type3',
]
## Build a DataFrame from the raw response data
def raw_to_df(raw_data,dom_int='D'):
    """Convert a raw search response into a DataFrame.

    Args:
        raw_data: Decoded JSON response, passed through to the converter.
        dom_int: 'D' selects the domestic converter (``d_raw_to_df``); any
            other value selects the international one (``i_raw_to_df``).

    Returns:
        Whatever the selected converter returns.
    """
    converter = d_raw_to_df if dom_int == 'D' else i_raw_to_df
    return converter(raw_data)

## Domestic routes
d_data_fields = [
    'depDate','fltNo','depTime','arrTime',
    'specialEquivFare','discountEquivFare','fare',
    'normalRBD','fareBasis','normalSeatCount',
]
d_data_heads = [
    'date','flt','dpt','arr',
    'fare1','fare2','fare3',
    'rbd','fare_basis','seat',
]
def d_raw_to_df(raw_data):
    """Build a DataFrame from a domestic search response.

    Each record in ``raw_data['Result']['data']`` becomes one row holding the
    values of ``d_data_fields``; the columns are named by ``d_data_heads``.

    Returns:
        The DataFrame, or None when ``raw_data['Result']['code']`` is not
        '0000' (the response reports an error).
    """
    result = raw_data['Result']
    if result['code'] != '0000':
        return None

    rows = [[record[key] for key in d_data_fields] for record in result['data']]
    return DataFrame(rows,columns=d_data_heads)
## International routes
i_data_fields = [
    'depDate','fltNo','depTime','arrTime',
    'specialEquivFare','discountEquivFare','normalEquivFare',
    'normalRBD','normalEquivFareBasis','normalSeatCount',
]
i_data_heads = [
    'date','flt','dpt','arr',
    'fare1','fare2','fare3',
    'rbd','fare_basis','seat',
]
def i_raw_to_df(raw_data):
    """Build a DataFrame from an international search response.

    Each record in ``raw_data['Result']['data']['availData']`` becomes one row
    holding the values of ``i_data_fields``; the columns are named by
    ``i_data_heads``.

    Returns:
        The DataFrame, or None when ``raw_data['Result']['code']`` is not
        '0000' (the response reports an error).
    """
    result = raw_data['Result']
    if result['code'] != '0000':
        return None

    records = result['data']['availData']
    table = [[entry[name] for name in i_data_fields] for entry in records]
    return DataFrame(table,columns=i_data_heads)

## Read the fuel surcharge and airport tax.
## Assumes they are the same for a given route and date, whatever the fare basis.
def crawling_7C_tax(dpt,arr,flt,dpt_date,dpt_time,arr_time,fare,rbd,basis,dom_int='D'):
    """Query the 7C fare/tax endpoint for one flight and return two tax amounts.

    Args:
        dpt: Departure station code (``DepStn``).
        arr: Arrival station code (``ArrStn``).
        flt: Flight number (``FltNo``).
        dpt_date: Departure date string; prefixed to both times below.
        dpt_time: Departure time string; ``DepDate`` is dpt_date + dpt_time.
        arr_time: Arrival time string; ``ArrDate`` is dpt_date + arr_time.
        fare: Fare amount (``EquivFare``).
        rbd: Booking class (``RBD``).
        basis: Fare basis code (``FareBasis``).
        dom_int: Route type (``RouteType``), domestic or international.

    Returns:
        A 2-tuple with the ``taxAmount`` of the first and second entries of
        ``farePriceTaxDataBeans`` in the first result record.
        NOTE(review): presumably fuel surcharge and airport tax; which entry
        is which is not visible here.
    """
    endpoint = 'https://www.jejuair.net/jejuair/com/jeju/ibe/searchFareTax.do'
    request_headers = {
        'Referer':'https://www.jejuair.net/jejuair/com/jeju/ibe/availInit.do',
    }
    departure_stamp = dpt_date+dpt_time
    arrival_stamp = dpt_date+arr_time
    # Fields left out of the request on purpose (kept for reference):
    #   'SystemType':'IBE', 'depDesc':'서울(김포)', 'arrDesc':'제주',
    #   'ReqType':'Price', 'AdultPaxCnt':'1', 'ChildPaxCnt':'0',
    #   'InfantPaxCnt':'0', 'Language':'KR'
    param ={
        'DepDate':departure_stamp,
        'ArrDate':arrival_stamp,
        'DepStn':dpt,
        'ArrStn':arr,
        'RBD':rbd,
        'FareBasis':basis,
        'EquivFare':fare,
        'FltNo':flt,
        'FareTypeNo':'3',## based on the normal fare
        'TripType':'OW',
        'RouteType':dom_int, ## domestic / international switch
    }
    print('crawling tax - param\n',param)
    response = requests.post(endpoint,param,headers=request_headers)
    tax_beans = response.json()['Result']['data'][0]['farePriceTaxDataBeans']
    first_tax = tax_beans[0]['taxAmount']
    second_tax = tax_beans[1]['taxAmount']
    return (first_tax,second_tax)



## Build one day's DataFrame: fares, taxes and a min/max/mean summary row
def read_7C_1day_fare(dpt,arr,dpt_date,dom_int):
    """Read one day's fares for a route and append a summary row.

    Args:
        dpt: Departure station code.
        arr: Arrival station code.
        dpt_date: Departure date string passed to the crawler.
        dom_int: 'D' for domestic, 'I' for international.

    Returns:
        A DataFrame with one row per flight, ``tax1``/``tax2`` columns added,
        and a final summary row whose ``fare1``/``fare2``/``fare3`` cells hold
        the min, max and mean (as strings) of the distinct non-zero fares on
        flights that still have seats. Those cells are empty strings when no
        such fare exists. Returns None when no flight data could be read.
    """
    ## Read the data
    raw_data = crawling_7C_data(dpt,arr,dpt_date,dom_int)
    df = raw_to_df(raw_data,dom_int)
    if df is None or len(df) == 0: ## the response reported an error or held no flights
        print('********** No Data Type 1 **********')
        return None

    ## Read the taxes, using the first flight that still has seats
    df_tax = df[df['seat'] != '0']
    if len(df_tax) == 0: ## no flight has any seat left
        print('********** No Data Type 2 **********')
        df['tax1'],df['tax2'] = 0,0
    else:
        first = df_tax.iloc[0]
        df['tax1'],df['tax2'] = crawling_7C_tax(
            dpt,arr,first['flt'],dpt_date,first['dpt'],first['arr'],
            first['fare3'],first['rbd'],first['fare_basis'],dom_int)

    ## Min / max / mean over the distinct fares of bookable flights.
    ## '' and 0 are placeholders for "no such fare"; every one of them is
    ## dropped (slicing off only the first sorted element left a 0 behind
    ## whenever both '' and '0' were present).
    raw_fares = df_tax[['fare1','fare2','fare3']].values.ravel()
    fare_arr = np.unique(np.array([int(v) for v in raw_fares if v != ''],dtype=int))
    fare_arr = fare_arr[fare_arr != 0]
    if fare_arr.size:
        summary = [str(fare_arr.min()),str(fare_arr.max()),str(fare_arr.mean())]
    else:
        # No bookable fare: keep the summary row but leave the statistics
        # blank instead of failing on min() of an empty array.
        summary = ['','','']

    ## Append the summary row (.loc replaces the removed DataFrame.ix)
    df.loc[len(df)] = [dpt_date,'min','max','mean'] + summary + ['','','','','']
    return df

## Read the data for a range of dates (30 days by default)
def read_7C_date_range_fare(dpt,arr,dom_int,n=30):
    """Read fares for ``n`` consecutive days starting today and save them to Excel.

    Args:
        dpt: Departure station code.
        arr: Arrival station code.
        dom_int: 'D' for domestic, 'I' for international.
        n: Number of days to read, starting with today.

    Returns:
        The concatenation of every day's DataFrame. When no day produced any
        data an empty DataFrame is returned and no file is written.
    """
    import os  # local import: only needed to make sure the output folder exists

    today = datetime.today()
    date_range = [(today+timedelta(days=i)).strftime('%Y%m%d') for i in range(n)]
    fare_list = []
    for d in date_range:
        try:
            fare_df = read_7C_1day_fare(dpt,arr,d,dom_int)
        except Exception as e:
            # Best effort: one failing day must not abort the whole range.
            print('****** Error occured : ',e)
            continue
        if fare_df is not None:
            fare_list.append(fare_df)

    if not fare_list:
        # pd.concat raises on an empty list; report zero rows instead.
        print('++++++++++Total : ', 0)
        return pd.DataFrame()

    result = pd.concat(fare_list,ignore_index=True)
    print('++++++++++Total : ', len(result))
    ## Save to file: excel/7C_<dpt>_<arr>_<first date>_<n>_<timestamp>.xls
    ## The first date of the range replaces the undefined name `dpt_date`,
    ## and the timestamp uses %M (minute) rather than %m (month).
    ## NOTE(review): writing .xls needs an engine that recent pandas releases
    ## no longer ship; confirm before upgrading.
    os.makedirs('excel',exist_ok=True)
    result.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','7C',dpt,arr,date_range[0],n,datetime.today().strftime('%Y%m%d%H%M')))
    return result

In [7]:
## Read a single day of data
dpt = 'GMP'
arr = 'CJU'
dpt_date = '20170514'
dom_int = 'D'
## International alternative:
#dpt, arr, dpt_date, dom_int = 'ICN','NRT','2017-05-01','I'

crawling_7C_data(dpt,arr,dpt_date,dom_int)
## Alternative: the processed DataFrame instead of the raw JSON
#read_7C_1day_fare(dpt,arr,dpt_date,dom_int)

Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - GMP, Arrival - CJU, Date - 20170514, Type - D
......End crawling info from 7C


{'Result': {'code': '0000',
  'data': [{'RBD': 'N',
    'YClassStatusCode': '',
    'acType': '737',
    'afterDepDate': '',
    'afterMinFare': '0',
    'arrDate': '20170514',
    'arrStn': 'CJU',
    'arrStnDesc': '제주(CJU)',
    'arrTime': '0730',
    'beforeDepDate': '',
    'beforeMinFare': '0',
    'carrier': '7C',
    'connectAvailData': {'acType': '',
     'arrDate': '',
     'arrStn': '',
     'arrStnDesc': '',
     'arrTime': '',
     'carrier': '',
     'currency': '',
     'depDate': '',
     'depStn': '',
     'depStnDesc': '',
     'depTime': '',
     'discountEquivFare': '',
     'discountEquivFareBasis': '',
     'discountEquivFareName': '',
     'discountRBD': '',
     'fltNo': '',
     'fltType': '',
     'normalEquivFare': '',
     'normalEquivFareBasis': '',
     'normalEquivFareName': '',
     'normalRBD': '',
     'specialEquivFare': '',
     'specialEquivFareBasis': '',
     'specialEquivFareName': '',
     'specialRBD': ''},
    'connectNo': '',
    'currency': '

In [8]:
## Read the data for a range of dates (domestic: Gimpo -> Jeju)
dpt = 'GMP'
arr = 'CJU'
dom_int = 'D'
ndate = 30 ## number of days to read
read_7C_date_range_fare(dpt,arr,dom_int,n=ndate)

Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - GMP, Arrival - CJU, Date - 20170420, Type - D
......End crawling info from 7C
crawling tax - param
 {'FareBasis': 'YX', 'RBD': 'Y', 'TripType': 'OW', 'EquivFare': '65600', 'DepDate': '201704201815', 'FareTypeNo': '3', 'DepStn': 'GMP', 'ArrDate': '201704201925', 'ArrStn': 'CJU', 'FltNo': '127 ', 'RouteType': 'D'}
Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - GMP, Arrival - CJU, Date - 20170421, Type - D
......End crawling info from 7C
crawling tax - param
 {'FareBasis': 'YP', 'RBD': 'Y', 'TripType': 'OW', 'EquivFare': '97700', 'DepDate': '201704211315', 'FareTypeNo': '3', 'DepStn': 'GMP', 'ArrDate': '201704211425', 'ArrStn': 'CJU', 'FltNo': '143 ', 'RouteType': 'D'}
Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - GMP, Arrival - CJU, Date - 20170422, Type - D
......End crawling info from 7C
crawli

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,rbd,fare_basis,seat,tax1,tax2
0,20170420,127,1815,1925,0,0,65600,Y,YX,6,2200,4000
1,20170420,129,1905,2015,0,0,65600,Y,YX,0,2200,4000
2,20170420,133,1935,2045,0,0,65600,Y,YX,0,2200,4000
3,20170420,min,max,mean,65600,65600,65600.0,,,,,
4,20170421,151,0625,0730,0,0,97700,Y,YP,0,2200,4000
5,20170421,101,0630,0740,0,0,97700,Y,YP,0,2200,4000
6,20170421,103,0700,0810,0,0,97700,Y,YP,0,2200,4000
7,20170421,105,0750,0900,0,0,97700,Y,YP,0,2200,4000
8,20170421,107,0900,1010,0,0,97700,Y,YP,0,2200,4000
9,20170421,141,0945,1055,0,0,97700,Y,YP,0,2200,4000


In [13]:
## Read the data for a range of dates (international: Incheon -> Narita)
dpt = 'ICN'
arr = 'NRT'
dom_int = 'I'
ndate = 30 ## number of days to read
read_7C_date_range_fare(dpt,arr,dom_int,n=ndate)

Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - ICN, Arrival - NRT, Date - 20170420, Type - I
......End crawling info from 7C
********** No Data Type 1 **********
Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - ICN, Arrival - NRT, Date - 20170421, Type - I
......End crawling info from 7C
crawling tax - param
 {'FareBasis': 'YOW2KR', 'RBD': 'Y', 'TripType': 'OW', 'EquivFare': '360000', 'DepDate': '201704210700', 'FareTypeNo': '3', 'DepStn': 'ICN', 'ArrDate': '201704210950', 'ArrStn': 'NRT', 'FltNo': '1162', 'RouteType': 'I'}
Start crawling info from 7C......
>> make session info
>> crawl one way fare info : Departure - ICN, Arrival - NRT, Date - 20170422, Type - I
......End crawling info from 7C
crawling tax - param
 {'FareBasis': 'YOW2KR', 'RBD': 'Y', 'TripType': 'OW', 'EquivFare': '360000', 'DepDate': '201704220700', 'FareTypeNo': '3', 'DepStn': 'ICN', 'ArrDate': '201704220950', 'ArrStn': '

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,rbd,fare_basis,seat,tax1,tax2
0,20170421,1162,0700,0950,,120000,360000,Y,YOW2KR,6,1200,28000
1,20170421,1102,0830,1050,,160000,360000,Y,YOW2KR,7,1200,28000
2,20170421,1106,1035,1255,,160000,360000,Y,YOW2KR,4,1200,28000
3,20170421,1104,1505,1730,,130000,360000,Y,YOW2KR,9,1200,28000
4,20170421,min,max,mean,120000,360000,192500.0,,,,,
5,20170422,1162,0700,0950,,150000,360000,Y,YOW2KR,9,1200,28000
6,20170422,1102,0830,1050,,110000,360000,Y,YOW2KR,9,1200,28000
7,20170422,1106,1035,1255,,190000,360000,Y,YOW2KR,9,1200,28000
8,20170422,1104,1505,1730,,120000,360000,Y,YOW2KR,9,1200,28000
9,20170422,min,max,mean,110000,360000,186000.0,,,,,
