## ZE 기준 각 플라이트별 경쟁사의 최저 가격 비교
- ZE 정보는 홈페이지의 스케줄 정보(편, 시간, CFG, BKG, 최저운임)
    * CFG : 스케줄 정보의 seatCnt
    * BKG : 실시간 운항 스케줄 정보의 leg 정보의 aircraftTypeSuffix
- 각 항공사 정보는 인터파크 가격 기준

In [141]:
import json
import logging
import datetime
import pandas as pd
from common.log_util import log, logger_initialize
from common.parsing_util import parsing_json_data_to_dict
from common.crawl_func import *
from common.scrap_func import *

# Initialise the project logger from 'dev_logger_setting.json'
# (presumably a logging configuration file -- confirm in common.log_util).
logger_initialize('dev_logger_setting.json')

In [142]:
## 인터파크 정보 읽어오기
## 포맷 : 항공사,날짜,출발지,도착지,편명,출발시간,판매최저값
def scraping_IP_schdedule(raw_data):
    """Extract per-flight rows from a raw Interpark fare-search response.

    Args:
        raw_data: Raw response text; it is handed to ``json_loads``.

    Returns:
        A list of rows shaped
        ``[airline, date, dpt, arr, flt, dpt_time, lowest_fare]`` or ``None``
        when the payload cannot be parsed, the reply header carries error
        code ``'1'``, or no fare set could be extracted.
    """
    log('start scraping Interpark crawled data')
    raw_json = json_loads(raw_data)
    if raw_json is None:  # parsing error
        return None
    if raw_json['replyHeader']['errorCode'] == '1':  # error reply
        log('crawling data not found!', logging.ERROR)
        return None

    # Every field lives under the same per-item prefix; "{}" is the loop
    # index placeholder consumed by parsing_json_data_to_dict.
    loop_field = "['replyAvailFare']['availFareSet']"
    seg_fare = loop_field + "[{}]['segFare']"
    parse_info = {
        'airline': seg_fare + "['carCode']",
        'date': seg_fare + "['depDate']",
        'flt': seg_fare + "['mainFlt']",
        'dpt': seg_fare + "['depCity']",
        'dpt_time': seg_fare + "['depTime']",
        'arr': seg_fare + "['arrCity']",
        'arr_time': seg_fare + "['arrTime']",
        'fare1': seg_fare + "['classDetail'][0]['fare']",
        'fare2': seg_fare + "['classDetail'][1]['fare']",
        'tax': seg_fare + "['airTax']",
        'fuel': seg_fare + "['fuelChg']",
        'seat1': seg_fare + "['classDetail'][0]['noOfAvailSeat']",
        'seat2': seg_fare + "['classDetail'][1]['noOfAvailSeat']",
    }

    parsed_list = parsing_json_data_to_dict(raw_json, loop_field, parse_info)
    if not parsed_list:
        log('scraping data not found!', logging.WARNING)
        return None

    scraped_list = []
    for td in parsed_list:
        # The lowest fare must consider BOTH fare classes; the previous code
        # compared fare1 with itself and therefore never looked at fare2.
        fares = [int(td['fare1'])]
        # NOTE(review): fare2 is treated as optional because a flight may
        # expose a single fare class -- confirm how parsing_json_data_to_dict
        # reports a missing classDetail[1].
        try:
            second_fare = int(td['fare2'])
        except (KeyError, TypeError, ValueError):
            second_fare = 0
        # Non-positive values are ignored, mirroring scraping_ZE_schedule
        # where 0 means "fare not offered".
        if second_fare > 0:
            fares.append(second_fare)
        scraped_list.append([td['airline'], td['date'], td['dpt'], td['arr'],
                             td['flt'], td['dpt_time'], min(fares)])
    log('end scraping Interpark crawled data')
    return scraped_list
## 이스타항공 정보 읽어오기
## 포맷 : 날짜,출발지,도착지,편명,출발시간,CFG,BKG,L/F,판매최저값
def scraping_ZE_schedule(raw_data,dpt,arr,dpt_date):
    """Turn a raw Eastar Jet fare-search response into report rows.

    Args:
        raw_data: JSON text of the fare-search response.
        dpt: Departure code copied into every row.
        arr: Arrival code copied into every row.
        dpt_date: Departure date copied into every row.

    Returns:
        A list of rows shaped
        ``[dpt_date, dpt, arr, flt, dpt_time, CFG, BKG, L/F, lowest_fare]``.
        BKG and L/F are emitted as 0 placeholders; ``lowest_fare`` is the
        smallest positive fare among the three fare kinds, or 0 when the
        largest of them is 0.
    """
    log('start scraping Eastarjet crawled data')
    trips = json.loads(raw_data)['result'][0]['resultData'][0]['FlightSearch']['trips'][0]

    # Pass 1: pull the raw fields of every trip; a missing amount counts as 0.
    # fare1 = special fare, fare2 = discount fare, fare3 = normal fare.
    amount_keys = (('fare1', 'e_amount'), ('fare2', 'd_amount'), ('fare3', 'y_amount'))
    fare_info = []
    for trip in trips:
        entry = {
            'flt': trip['flightNumberText'],
            'dpt_time': trip['standardTimeOfDeparture'],
            'CFG': int(trip['seatCnt']),
        }
        for fare_key, amount_key in amount_keys:
            entry[fare_key] = int(trip[amount_key]) if amount_key in trip else 0
        fare_info.append(entry)

    # Pass 2: assemble the output rows.
    scraped_list = []
    for entry in fare_info:
        row = [
            dpt_date, dpt, arr,
            entry['flt'],
            entry['dpt_time'][8:12],  # characters 8..11 of the departure stamp
            entry['CFG'],
            0,  # BKG placeholder
            0,  # L/F placeholder
        ]
        offered = [entry['fare1'], entry['fare2'], entry['fare3']]
        # Lowest fare: ignore fares that are not offered (0).
        row.append(min(p for p in offered if p > 0) if max(offered) != 0 else 0)
        scraped_list.append(row)
    log('end scraping Eastarjet crawled data')
    return scraped_list

## 실시간 스케줄 정보
def crawling_ZE_rtschedule(dpt,arr,dpt_date):
    """Fetch the Eastar Jet realtime schedule for one route and date.

    A session is first opened against the schedule page, then the schedule
    is requested from the JSON data service with that same session.

    Args:
        dpt: Departure code.
        arr: Arrival code.
        dpt_date: Departure date as ``YYYYMMDD``; it is re-formatted to
            ``YYYY-MM-DD`` for the request.

    Returns:
        A list of ``[aircraftType, cfg, flightNumber]`` entries, one per
        journey, where ``cfg`` is the leg's ``aircraftTypeSuffix`` (or
        ``'149'`` for aircraft type ``'737'``) and ``flightNumber`` is
        stripped of surrounding whitespace.
    """
    log('Crawling eastarjet homepage realtime schedule info')
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    session_url = "https://www.eastarjet.com/newstar/PGWIB00002"
    session_head = {
        'Referer':'https://www.eastarjet.com/newstar/PGWIB00001',
        'User-Agent':user_agent
    }
    url = 'https://www.eastarjet.com/json/dataService'
    head = {
        'Referer':'https://www.eastarjet.com/newstar/PGWBA00002',
        'User-Agent':user_agent
    }
    flightSchedule = {
        "departure0":['"'+dpt+'"'],"arrival0":['"'+arr+'"'],"departure1":['""'],"arrival1":['""'],"departure2":['""'],"arrival2":['""'],
        "departure3":['""'],"arrival3":['""'],"beginDate0":dpt_date[:4]+'-'+dpt_date[4:6]+'-'+dpt_date[6:],"endDate":"","beginDate1":"","beginDate2":"",
        "beginDate3":"","flyCount":"1","flyType":"2","isCheck":"Daily"
    }
    map_info = {"flightSchedule":json.dumps(flightSchedule)}
    pid, rcode, fname, pname = 1,'PGWIB00002','DTWIB00001','flightSchedule'
    payload = get_ZE_payload(pid,rcode,fname,pname,map_info)

    # Use the session as a context manager so its connections are released
    # even when a request raises; previously the session was never closed.
    with requests.Session() as sess:
        sess.get(session_url,headers=session_head)
        time.sleep(1)  # wait one second before hitting the data service
        raw_json = jsonpayload_crawling(url, payload, session=sess, head=head, method='post', json=True)

    journeys = raw_json['result'][0]['resultData'][0]['flightScheduleList'][0]['nsFlightScheduleInfo']['journeyDateMarketList'][0]['journeys']
    parsed_list = []
    for journey in journeys:
        # Only the first leg of the first segment is inspected.
        leg = journey['segments'][0]['legs'][0]
        actype = leg['aircraftType']
        cfg = leg['aircraftTypeSuffix']
        # NOTE(review): type '737' is forced to '149' -- presumably the seat
        # count for that aircraft; confirm with the data owner.
        if actype == '737':
            cfg = '149'
        parsed_list.append([actype, cfg, leg['flightDesignator']['flightNumber'].strip()])
    return parsed_list

## Payload 구조
## map_info 예){"flightSearch":flight_parameter}
def get_ZE_payload(pid,rcode,fname,pname,map_info):
    """Build the request payload for the Eastar Jet ``DataService.service`` call.

    Args:
        pid: Value of the top-level ``id`` field.
        rcode: Value of ``requestUniqueCode``.
        fname: Value of ``functionName``.
        pname: Value of ``paramName`` of the single input parameter.
        map_info: Dict placed (by reference) as the ``map`` of the input
            parameter, e.g. ``{"flightSearch": flight_parameter}``.

    Returns:
        The nested payload dict.
    """
    # Assemble from the innermost structure outwards.
    value_map = {"map": map_info, "javaClass": "java.util.Map"}
    parameter_data = {"javaClass": "java.util.List", "list": [value_map]}
    in_parameter = {
        "javaClass": "com.jein.framework.connectivity.parameter.InParameter",
        "paramName": pname,
        "ioType": "IN",
        "structureType": "FIELD",
        "data": parameter_data,
    }
    request_parameter = {
        "javaClass": "com.jein.framework.connectivity.parameter.RequestParameter",
        "requestUniqueCode": rcode,
        "requestExecuteType": "BIZ",
        "DBTransaction": False,
        "sourceName": None,
        "sourceExtension": None,
        "functionName": fname,
        "panelId": None,
        "methodType": None,
        "inParameters": {"javaClass": "java.util.List", "list": [in_parameter]},
        "filterParameter": {"javaClass": "java.util.Map", "map": {}},
    }
    return {
        "id": pid,
        "method": "DataService.service",
        "params": [request_parameter],
    }

## 특정 시간에 대한 HHMM(시간분) 구조의 배열에서 시간 간격이 최소인 배열의 인덱스 찾기
## 인덱스와 차이값 리턴
def select_row_by_minute(hhmm,hhmm_list, reverse=False):
    """Find the entry of ``hhmm_list`` closest in time to ``hhmm``.

    Args:
        hhmm: Reference time as an ``HHMM`` string.
        hhmm_list: Candidate times as ``HHMM`` strings.
        reverse: When true, pick the largest gap instead of the smallest.

    Returns:
        ``[index, gap]`` -- the position of the selected candidate and its
        gap to ``hhmm`` in minutes. Ties on the gap are resolved by index
        through the ``[gap, index]`` ordering.
    """
    gaps = [[cal_minute(hhmm, candidate), position]
            for position, candidate in enumerate(hhmm_list)]
    ordered = sorted(gaps, reverse=reverse)
    gap, position = ordered[0]
    return [position, gap]

## 시간 차이 분으로 계산 포맷 HHMM
def cal_minute(h1,h2):
    """Return the absolute difference in minutes between two ``HHMM`` strings.

    The first two characters are read as hours and the remainder as minutes.
    """
    hour_gap = int(h1[:2]) - int(h2[:2])
    minute_gap = int(h1[2:]) - int(h2[2:])
    return abs(hour_gap * 60 + minute_gap)
## 일일 리포트 생성
def make_daily_dom_fare_report(dpt,arr,dpt_date,airlines):
    """Build the daily fare comparison rows for one route and date.

    Each ZE flight row is extended, per competitor airline, with the
    competitor flight whose departure time is closest and that flight's
    lowest fare.

    Args:
        dpt: Departure code.
        arr: Arrival code.
        dpt_date: Departure date as ``YYYYMMDD``.
        airlines: Competitor airline codes; two columns are appended to
            every row for each code, in this order.

    Returns:
        A list of rows: the nine ZE columns produced by
        ``scraping_ZE_schedule`` followed by ``[info, fare]`` per airline,
        where ``info`` is ``'<airline>-<flt>-<dpt_time>'``. When no data is
        available for an airline the pair is ``['', 0]`` so the row width
        still matches the column list.
    """
    # Read and scrape the ZE schedule, then the realtime schedule.
    raw_data = crawling_ZE_dom_int(dpt,arr,dpt_date)
    ze_schedule_list = scraping_ZE_schedule(raw_data,dpt,arr,dpt_date)
    ze_rtschedule_list = crawling_ZE_rtschedule(dpt,arr,dpt_date)

    # Copy the realtime value (aircraftTypeSuffix-derived) into column 6,
    # which the row layout labels BKG. Flights are matched on the flight
    # number without its two-character carrier prefix.
    for sch in ze_schedule_list:
        flight_no = sch[3][2:]
        rt_value = next((rtsch[1] for rtsch in ze_rtschedule_list if rtsch[2] == flight_no), None)
        if rt_value is None:
            # Previously an unmatched flight raised IndexError and aborted
            # the whole report; keep the 0 placeholder instead.
            log('realtime schedule not found for flight {}'.format(sch[3]), logging.WARNING)
            continue
        sch[6] = rt_value

    target_list = ze_schedule_list.copy()
    # Match every ZE flight against each competitor's Interpark schedule.
    for airline in airlines:
        ip_schedule_list = scraping_IP_schdedule(crawling_IP_dom(airline,dpt,arr,dpt_date))
        if not ip_schedule_list:
            # scraping_IP_schdedule returns None on error / no data, which
            # previously crashed the matching loop with a TypeError.
            log('Interpark data not available for airline {}'.format(airline), logging.WARNING)
            for p, sch in enumerate(target_list):
                target_list[p] = sch + ['', 0]
            continue
        # Departure times do not change per ZE row, so collect them once.
        ip_dpt_times = [ip_row[5] for ip_row in ip_schedule_list]
        for p, sch in enumerate(target_list):
            i, _gap = select_row_by_minute(sch[4], ip_dpt_times)
            matched = ip_schedule_list[i]
            target_list[p] = sch + [matched[0]+'-'+matched[4]+'-'+matched[5], matched[6]]
    return target_list

## 템플릿 이용 HTML 파일 작성
def make_daily_report_html(column_list,data_list):
    """Render the report into an HTML file based on the report template.

    The template ``template/daily_fare_report.html`` is read, its
    ``{title}``, ``{columns}`` and ``{grid_datas}`` placeholders are filled
    in, newlines are removed, and the result is written to a timestamped
    file under ``html/``.

    Args:
        column_list: Column headers, rendered by ``make_grid_column``.
        data_list: Report rows, rendered by ``make_grid_data``.

    Returns:
        The path of the HTML file that was written.
    """
    # Import the class locally: the file does ``import datetime`` (the
    # module), on which ``datetime.today()`` raises AttributeError unless a
    # star import happens to rebind the name to the class.
    from datetime import datetime

    title = 'daily report'
    column_html = make_grid_column(column_list)
    grid_html = make_grid_data(data_list)
    with open('template/daily_fare_report.html',encoding='utf-8') as fp:
        template = fp.read()
    # Fill in the template placeholders and drop newlines.
    html = template.replace('{title}',title).replace('{columns}',column_html).replace('{grid_datas}',grid_html).replace('\n','')
    # Save under a name that includes the current timestamp down to microseconds.
    html_file = 'html/daily_fare_report_{}.html'.format(datetime.today().strftime('%Y%m%d%H%M%S%f'))
    with open(html_file,'wt',encoding='utf-8') as fp:
        fp.write(html)
    return html_file
    

## Grid 데이터 작성
def make_grid_data(data_list):
    """Render report rows as HTML table rows.

    Args:
        data_list: Iterable of rows; each row is an iterable of cell values.

    Returns:
        One ``<tr>`` per row containing one ``<td>`` per cell, concatenated
        without separators. Cell text is HTML-escaped (``&``, ``<``, ``>``)
        because the values originate from scraped third-party pages.
    """
    from html import escape

    tr_html = '<tr>{}</tr>'
    td_html = '<td>{}</td>'
    rows = []
    for row in data_list:
        cells = ''.join(td_html.format(escape('{}'.format(cell), quote=False)) for cell in row)
        rows.append(tr_html.format(cells))
    return ''.join(rows)
def make_grid_column(column_list):
    """Render the column headers as a single HTML table row.

    Args:
        column_list: Iterable of header labels.

    Returns:
        One ``<tr>`` containing one ``<th>`` per label. Label text is
        HTML-escaped (``&``, ``<``, ``>``) so a label can never break the
        table markup.
    """
    from html import escape

    tr_html = '<tr>{}</tr>'
    th_html = '<th>{}</th>'
    headers = ''.join(th_html.format(escape('{}'.format(label), quote=False)) for label in column_list)
    return tr_html.format(headers)

In [143]:
# Search conditions for the report: route and departure date (YYYYMMDD).
dpt = 'GMP'
arr = 'CJU'
dpt_date = '20170620'
# Competitor airline codes to compare against.
airlines = ['7C','TW','LJ']
# The fixed ZE columns come first, then an info/fare column pair per competitor.
columns = ['date','dpt','arr','flt','dpt_time','CFG','BKG','R/F','fare']
for acode in airlines:
    columns.extend([acode+'_info', acode+'_fare'])

result_list = make_daily_dom_fare_report(dpt,arr,dpt_date,airlines)

2017-06-16 13:36:55,258 root     DEBUG    Crawling eastarjet homepage schedule site


Start Json Payload crawling :  https://www.eastarjet.com/json/dataService
>> Parameters
method:DataService.service , id:2 , params:[{'panelId': None, 'requestExecuteType': 'BIZ', 'methodType': None, 'inParameters': {'javaClass': 'java.util.List', 'list': [{'javaClass': 'com.jein.framework.connectivity.parameter.InParameter', 'paramName': 'flightSearch', 'data': {'javaClass': 'java.util.List', 'list': [{'map': {'flightSearch': '{"viewType":"","fly_type":"2","person1":"1","person2":"0","person3":"0",    "residentCountry":"KR","currency":"","promotion_cd":"",    "flySection":[{"departure_cd":"GMP","arrival_cd":"CJU","departure_date_cd":"20170620"}]}'}, 'javaClass': 'java.util.Map'}]}, 'ioType': 'IN', 'structureType': 'FIELD'}]}, 'requestUniqueCode': 'PGWBA00002', 'sourceName': None, 'javaClass': 'com.jein.framework.connectivity.parameter.RequestParameter', 'functionName': 'DTWBA00022', 'sourceExtension': None, 'DBTransaction': False, 'filterParameter': {'javaClass': 'java.util.Map', 'map'

2017-06-16 13:36:58,764 root     DEBUG    start scraping Eastarjet crawled data
2017-06-16 13:36:58,817 root     DEBUG    end scraping Eastarjet crawled data
2017-06-16 13:36:58,822 root     DEBUG    Crawling eastarjet homepage realtime schedule info


End Json Payload crawling
Start Json Payload crawling :  https://www.eastarjet.com/json/dataService
>> Parameters
method:DataService.service , id:1 , params:[{'panelId': None, 'requestExecuteType': 'BIZ', 'methodType': None, 'inParameters': {'javaClass': 'java.util.List', 'list': [{'javaClass': 'com.jein.framework.connectivity.parameter.InParameter', 'paramName': 'flightSchedule', 'data': {'javaClass': 'java.util.List', 'list': [{'map': {'flightSchedule': '{"departure3": ["\\"\\""], "endDate": "", "arrival0": ["\\"CJU\\""], "flyType": "2", "flyCount": "1", "beginDate0": "2017-06-20", "isCheck": "Daily", "beginDate1": "", "arrival3": ["\\"\\""], "departure0": ["\\"GMP\\""], "arrival1": ["\\"\\""], "beginDate2": "", "departure2": ["\\"\\""], "departure1": ["\\"\\""], "beginDate3": "", "arrival2": ["\\"\\""]}'}, 'javaClass': 'java.util.Map'}]}, 'ioType': 'IN', 'structureType': 'FIELD'}]}, 'requestUniqueCode': 'PGWIB00002', 'sourceName': None, 'javaClass': 'com.jein.framework.connectivity.

2017-06-16 13:37:01,714 root     DEBUG    Crawling Interpark domastic schedule site


End Json Payload crawling
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
tripDivi:0 , inf:0 , depDate:20170620 , airlineCode:7C , arr:CJU , format:json , dep:GMP , chd:0 , adt:1 , 


2017-06-16 13:37:04,083 root     DEBUG    start scraping Interpark crawled data
2017-06-16 13:37:04,096 root     DEBUG    end scraping Interpark crawled data
2017-06-16 13:37:04,100 root     DEBUG    Crawling Interpark domastic schedule site


End Simple crawling
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
tripDivi:0 , inf:0 , depDate:20170620 , airlineCode:TW , arr:CJU , format:json , dep:GMP , chd:0 , adt:1 , 


2017-06-16 13:37:04,746 root     DEBUG    start scraping Interpark crawled data
2017-06-16 13:37:04,761 root     DEBUG    end scraping Interpark crawled data
2017-06-16 13:37:04,764 root     DEBUG    Crawling Interpark domastic schedule site


End Simple crawling
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
tripDivi:0 , inf:0 , depDate:20170620 , airlineCode:LJ , arr:CJU , format:json , dep:GMP , chd:0 , adt:1 , 


2017-06-16 13:37:05,033 root     DEBUG    start scraping Interpark crawled data
2017-06-16 13:37:05,042 root     DEBUG    end scraping Interpark crawled data


End Simple crawling


In [144]:
# Show the report rows as a DataFrame labelled with the `columns` list.
pd.DataFrame(result_list,columns=columns)

Unnamed: 0,date,dpt,arr,flt,dpt_time,CFG,BKG,R/F,fare,7C_info,7C_fare,TW_info,TW_fare,LJ_info,LJ_fare
0,20170620,GMP,CJU,ZE251,610,144,189,0,34900,7C-151-0625,31900,TW-741-0640,36900,LJ-0301-0615,65600
1,20170620,GMP,CJU,ZE201,615,145,189,0,31900,7C-151-0625,31900,TW-741-0640,36900,LJ-0301-0615,65600
2,20170620,GMP,CJU,ZE281,620,138,183,0,31900,7C-151-0625,31900,TW-741-0640,36900,LJ-0303-0620,65600
3,20170620,GMP,CJU,ZE203,650,121,149,0,65500,7C-103-0700,41900,TW-741-0640,36900,LJ-0303-0620,65600
4,20170620,GMP,CJU,ZE205,810,118,149,0,50900,7C-105-0750,51900,TW-703-0815,49900,LJ-0305-0800,65600
5,20170620,GMP,CJU,ZE207,950,142,189,0,50900,7C-153-1025,46900,TW-705-0945,46900,LJ-0309-0950,65600
6,20170620,GMP,CJU,ZE259,1005,132,189,0,65500,7C-153-1025,46900,TW-705-0945,46900,LJ-0309-0950,65600
7,20170620,GMP,CJU,ZE209,1035,145,183,0,46900,7C-153-1025,46900,TW-707-1030,46900,LJ-0311-1025,65600
8,20170620,GMP,CJU,ZE211,1120,140,189,0,42900,7C-113-1120,41900,TW-709-1145,39900,LJ-0313-1055,65600
9,20170620,GMP,CJU,ZE213,1150,119,149,0,46900,7C-113-1120,41900,TW-709-1145,39900,LJ-0313-1055,65600


In [None]:
# Write the report to an HTML file; the call returns the generated file path.
make_daily_report_html(columns,result_list)