## 웹투어 국제 노선 가격 정보 읽어오기
- 검색 기간 : 실시일로부터 1~30일, 31~45일, 46~90일 정보 주기적으로 읽어오기

### 웹투어 사이트 웹 크롤링
- 웹투어 국제선 실시간 항공예약 페이지를 이용한 크롤링
- URL : http://airfnt.webtour.com/aa/aalist.asp
- POST 방식, JSON 리턴
- 필요 데이터를 추출하여 Pandas의 DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import time
from common.crawling_util import simple_crawling
from common.parsing_util import stat_fare

## 국제선용 읽어오기
def crawling_WebTour(dpt, arr, dpt_date):
    """Request one day's one-way fare listing from the WebTour JSON endpoint.

    Builds the ``mk_Params`` query string for a one-adult, one-way
    (``TRIP=OW``) search and POSTs it, together with the upstream
    ``fnv_Url``, through ``simple_crawling``.

    Args:
        dpt: Departure city code, sent as ``SCITY1``.
        arr: Arrival city code, sent as ``ECITY1``.
        dpt_date: Departure date string, sent as ``SDATE1``.

    Returns:
        Whatever ``simple_crawling(..., method='post', json=True)`` returns.
    """
    print('Crawling WebTour domastic schedule site')
    endpoint = "http://airfnt.webtour.com/aa/include/list/aaList__json.asp"
    # Only the date and the two city codes vary; every other search option is fixed.
    query = ''.join([
        'SDATE1=', dpt_date,
        '&SCITY1=', dpt,
        '&ECITY1=', arr,
        '&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW',
        '&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y',
        '&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y',
    ])
    form = {
        'fnv_Url': 'http://airapicj.webtour.com/Avail/Avail.aspx',
        'mk_Params': query,
    }
    # NOTE(review): the original author remarked that the response causes a JSON
    # format error unless its leading and trailing characters are excluded;
    # presumably simple_crawling(json=True) takes care of that - confirm.
    return simple_crawling(endpoint, form, method='post', json=True)

# Fetch the raw JSON responses for a range of departure dates counted from today.
## Airline filtering is not done here; the airline code list is passed to make_raw_data_to_list.
def crawling_WebTour_from_to(dpt, arr, start=0, end=30, delay=2):
    """Crawl one raw response per departure date in a window relative to today.

    Args:
        dpt: Departure city code forwarded to ``crawling_WebTour``.
        arr: Arrival city code forwarded to ``crawling_WebTour``.
        start: First day offset from today (inclusive).
        end: Last day offset from today (exclusive).
        delay: Seconds to sleep after each request; must be non-negative.
            Defaults to the 2 seconds that used to be hard-coded.

    Returns:
        List of the non-``None`` results of ``crawling_WebTour``, in date
        order. Dates whose request raised are reported on stdout and skipped.
    """
    # Take "today" once so a run that crosses midnight cannot shift the window
    # halfway through and request a date twice or skip one.
    base_day = datetime.today()
    date_range = [(base_day + timedelta(days=offset)).strftime('%Y%m%d')
                  for offset in range(start, end)]

    raw_data_list = []
    for day in date_range:
        try:
            raw_data = crawling_WebTour(dpt, arr, day)
        except Exception as e:
            # Deliberately best-effort: one failed day must not abort the crawl.
            print('****** Error occured : ', e)
        else:
            if raw_data is not None:
                raw_data_list.append(raw_data)
        time.sleep(delay)  # throttle between requests
    print('Crawling Total Result : ', len(raw_data_list))
    return raw_data_list

## 국제선용 데이터 파싱, 유틸 사용이 어려운 구조로 별도 작성
## loop_field = ['Responses']['GoodsList']['Goods']
## airlines 파라미터는 반드시 리스트 형태로
def _as_list(node):
    """Wrap a lone mapping in a list; return anything else unchanged."""
    # The response holds a bare dict instead of a one-element list when there
    # is exactly one record. isinstance also accepts dict subclasses such as
    # OrderedDict, which a `type(node) == dict` comparison would miss.
    return [node] if isinstance(node, dict) else node


def make_raw_data_to_list(raw_data, airlines):
    """Flatten one raw WebTour response into fare rows grouped by airline.

    Walks ``Responses -> GoodsList -> Goods -> AirAvail -> AvailList1 ->
    AirItn -> SegDetail`` and emits one row per segment whose ``AirV`` code
    is in ``airlines``.

    Args:
        raw_data: Parsed response (nested dicts/lists) for a single date.
        airlines: Container of airline codes to keep.

    Returns:
        ``None`` when ``GoodsList`` is a string (the no-data case). Otherwise
        a list with one entry per matched airline, in first-seen order; each
        entry is a list of rows ``[airline, SDate1, FltNum, StartDtTm[8:],
        EndDtTm[8:], SaleFare, Tax, Qcharge]``.

    Raises:
        KeyError: If an expected key is missing from the response.
    """
    goods_list = raw_data['Responses']['GoodsList']
    if isinstance(goods_list, str):  # a string here means there is no data
        return None

    rows_by_airline = {}
    for fare_set in _as_list(goods_list['Goods']):
        itineraries = _as_list(fare_set['AirAvail']['AvailList1']['AirItn'])
        for air_itn in itineraries:
            for seg in _as_list(air_itn['SegDetail']):
                code = seg['AirV']
                if code not in airlines:
                    continue
                # Fare fields live on the goods record, flight fields on the
                # segment; [8:] drops the first 8 characters of the timestamps.
                rows_by_airline.setdefault(code, []).append([
                    code, fare_set['SDate1'], seg['FltNum'],
                    seg['StartDtTm'][8:], seg['EndDtTm'][8:],
                    fare_set['SaleFare'], fare_set['Tax'], fare_set['Qcharge'],
                ])
    return list(rows_by_airline.values())

data_header = ['airline','date','flt','dpt','arr','fare','tax1','tax2']
## Convert the raw responses collected for a date range into one DataFrame and save it
def make_raw_data_list_to_df(airlines, dpt, arr, start, end, raw_data_list):
    '''
    Build a fare table from a list of raw responses and write it to Excel.

    1. Loop over the raw responses.
    2. For each airline table, append a summary row: the 'flt'/'dpt'/'arr'
       cells hold the labels 'min'/'max'/'mean' and the 'fare'/'tax1'/'tax2'
       cells hold str(stat[0]), str(stat[1]), str(stat[2]) from stat_fare.
    3. Concatenate all tables.
    4. Write the result to 'excel/WebTour_<dpt>_<arr>_<start>_<end>_<timestamp>.xls'.
    5. Return the DataFrame.

    Args:
        airlines: Airline codes to keep (forwarded to make_raw_data_to_list).
        dpt, arr, start, end: Used only to build the output file name.
        raw_data_list: Raw responses, one per crawled date.

    Returns:
        The concatenated DataFrame. When no rows were found at all, an empty
        DataFrame with the data_header columns is returned and no file is written.
    '''
    df_list = []
    for pos, raw_data in enumerate(raw_data_list):
        data_list = make_raw_data_to_list(raw_data, airlines)
        if not data_list:
            # None means the response carried no data; [] means no requested
            # airline appeared. Iterating None used to raise TypeError.
            print('********** Data Not Found! pos - {}**********'.format(pos))
            continue
        for dlist in data_list:
            df = pd.DataFrame(dlist, columns=data_header)
            if len(df) == 0:  # nothing read, e.g. because of delays upstream
                print('********** Data Not Found! pos - {}**********'.format(pos))
                continue
            df = df.fillna(0)
            stat = stat_fare(df, columns=['fare'])
            # .loc/.iloc replace the removed DataFrame.ix accessor. The frame has
            # the default 0..n-1 index, so label len(df) appends a new last row.
            df.loc[len(df)] = [df.iloc[0, 0], df.iloc[0, 1], 'min', 'max', 'mean',
                               str(stat[0]), str(stat[1]), str(stat[2])]
            df_list.append(df)

    if not df_list:
        # pd.concat raises ValueError on an empty list; return an empty table instead.
        print('********** Data Not Found! Nothing to save **********')
        return pd.DataFrame(columns=data_header)

    result_df = pd.concat(df_list, ignore_index=True)
    # NOTE(review): the '.xls' target needs an xls-capable Excel writer in the
    # installed pandas, and the 'excel' directory must already exist - confirm.
    result_df.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','WebTour',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result_df

In [86]:
## Fetch one day of international fares and flatten them per airline
dpt = 'ICN'
arr = 'NRT'
dpt_date = '20170501'
airlines = ['7C']
raw_data = crawling_WebTour(dpt, arr, dpt_date)
make_raw_data_to_list(raw_data, airlines)

Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170501&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling


[[['7C', '20170501', '1104', '1505', '1730', '85000', '28000', '1200'],
  ['7C', '20170501', '1104', '1505', '1730', '110000', '28000', '1200'],
  ['7C', '20170501', '1162', '0700', '0950', '110000', '28000', '1200'],
  ['7C', '20170501', '1102', '0830', '1050', '120000', '28000', '1200'],
  ['7C', '20170501', '1106', '1035', '1255', '120000', '28000', '1200'],
  ['7C', '20170501', '1104', '1505', '1730', '120000', '28000', '1200'],
  ['7C', '20170501', '1162', '0700', '0950', '120000', '28000', '1200'],
  ['7C', '20170501', '1102', '0830', '1050', '130000', '28000', '1200'],
  ['7C', '20170501', '1106', '1035', '1255', '130000', '28000', '1200'],
  ['7C', '20170501', '1104', '1505', '1730', '130000', '28000', '1200'],
  ['7C', '20170501', '1162', '0700', '0950', '130000', '28000', '1200'],
  ['7C', '20170501', '1102', '0830', '1050', '150000', '28000', '1200'],
  ['7C', '20170501', '1106', '1035', '1255', '150000', '28000', '1200'],
  ['7C', '20170501', '1104', '1505', '1730', '150000

In [95]:
## Fetch a date range of international fares and build the DataFrame
dpt = 'ICN'
arr = 'NRT'
airlines = ['7C', 'TW', 'ZE', 'LJ', 'BX']
# Day offsets from today; end is exclusive, so this covers offsets 1 and 2.
start = 1
end = 3
raw_data_list = crawling_WebTour_from_to(dpt, arr, start, end)
make_raw_data_list_to_df(airlines, dpt, arr, start, end, raw_data_list)

Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170426&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170427&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling Total Result :  2


Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,7C,20170426,1102,0830,1050,75000,28000,1200
1,7C,20170426,1106,1035,1255,75000,28000,1200
2,7C,20170426,1104,1505,1730,75000,28000,1200
3,7C,20170426,1162,0700,0950,75000,28000,1200
4,7C,20170426,1102,0830,1050,85000,28000,1200
5,7C,20170426,1106,1035,1255,85000,28000,1200
6,7C,20170426,1104,1505,1730,85000,28000,1200
7,7C,20170426,1162,0700,0950,85000,28000,1200
8,7C,20170426,1102,0830,1050,110000,28000,1200
9,7C,20170426,1106,1035,1255,110000,28000,1200


In [97]:
# Day offsets 31-45 from today (end is exclusive); dpt, arr and airlines
# are the values assigned in an earlier cell.
start = 31
end = 46
raw_data_list = crawling_WebTour_from_to(dpt, arr, start, end)
make_raw_data_list_to_df(airlines, dpt, arr, start, end, raw_data_list)

Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170526&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170527&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__jso

Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,TW,20170526,0201,0745,1015,90000,28000,1300
1,TW,20170526,0201,0745,1015,100000,28000,1300
2,TW,20170526,0201,0745,1015,110000,28000,1300
3,TW,20170526,0201,0745,1015,120000,28000,1300
4,TW,20170526,0201,0745,1015,140000,28000,1300
5,TW,20170526,0201,0745,1015,160000,28000,1300
6,TW,20170526,0201,0745,1015,190000,28000,1300
7,TW,20170526,0201,0745,1015,230000,28000,1300
8,TW,20170526,0201,0745,1015,280000,28000,1300
9,TW,20170526,min,max,mean,90000.0,280000.0,157777.777778


In [98]:
# Day offsets 46-90 from today (end is exclusive); dpt, arr and airlines
# are the values assigned in an earlier cell.
start = 46
end = 91
raw_data_list = crawling_WebTour_from_to(dpt, arr, start, end)
make_raw_data_list_to_df(airlines, dpt, arr, start, end, raw_data_list)

Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170610&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__json.asp
>> Parameters
mk_Params:SDATE1=20170611&SCITY1=ICN&ECITY1=NRT&Adult=1&Child=0&Infant=0&FareType=Y&StayLength=&TRIP=OW&Agent=WWW&PageNo=1&SplitNo=1000&MoreKey=&Schedule=Y&SeatStatus=Y&SeatType=W&SeatClosed=N&Soto=N&Summary=Y&Best=Y&RTInd=Y , fnv_Url:http://airapicj.webtour.com/Avail/Avail.aspx , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://airfnt.webtour.com/aa/include/list/aaList__jso

Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,TW,20170610,0203,1115,1325,90000,28000,1300
1,TW,20170610,0203,1115,1325,100000,28000,1300
2,TW,20170610,0201,0745,1015,100000,28000,1300
3,TW,20170610,0203,1115,1325,110000,28000,1300
4,TW,20170610,0201,0745,1015,110000,28000,1300
5,TW,20170610,0203,1115,1325,120000,28000,1300
6,TW,20170610,0201,0745,1015,120000,28000,1300
7,TW,20170610,0203,1115,1325,140000,28000,1300
8,TW,20170610,0201,0745,1015,140000,28000,1300
9,TW,20170610,0203,1115,1325,160000,28000,1300
