## 웹투어 국내 노선 가격 정보 읽어오기
- 검색 기간 : 실시일로부터 1~30일, 31~45일, 46~90일 정보 주기적으로 읽어오기

### 웹투어 사이트 웹 크롤링
    - 웹투어 국내선 실시간항공예약 페이지를 이용한 크롤링
    - URL : http://www.webtour.com/DA/da__ajx.asp
    - get 방식, HTML 리턴
    - 필요 데이터 추출 하여 Pandas 의 DataFrame 형태로 생성
    - 생성된 데이터 엑셀 파일로 저장

In [105]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
from common.crawling_util import simple_crawling
from common.parsing_util import stat_fare

## 국내선용 읽어오기
def crawling_WebTour(airline,dpt,arr,dpt_date):
    '''
    Request the WebTour domestic availability list for one airline and one day.

    airline  : carrier code sent as `whereCarCode`.
    dpt, arr : departure / arrival city codes (`whereDepCity` / `whereArrCity`).
    dpt_date : date string sent as both `whereSDate` and `whereEDate`, so the
               search covers a single day.

    Returns whatever `simple_crawling` returns for a GET request with
    `json=False` (presumably the HTML text, or None on failure -- confirm
    against common.crawling_util).
    '''
    print('Crawling WebTour domastic schedule site')
    endpoint = 'http://www.webtour.com/DA/da__ajx.asp'
    # Route, date and carrier come from the caller ...
    query = dict(
        whereDepCity=dpt,
        whereArrCity=arr,
        whereSDate=dpt_date,
        whereEDate=dpt_date,
        whereCarCode=airline,
    )
    # ... everything else is fixed: one-way, all seat classes, one adult.
    query.update([
        ('cmd', 'Ajx_DAir_SetAvaliList_ALL'),
        ('whereTarget', 'daRTBx'),
        ('whereDirection', ''),
        ('whereItinerary', 'oneway'),
        ('whereSeatClass', 'All'),
        ('whereACnt', '1'),
        ('whereCCnt', '0'),
        ('whereICnt', '0'),
    ])
    response = simple_crawling(endpoint,query,method='get',json=False)
    return response

## 오늘기준으로 기간의 데이터를 HTML 형태로 raw data dict를 가져온다.
## dict의 키는 (airline code,date)
## airlines는 리스트 형태로 원하는 항공사 코드 정보를 전달한다.
def crawling_WebTour_from_to(airlines,dpt,arr,start=0,end=30,delay=2):
    '''
    Crawl every airline in `airlines` for each day offset in [start, end).

    airlines   : iterable of airline codes; each one is passed to
                 crawling_WebTour together with every date.
    dpt, arr   : departure / arrival city codes forwarded to crawling_WebTour.
    start, end : day offsets counted from today; the range is half-open, so
                 the defaults cover offsets 0..29.
    delay      : seconds to sleep after all airlines of one date have been
                 requested (default 2, the previously hard-coded value).

    Returns a dict keyed by (airline code, 'YYYYMMDD') whose values are the
    raw responses. Requests that raise are reported on stdout and skipped;
    responses that are None are skipped silently.
    '''
    # Take a single snapshot of "today" so every offset is relative to the
    # same day, even if the list is built right around midnight.
    base_day = datetime.today()
    date_range = [(base_day+timedelta(days=i)).strftime('%Y%m%d') for i in range(start,end)]
    raw_data_dict = {}  # (airline code, date) -> raw response
    for d in date_range:
        for acd in airlines:
            try:
                raw_data = crawling_WebTour(acd,dpt,arr,d)
            except Exception as e:
                # Best effort: one failed request must not abort the whole run.
                print('****** Error occured : ',e)
            else:
                if raw_data is not None:
                    raw_data_dict[(acd,d)]=raw_data
        time.sleep(delay)  # pause between dates to go easy on the site
    print('Crawling Total Result : ', len(raw_data_dict))
    return raw_data_dict

    
## 국내선용 데이터 파싱, HTML 데이터 파싱 처리
def make_raw_data_to_list(raw_data):
    '''
    Parse one raw HTML response into a list of text rows.

    Every element matching `.daLeftBxUlAll` yields one row holding the text
    of the first match of each selector below, in this order:
    flight, time, seat class, fare, seat count.

    Raises IndexError if a row element lacks a match for any selector.
    '''
    cell_selectors = ('.flight em', '.time em', '.seat', '.fare em', '.num em')
    document = BeautifulSoup(raw_data,'lxml')
    return [
        [row.select(css)[0].text for css in cell_selectors]
        for row in document.select('.daLeftBxUlAll')
    ]


## 하루치 raw 데이터를 리스트로 변경한 데이터를 DataFrame으로 변경
def make_raw_list_to_df(airline,dpt_date,raw_list):
    '''
    Turn one day's parsed fare rows of one airline into a per-flight DataFrame.

    airline, dpt_date : values written into every output row.
    raw_list          : rows as produced by make_raw_data_to_list, i.e.
                        [flight, time, seat class, fare, seat count] texts.

    Steps
    1. Normalise each raw row (strip ':' from the times, ',' and the trailing
       character from the fare).
    2. Make sure the seat classes '특가석' and '할인석' are present by adding a
       zero-valued placeholder row when one is missing, so the pivot below
       always produces the same columns.
    3. Pivot the seat-class rows of each flight into one row
       (fare1/fare2/fare3 + seat) and keep flights whose regular-class seat
       count is not '0'.
    4. If anything is left, append one statistics row built from stat_fare.

    Returns a DataFrame with columns
    airline, dpt_date, flt, dpt, arr, fare1, fare2, fare3, seat.
    An empty `raw_list` yields an empty DataFrame with those columns.
    '''
    data_header = ['airline','dpt_date','flt','dpt','arr','type','fare','seat']
    result_header = ['airline','dpt_date','flt','dpt','arr','fare1','fare2','fare3','seat']
    if not raw_list:
        # Nothing to pivot; callers treat a zero-length frame as "no data".
        return pd.DataFrame(columns=result_header)

    # assumes the time text is 'HH:MM' + one separator char + 'HH:MM' and the
    # fare text ends with a one-character suffix -- TODO confirm against the site
    result_list = [
        [airline,dpt_date,r_list[0],
         r_list[1][:5].replace(':',''),r_list[1][6:].replace(':',''),
         r_list[2],r_list[3][:-1].replace(',',''),r_list[4]]
        for r_list in raw_list
    ]

    # Add placeholder rows for missing seat classes. They reuse the key
    # columns of the last row and carry '0' for fare and seat. Building them
    # here, before the DataFrame exists, avoids the removed `.ix` indexer and
    # the chained assignment that could silently write to a copy.
    present_types = {row[5] for row in result_list}
    for seat_type in ('특가석','할인석'):
        if seat_type not in present_types:
            result_list.append(result_list[-1][:5] + [seat_type,'0','0'])

    df = pd.DataFrame(result_list,columns=data_header)

    # One row per (flight, seat class) -> one row per flight.
    # Each (flight, seat class) pair is expected to occur once, so sum() on
    # the text columns just passes the value through.
    df_group = df.groupby(data_header[:6]).sum()
    df_group = df_group.unstack(fill_value='0')
    # After unstack the seat classes are sorted as 일반석, 특가석, 할인석, hence
    # fare3/fare1/fare2. NOTE(review): any additional seat class in the data
    # would make this assignment fail with a length mismatch.
    df_group.columns = ['fare3','fare1','fare2','seat','seat1','seat2']
    # Keep only flights that still have regular-class seats (needs checking,
    # as the original author noted).
    df_group = df_group[df_group['seat']!='0']
    result = df_group.reset_index()[result_header].copy()
    if len(result) > 0:
        # Append the fare statistics line; stat_fare presumably returns
        # (min, max, mean) over the three fare columns -- confirm in
        # common.parsing_util.
        fare_stat = stat_fare(result,columns=['fare1','fare2','fare3'])
        result.loc[len(result)] = [airline,dpt_date,'min','max','mean',
                                   str(fare_stat[0]),str(fare_stat[1]),str(fare_stat[2]),'']
    return result

## 기간 raw 데이터 리스트를 DataFrame 으로 변경 및 저장 처리
def make_raw_data_dict_to_df(dpt,arr,start,end,raw_data_dict):
    '''
    Convert a whole crawl result into one DataFrame and save it as Excel.

    dpt, arr, start, end : only used to build the output file name.
    raw_data_dict        : {(airline code, date): raw HTML} as returned by
                           crawling_WebTour_from_to.

    Each entry is parsed with make_raw_data_to_list and converted with
    make_raw_list_to_df; entries that yield no rows are reported on stdout
    and skipped. The remaining frames are concatenated, written to
    'excel/WebTour_<dpt>_<arr>_<start>_<end>_<YYYYmmddHHMM>.xls' and returned.

    If no entry yields any rows, an empty DataFrame is returned and no file
    is written.
    '''
    df_list = []
    for k,raw_data in raw_data_dict.items():
        raw_list = make_raw_data_to_list(raw_data)
        if len(raw_list) == 0:  # nothing parsed, e.g. because of a delayed or empty response
            print('********** Data Not Found! pos - {}**********'.format(k))
            continue
        raw_df = make_raw_list_to_df(*k,raw_list)
        if len(raw_df) == 0:  # parsed, but no usable flight row was left
            print('********** Data Not Found! pos - {}**********'.format(k))
            continue
        df_list.append(raw_df)
    if not df_list:
        # pd.concat raises ValueError on an empty list; report and return an
        # empty frame instead of crashing after a long crawl.
        print('********** No data to save **********')
        return pd.DataFrame()
    result_df = pd.concat(df_list,ignore_index=True)
    # NOTE(review): the legacy '.xls' format needs an Excel writer engine that
    # still supports it; recent pandas releases may not -- confirm, or switch
    # the extension to '.xlsx'. The 'excel' directory must already exist.
    result_df.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','WebTour',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result_df

In [106]:
## Fetch one day of domestic data for a single airline and show the last rows.
## NOTE: `airlines` holds one airline code string here, not a list.
dpt, arr, dpt_date, airlines = 'GMP','CJU','20170705','7C'
raw_data = crawling_WebTour(airlines,dpt,arr,dpt_date)
raw_list= make_raw_data_to_list(raw_data)
df=make_raw_list_to_df(airlines,dpt_date,raw_list)
df.tail()

Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:7C , whereSDate:20170705 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170705 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling


Unnamed: 0,airline,dpt_date,flt,dpt,arr,fare1,fare2,fare3,seat
16,7C,20170705,7C145,1645,1755,26100.0,0.0,71800.0,9.0
17,7C,20170705,7C147,2025,2135,26100.0,0.0,71800.0,9.0
18,7C,20170705,7C151,0625,0730,0.0,35100.0,71800.0,9.0
19,7C,20170705,7C155,1525,1640,0.0,29100.0,71800.0,9.0
20,7C,20170705,min,max,mean,26100.0,71800.0,42400.0,


In [91]:
## Fetch domestic (GMP -> CJU) data for a period: day offsets 1..30 from today
## (`end` is exclusive), for five airlines, then build and save the DataFrame.
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start, end = 1,31
raw_data_dict = crawling_WebTour_from_to(airlines,dpt,arr,start=start,end=end)

df = make_raw_data_dict_to_df(dpt,arr,start,end,raw_data_dict)
df.head()

Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:7C , whereSDate:20170426 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170426 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:TW , whereSDate:20170426 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170426 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whe

Unnamed: 0,airline,dpt_date,flt,dpt,arr,fare1,fare2,fare3,seat
0,ZE,20170524,ZE201,615,725,38100,0,71700,9
1,ZE,20170524,ZE203,650,755,62100,0,71700,9
2,ZE,20170524,ZE215,1255,1405,67100,0,71700,9
3,ZE,20170524,ZE217,1435,1545,38100,0,71700,9
4,ZE,20170524,ZE219,1540,1650,32100,0,71700,9


In [94]:
## Fetch domestic (GMP -> CJU) data for a period: day offsets 31..44 from today
## (`end` is exclusive), for five airlines, then build and save the DataFrame.
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start, end = 31,45
raw_data_dict = crawling_WebTour_from_to(airlines,dpt,arr,start=start,end=end)

df = make_raw_data_dict_to_df(dpt,arr,start,end,raw_data_dict)
df.head()

Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:7C , whereSDate:20170526 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170526 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:TW , whereSDate:20170526 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170526 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whe

Unnamed: 0,airline,dpt_date,flt,dpt,arr,fare1,fare2,fare3,seat
0,7C,20170605,7C101,630,740,0,0,103900,9
1,7C,20170605,7C103,700,810,0,0,103900,9
2,7C,20170605,7C105,750,900,0,0,103900,9
3,7C,20170605,7C107,900,1010,0,0,103900,9
4,7C,20170605,7C113,1120,1230,0,0,103900,9


In [99]:
## Fetch domestic (GMP -> CJU) data for a period: day offsets 45..89 from today
## (`end` is exclusive), for five airlines, then build and save the DataFrame.
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start, end = 45,90
raw_data_dict = crawling_WebTour_from_to(airlines,dpt,arr,start=start,end=end)

df = make_raw_data_dict_to_df(dpt,arr,start,end,raw_data_dict)
df.head()

Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:7C , whereSDate:20170609 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170609 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whereTarget:daRTBx , whereCarCode:TW , whereSDate:20170609 , whereArrCity:CJU , whereDepCity:GMP , whereEDate:20170609 , whereCCnt:0 , whereACnt:1 , whereSeatClass:All , 
End Simple crawling
Crawling WebTour domastic schedule site
Start Simple crawling :  http://www.webtour.com/DA/da__ajx.asp
>> Parameters
whereDirection: , whereItinerary:oneway , whereICnt:0 , cmd:Ajx_DAir_SetAvaliList_ALL , whe

Unnamed: 0,airline,dpt_date,flt,dpt,arr,fare1,fare2,fare3,seat
0,LJ,20170628,LJ0301,615,720,37700,0,71800,9
1,LJ,20170628,LJ0303,625,730,37700,0,71800,9
2,LJ,20170628,LJ0305,800,910,58700,0,71800,9
3,LJ,20170628,LJ0307,845,955,62000,0,71800,9
4,LJ,20170628,LJ0311,1025,1135,58700,0,71800,9


In [98]:
## Preview the current time rendered as 'YYYYmmddHHMM'.
datetime.today().strftime('%Y%m%d%H%M')

'201704251124'