## 김포 제주 노선 가격 정보 읽어오기
- 대상 사이트 : 아시아나항공 국내선 예매 사이트

### 아시아나항공 사이트 웹 크롤링
- 아시아나항공 국내선 항공권 예매 페이지를 이용한 크롤링
- 사이트에 접속해 데이터를 크롤링한 뒤 BeautifulSoup으로 파싱
- 필요한 데이터를 추출하여 Pandas DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [14]:
from IPython.display import display
import requests
import pandas as pd
import numpy as np
from pandas import DataFrame
from bs4 import BeautifulSoup
import time
from datetime import datetime
from datetime import timedelta
from common.crawling_util import simple_crawling
from common.parsing_util import stat_fare
from common.util import find_between

def crawling_OZ_data(dpt,arr,dpt_date):
    """Request the Asiana fare-selection page for a one-way trip on one date.

    Assembles the form payload (``domIntType`` ``'D'``, ``tripType`` ``'OW'``,
    one adult) and passes it to ``simple_crawling`` as a POST with
    ``json=False``.

    Args:
        dpt: Departure airport code (the notebook uses ``'GMP'``).
        arr: Arrival airport code (the notebook uses ``'CJU'``).
        dpt_date: Departure date string; the notebook passes ``YYYYMMDD``.
            The same value is sent for every date field of the form.

    Returns:
        Whatever ``simple_crawling`` returns -- presumably the raw page text;
        confirm against ``common.crawling_util``.
    """
    print('Crawling asiana homepage schedule site')
    endpoint = "https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do"
    request_headers = {
        'Referer':'https://flyasiana.com/I/ko/RevenueDomesticTravelRegist.do',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    }
    ## Fields that are submitted empty, kept in their original payload order.
    form = dict.fromkeys([
        'hidPageType', 'departureAirportC',
        'arrivalAirportC', 'departureDateC',
        'arrivalDateC', 'officeId',
        'entType', 'corporateCode',
        'couponDesc', 'cServiceType',
        'couponCode', 'query',
    ], '')
    ## Populated fields. No 'sessionUniqueKey' is sent.
    form.update({
        'CallPage':'RevenueDomesticTravelRegist',
        'domIntType':'D',
        'tripType':'OW',
        'departureAirport':dpt,
        'arrivalAirport':arr,
        'openDepartureAirport1':dpt,
        'openArrivalAirport1':arr,
        ## The second open segment is the reverse direction.
        'openDepartureAirport2':arr,
        'openArrivalAirport2':dpt,
        'departureDate':dpt_date,
        'arrivalDate':dpt_date,
        '__strSDate':dpt_date,
        '__strEDate':dpt_date,
        'adultCount':'1',
        'childCount':'0',
        'infantCount':'0',
        ## NOTE(review): the year is hard-coded -- confirm whether the site cares.
        'ageCalYear':'2017',
        'ageCalMonth':'1',
        'ageCalDay':'0',
    })

    return simple_crawling(endpoint,form,head=request_headers,method='post',json=False)

## Captures the JSON payload embedded in the page's JavaScript.
## Presumed to share its format with the international-route page; still needs testing.
def get_json_data(raw_data):
    """Extract and decode the ``depFareFamilyWithAllAvail`` payload of a page.

    The text between ``var depFareFamilyWithAllAvail = eval('`` and ``');`` is
    cut out with ``find_between`` and decoded.

    SECURITY: this text comes from a remote server. It used to be passed to
    the built-in ``eval``, which would execute any expression the page
    contained. It is now decoded as data only: strict JSON first, then a
    Python-literal fallback that keeps the previous ``true``/``false``
    substitution for payloads that are not strict JSON.

    Args:
        raw_data: Page source as a string.

    Returns:
        ``{'json_data': decoded}`` where ``decoded`` is the parsed payload.

    Raises:
        SyntaxError, ValueError: If the captured text is neither valid JSON
            nor a plain Python literal.
    """
    import ast
    import json

    start_txt = "var depFareFamilyWithAllAvail = eval('"
    end_txt = "');"
    payload = find_between(raw_data,start_txt,end_txt)
    try:
        ## Handles true/false/null natively and leaves string values untouched.
        decoded = json.loads(payload)
    except ValueError:
        ## Not strict JSON: fall back to the legacy substitution, but evaluate
        ## literals only -- never arbitrary expressions.
        decoded = ast.literal_eval(payload.replace('true',"True").replace('false',"False"))
    return {'json_data': decoded}

def raw_json_to_df(raw_data):
    """Turn the page's embedded JSON payload into a sorted schedule DataFrame.

    For each entry of the decoded payload, only the first ``fareFamilyAmounts``
    item is read: its first ``paxTypeFareDatas`` element supplies the fare
    figures, and every option in its first ``availDataList`` element yields
    one output row.

    Args:
        raw_data: Page source as a string, passed on to ``get_json_data``.

    Returns:
        DataFrame with columns date, flt, dpt, arr, fare, tax1, tax2, seat.
        ``fare`` is ``amountWithoutTax``, ``tax1`` is ``fuelCharge`` and
        ``tax2`` is ``totalTax``. Rows are sorted as plain lists.
    """
    columns = ['date','flt','dpt','arr','fare','tax1','tax2','seat']
    rows = []
    for recommend in get_json_data(raw_data)['json_data']:
        ## Fare figures shared by every flight option of this entry.
        fare = recommend['fareFamilyAmounts'][0]['itineraryAvailDataOfRecommend']['paxTypeFareDatas'][0]
        options = recommend['fareFamilyAmounts'][0]['itineraryAvailDataOfRecommend']['availDataList'][0]
        for option in options:
            leg = option['flightInfoDatas'][0]
            rows.append([
                ## Flight: date is chars 0-7, times are chars 8-11 of the stamps.
                leg['departureDate'][:8],
                leg['flightNo'],
                leg['departureDate'][8:12],
                leg['arrivalDate'][8:12],
                ## Fare and taxes.
                fare['amountWithoutTax'],
                fare['fuelCharge'],
                fare['totalTax'],
                ## Seats: an empty count is reported as '0'.
                '0' if option['seatCount'] == '' else option['seatCount'],
            ])
    rows.sort()
    return pd.DataFrame(rows,columns=columns)

## Reads the domestic-route tax list (fuel surcharge / airport tax) from the fare page
def get_tax_info(timeout=10):
    """Scrape two tax figures from the Asiana domestic normal-fare page.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        List of strings taken from the 2nd and 3rd ``td`` cells matched under
        ``.common-table-3``, with thousands separators removed and the final
        character dropped (presumably a currency suffix -- confirm against
        the live page). Fewer than two items are returned if the page has
        fewer matching cells.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    url = 'http://flyasiana.com/CW/ko/domesticFare/domesticFareNormal.do'
    req = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(req.text,'lxml')
    ## The first matched cell is skipped; only cells 1 and 2 are used.
    tax_cells = soup.select('.common-table-3  tbody td')[1:3]
    return [td.text.replace(',','')[:-1] for td in tax_cells]

## Column layouts used by the domestic-route HTML parser.
## parsing_heads: the order in which scraped values are appended to each row.
parsing_heads = [
    'date',
    'dpt', 'arr', 'flt',
    'fare1', 'fare2', 'fare3',
    'tax1', 'tax2',
    'seat',
]
## data_heads: the same columns in output order ('flt' placed before 'dpt'/'arr').
data_heads = [
    'date',
    'flt', 'dpt', 'arr',
    'fare1', 'fare2', 'fare3',
    'tax1', 'tax2',
    'seat',
]
## Builds a DataFrame from the raw fare-selection page
def raw_to_df(dpt_date,raw_data):
    """Parse the departure-flight table of a fare page into a DataFrame.

    Each ``tr`` under ``#tbDepartureFlightList tbody`` becomes one row:
    the given date, the whitespace-separated tokens of the ``.air_name`` text
    (with ``:`` and the arrow removed), one fare per element of class ``td``,
    the tax values from ``get_tax_info`` and finally a seat count.

    Args:
        dpt_date: Date string stored in the ``date`` column of every row.
        raw_data: HTML source of the fare-selection page.

    Returns:
        DataFrame built with ``parsing_heads`` and reordered to ``data_heads``.

    NOTE(review): ``seat`` is whatever the last fare cell of the row set, and
    it is not reset between rows -- confirm this is the intended seat figure.
    """
    page = BeautifulSoup(raw_data,'lxml')
    records = []
    ## Tax figures are fetched from the separate fare-information page.
    taxes = get_tax_info()
    for tr in page.select('#tbDepartureFlightList tbody tr'):
        ## Times and flight number.
        schedule_text = tr.select('.air_name')[0].text
        for token in (':','→'):
            schedule_text = schedule_text.replace(token,'')
        record = [dpt_date] + schedule_text.split()
        ## One fare per fare cell; a cell with no seats contributes '0'.
        for cell in tr.select('.td'):
            seat = cell.select('.seat')[0]['seatcount']
            if seat == '0':
                record.append('0')
                continue
            label_text = cell.select('label')[0].text
            record.append(label_text.split()[0].replace(',',''))
        ## Taxes, then the seat count left over from the last fare cell.
        record += taxes
        record.append(seat)
        records.append(record)
    table = pd.DataFrame(records,columns=parsing_heads)
    return table[data_heads]

## Builds one day's fare/tax table with a trailing min / max / mean summary row
def read_OZ_1day_fare(dpt,arr,dpt_date):
    """Crawl and parse one day's fares, then append a summary row.

    Args:
        dpt: Departure airport code.
        arr: Arrival airport code.
        dpt_date: Departure date string (the notebook passes ``YYYYMMDD``).

    Returns:
        DataFrame with the ``data_heads`` columns, one row per flight plus a
        final summary row, or ``None`` when no row matches ``dpt_date``.
        The summary row carries the labels ``'min'``, ``'max'``, ``'mean'`` in
        the flt/dpt/arr columns and the first three values returned by
        ``stat_fare`` (as integer strings) in fare1/fare2/fare3.
    """
    ## Fetch and parse the page.
    raw_data = crawling_OZ_data(dpt,arr,dpt_date)
    df = raw_to_df(dpt_date,raw_data)
    ## Keep only the requested date. The index is renumbered from 0 so that
    ## len(df) is always the next free row label.
    df = df[df['date'] == dpt_date].reset_index(drop=True)[data_heads]
    if df.empty:
        print('********** No Data **********')
        return None
    ## Presumably (min, max, mean) over the fare columns -- see common.parsing_util.
    stat = stat_fare(df,columns=['fare1','fare2','fare3'])
    summary_row = [dpt_date,'min','max','mean',
                   str(int(stat[0])),str(int(stat[1])),str(int(stat[2])),
                   '','','']
    ## DataFrame.ix was removed in pandas 1.0; .loc with a new label appends the row.
    df.loc[len(df)] = summary_row
    return df

## Reads fares for a range of days counted from today and saves them to Excel
def read_OZ_date_range_fare(dpt,arr,start=0,end=31):
    """Collect daily fare tables for a span of dates and write them to a file.

    Dates are ``today + i days`` for ``i`` in ``range(start, end)``, so the
    defaults cover 31 days (today through today + 30). A day whose crawl
    raises is reported on stdout and skipped, so one bad day does not abort
    the run.

    Args:
        dpt: Departure airport code.
        arr: Arrival airport code.
        start: First day offset from today (inclusive).
        end: Last day offset from today (exclusive).

    Returns:
        The concatenated DataFrame of all days that produced data. When no
        day produced data, an empty DataFrame is returned and no file is
        written.

    NOTE(review): the output uses the legacy ``.xls`` extension. Recent
    pandas releases may no longer ship a writer for it -- confirm against the
    pandas version in use.
    """
    import os

    today = datetime.today()
    date_range = [(today + timedelta(days=i)).strftime('%Y%m%d') for i in range(start,end)]
    fare_list = []
    for d in date_range:
        try:
            fare_df = read_OZ_1day_fare(dpt,arr,d)
        except Exception as e:
            ## Best effort per day: report and move on to the next date.
            print('****** Error occured : ',e)
            continue
        if fare_df is not None:
            fare_list.append(fare_df)
    if not fare_list:
        ## pd.concat raises on an empty list; report zero rows instead.
        print('++++++++++Total : ', 0)
        return pd.DataFrame()
    result = pd.concat(fare_list,ignore_index=True)
    print('++++++++++Total : ', len(result))
    ## Save to file. Create the target directory first so a long crawl is not
    ## lost because the folder is missing.
    os.makedirs('excel', exist_ok=True)
    result.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','OZ',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result

In [17]:
## Trial of the JSON-based parser; whether to adopt it still needs to be decided
dpt = 'GMP'
arr = 'CJU'
dpt_date = '20170615'
raw_data = crawling_OZ_data(dpt,arr,dpt_date)
raw_json_to_df(raw_data)

Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
ageCalMonth:1 , __strEDate:20170615 , cServiceType: , couponCode: , arrivalAirport:CJU , openDepartureAirport2:CJU , departureDate:20170615 , officeId: , arrivalAirportC: , adultCount:1 , departureAirport:GMP , arrivalDateC: , domIntType:D , openDepartureAirport1:GMP , departureDateC: , couponDesc: , __strSDate:20170615 , ageCalYear:2017 , hidPageType: , openArrivalAirport1:CJU , corporateCode: , ageCalDay:0 , childCount:0 , arrivalDate:20170615 , query: , entType: , departureAirportC: , tripType:OW , infantCount:0 , openArrivalAirport2:GMP , CallPage:RevenueDomesticTravelRegist , 
End Simple crawling


Unnamed: 0,date,flt,dpt,arr,fare,tax1,tax2,seat
0,20170615,8901,0610,0720,48000.00,2200.00,6200.00,8
1,20170615,8901,0610,0720,86000.00,2200.00,6200.00,9
2,20170615,8905,0830,0935,78000.00,2200.00,6200.00,9
3,20170615,8905,0830,0935,86000.00,2200.00,6200.00,9
4,20170615,8907,0710,0820,61000.00,2200.00,6200.00,8
5,20170615,8907,0710,0820,86000.00,2200.00,6200.00,9
6,20170615,8909,0615,0715,43000.00,2200.00,6200.00,9
7,20170615,8909,0615,0715,86000.00,2200.00,6200.00,9
8,20170615,8913,0845,0955,86000.00,2200.00,6200.00,9
9,20170615,8915,1015,1125,69000.00,2200.00,6200.00,9


In [31]:
## Read a single day's data
dpt = 'GMP'
arr = 'CJU'
dpt_date = '20170615'
read_OZ_1day_fare(dpt,arr,dpt_date)

Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170615 , query: , arrivalAirport:CJU , arrivalDate:20170615 , departureDate:20170615 , arrivalAirportC: , CallPage:RevenueDomesticTravelRegist , domIntType:D , infantCount:0 , ageCalYear:2017 , openArrivalAirport2:GMP , cServiceType: , departureDateC: , openArrivalAirport1:CJU , corporateCode: , adultCount:1 , departureAirport:GMP , entType: , arrivalDateC: , tripType:OW , couponCode: , ageCalMonth:1 , ageCalDay:0 , __strEDate:20170615 , openDepartureAirport2:CJU , couponDesc: , openDepartureAirport1:GMP , 
End Simple crawling


Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170615,OZ8901,0610,0720,0,54200,92200,2200.0,4000.0,9.0
1,20170615,OZ8909,0615,0715,0,49200,92200,2200.0,4000.0,9.0
2,20170615,OZ8985,0620,0730,0,54200,92200,2200.0,4000.0,9.0
3,20170615,OZ8981,0635,0745,45200,49200,92200,2200.0,4000.0,9.0
4,20170615,OZ8907,0710,0820,0,62200,92200,2200.0,4000.0,9.0
5,20170615,OZ8971,0730,0840,0,71200,92200,2200.0,4000.0,9.0
6,20170615,OZ8961,0810,0920,0,75200,92200,2200.0,4000.0,9.0
7,20170615,OZ8905,0830,0935,0,84200,92200,2200.0,4000.0,9.0
8,20170615,OZ8913,0845,0955,0,0,92200,2200.0,4000.0,9.0
9,20170615,OZ8919,0925,1030,0,84200,92200,2200.0,4000.0,9.0


In [36]:
## Read data for a fixed span of days
dpt = 'GMP'
arr = 'CJU'
## Day offsets from today to read
start = 1
end = 31
read_OZ_date_range_fare(dpt,arr,start,end)

Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170520 , query: , arrivalAirport:CJU , arrivalDate:20170520 , departureDate:20170520 , arrivalAirportC: , CallPage:RevenueDomesticTravelRegist , domIntType:D , infantCount:0 , ageCalYear:2017 , openArrivalAirport2:GMP , cServiceType: , departureDateC: , openArrivalAirport1:CJU , corporateCode: , adultCount:1 , departureAirport:GMP , entType: , arrivalDateC: , tripType:OW , couponCode: , ageCalMonth:1 , ageCalDay:0 , __strEDate:20170520 , openDepartureAirport2:CJU , couponDesc: , openDepartureAirport1:GMP , 
End Simple crawling
Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170521 , query: , arriv

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170520,OZ8901,0610,0720,0,86200,119200,2200,4000,9
1,20170520,OZ8985,0620,0730,0,91200,119200,2200,4000,9
2,20170520,OZ8981,0635,0745,0,97200,119200,2200,4000,9
3,20170520,OZ8907,0650,0800,0,108200,119200,2200,4000,9
4,20170520,OZ8971,0730,0840,0,0,0,2200,4000,0
5,20170520,OZ8961,0810,0920,0,0,0,2200,4000,0
6,20170520,OZ8905,0830,0935,0,0,0,2200,4000,0
7,20170520,OZ8913,0845,0955,0,0,0,2200,4000,0
8,20170520,OZ8983,0950,1100,0,0,119200,2200,4000,1
9,20170520,OZ8915,1015,1125,0,0,119200,2200,4000,4


In [38]:
## Read data for a fixed span of days
dpt = 'GMP'
arr = 'CJU'
## Day offsets from today to read
start = 31
end = 45
read_OZ_date_range_fare(dpt,arr,start,end)

Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170619 , query: , arrivalAirport:CJU , arrivalDate:20170619 , departureDate:20170619 , arrivalAirportC: , CallPage:RevenueDomesticTravelRegist , domIntType:D , infantCount:0 , ageCalYear:2017 , openArrivalAirport2:GMP , cServiceType: , departureDateC: , openArrivalAirport1:CJU , corporateCode: , adultCount:1 , departureAirport:GMP , entType: , arrivalDateC: , tripType:OW , couponCode: , ageCalMonth:1 , ageCalDay:0 , __strEDate:20170619 , openDepartureAirport2:CJU , couponDesc: , openDepartureAirport1:GMP , 
End Simple crawling
Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170620 , query: , arriv

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170619,OZ8901,0610,0720,0,58200,92200,2200,4000,9
1,20170619,OZ8985,0620,0730,0,58200,92200,2200,4000,9
2,20170619,OZ8981,0635,0745,0,62200,92200,2200,4000,9
3,20170619,OZ8907,0710,0820,0,84200,92200,2200,4000,9
4,20170619,OZ8971,0730,0840,0,84200,92200,2200,4000,9
5,20170619,OZ8909,0755,0905,0,0,92200,2200,4000,9
6,20170619,OZ8961,0810,0920,0,0,92200,2200,4000,9
7,20170619,OZ8905,0830,0935,0,0,92200,2200,4000,9
8,20170619,OZ8913,0845,0955,0,0,92200,2200,4000,9
9,20170619,OZ8983,0950,1100,0,0,92200,2200,4000,9


In [39]:
dpt = 'GMP'
arr = 'CJU'
## Day offsets from today to read
start = 45
end = 90
read_OZ_date_range_fare(dpt,arr,start,end)

Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170703 , query: , arrivalAirport:CJU , arrivalDate:20170703 , departureDate:20170703 , arrivalAirportC: , CallPage:RevenueDomesticTravelRegist , domIntType:D , infantCount:0 , ageCalYear:2017 , openArrivalAirport2:GMP , cServiceType: , departureDateC: , openArrivalAirport1:CJU , corporateCode: , adultCount:1 , departureAirport:GMP , entType: , arrivalDateC: , tripType:OW , couponCode: , ageCalMonth:1 , ageCalDay:0 , __strEDate:20170703 , openDepartureAirport2:CJU , couponDesc: , openDepartureAirport1:GMP , 
End Simple crawling
Crawling asiana homepage schedule site
Start Simple crawling :  https://flyasiana.com/I/ko/RevenueDomesticFareDrivenFlightSelect.do
>> Parameters
departureAirportC: , officeId: , childCount:0 , hidPageType: , __strSDate:20170704 , query: , arriv

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,fare3,tax1,tax2,seat
0,20170703,OZ8901,0610,0720,0,49200,92200,2200,4000,9
1,20170703,OZ8985,0620,0730,0,54200,92200,2200,4000,9
2,20170703,OZ8981,0635,0745,0,58200,92200,2200,4000,9
3,20170703,OZ8907,0710,0820,0,84200,92200,2200,4000,9
4,20170703,OZ8971,0730,0840,0,84200,92200,2200,4000,9
5,20170703,OZ8909,0755,0905,0,0,92200,2200,4000,9
6,20170703,OZ8961,0810,0920,0,0,92200,2200,4000,9
7,20170703,OZ8905,0830,0935,0,0,92200,2200,4000,9
8,20170703,OZ8913,0845,0955,0,0,92200,2200,4000,9
9,20170703,OZ8983,0950,1100,0,0,92200,2200,4000,9
