## 김포 제주 노선 가격 정보 읽어오기
- 대상 사이트 : 진에어 웹사이트

### 진에어 사이트 웹 크롤링
- 진에어 항공권예매 페이지를 이용한 크롤링
- 전달할 payload를 생성하여 사이트 정보 읽어오기
- 넘어온 정보는 XML 형태
- 필요한 데이터를 추출하여 Pandas의 DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [29]:
from IPython.display import display
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time
from datetime import datetime
from datetime import timedelta
from common.crawling_util import payload_crawling
from common.parsing_util import stat_fare

dom_int='D' ## Route type flag: domestic (D) or international (I)
def crawling_LJ_data(dpt,arr,dpt_date):
    """Fetch the raw one-way fare response from the Jin Air reservation site.

    Builds the XML ``<REQUEST>`` payload for the given route and departure
    date and POSTs it through ``payload_crawling`` with a ``Referer`` header
    pointing at the reservation page.

    Args:
        dpt: Departure airport code (the notebook uses e.g. ``'GMP'``).
        arr: Arrival airport code (the notebook uses e.g. ``'CJU'``).
        dpt_date: Departure date; the notebook passes ``'YYYYMMDD'`` strings.

    Returns:
        Whatever ``payload_crawling`` returns when called with ``json=False``.
        NOTE(review): presumably the raw response text, since ``raw_to_df``
        feeds it straight into BeautifulSoup -- confirm against the helper.
    """
    # Local import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # The target is the Jin Air site (see the URL below), so the log line
    # names it correctly instead of the copy-pasted "jejuair".
    print('Crawling jinair homepage schedule site')
    url = "https://www.jinair.com/RSV/RSV_WebResult.aspx"
    head = {
        'Referer':'https://www.jinair.com/RSV/Reservation.aspx',
    }
    param = {
        'TASK':'NormalFare',
        'OWListId':'ctl00_ContentPlaceHolder1_fltlstDownLine',
        'OWDepDate':dpt_date,
        'OWDep':dpt,
        'OWArr':arr,
        'MemberClass':'I',
        'DisCode':'',
        'MbrGb':'N'
    }
    # Escape every value so characters such as '&' or '<' in an argument
    # cannot produce a malformed XML request body.
    safe_param = {key: escape(str(value)) for key, value in param.items()}
    payload = ''.join([
        '<REQUEST><TASK>{TASK}</TASK><OWListId>{OWListId}</OWListId><OWDepDate>{OWDepDate}</OWDepDate>',
        '<OWDep>{OWDep}</OWDep><OWArr>{OWArr}</OWArr><MemberClass>{MemberClass}</MemberClass>',
        '<DisCode>{DisCode}</DisCode><MbrGb>{MbrGb}</MbrGb></REQUEST>',
    ])

    return payload_crawling(url,payload.format(**safe_param),head=head,method='post',json=False)

## Column names of the DataFrame produced by raw_to_df, in output order
data_heads = ['date','flt','dpt','arr','fare1','fare2','seat','tax1','tax2']

def raw_to_df(raw_data):
    """Convert the raw fare response into a DataFrame.

    The response is parsed with BeautifulSoup using the ``lxml`` parser.
    Every element matched by the ``ow`` selector becomes one row; the row
    holds the text of the child tags listed in ``data_fields``, stored under
    the matching positional name from ``data_heads``.

    Args:
        raw_data: Markup returned by ``crawling_LJ_data``.

    Returns:
        A ``pandas.DataFrame`` with ``data_heads`` as columns; all cell
        values are the tag text (strings). It has no rows when no ``ow``
        element is present.
    """
    # Source tag names, positionally aligned with data_heads.
    data_fields = ['dep_date','flt_x0023_','dept','arrt','eamt','yamt','y_avail','tax_amt','fuel_amt']
    parsed = BeautifulSoup(raw_data,'lxml')
    rows = []
    for fare_node in parsed.select('ow'):
        rows.append([fare_node.find(tag_name).text for tag_name in data_fields])
    return pd.DataFrame(rows,columns=data_heads)

## Build one day's fare table (fares, taxes) plus a summary row
def read_LJ_1day_fare(dpt,arr,dpt_date):
    """Read the fares for one departure date and append a summary row.

    Args:
        dpt: Departure airport code.
        arr: Arrival airport code.
        dpt_date: Departure date, forwarded unchanged to ``crawling_LJ_data``.

    Returns:
        The DataFrame from ``raw_to_df`` with one extra trailing row, or
        ``None`` (after printing a notice) when no fare rows were parsed.
        The extra row reuses the regular columns: ``date`` holds
        ``dpt_date``; ``flt``/``dpt``/``arr`` hold the labels
        ``'min'``/``'max'``/``'mean'``; ``fare1``/``fare2``/``seat`` hold
        ``stat[0]``/``stat[1]``/``stat[2]`` as strings; the tax columns are
        left empty.
        NOTE(review): presumably ``stat_fare`` returns (min, max, mean) in
        that order -- confirm against ``common.parsing_util``.
    """
    ## Fetch and parse the raw response
    raw_data = crawling_LJ_data(dpt,arr,dpt_date)
    df = raw_to_df(raw_data)
    if df is None or len(df) == 0: ## Nothing usable came back for this date
        print('********** No Data Type 1 **********')
        return None

    ## Compute the summary statistics over both fare columns
    stat = stat_fare(df,columns=['fare1','fare2'])
    ## Append the summary row. ``.ix`` was removed from pandas; on the default
    ## integer index ``.loc[len(df)]`` adds a row under the next label, which
    ## is what ``.ix`` did here.
    df.loc[len(df)] = [dpt_date,'min','max','mean',str(stat[0]),str(stat[1]),str(stat[2]),'','']
    return df

## Read fares for a window of consecutive days and save them to Excel
def read_LJ_date_range_fare(dpt,arr,start=0,end=31):
    """Collect daily fare tables for a range of day offsets from today.

    For every offset ``i`` in ``range(start, end)`` the date ``today + i``
    days is queried through ``read_LJ_1day_fare``. Days that raise or yield
    no data are skipped. The collected tables are concatenated, written to
    ``excel/LJ_<dpt>_<arr>_<start>_<end>_<YYYYmmddHHMM>.xls`` and returned.

    Args:
        dpt: Departure airport code.
        arr: Arrival airport code.
        start: First day offset from today (inclusive). Defaults to 0.
        end: Last day offset from today (exclusive). Defaults to 31, so the
            default window covers 31 days, today included.

    Returns:
        The concatenated DataFrame. When no day produced data, an empty
        DataFrame is returned and no file is written.
    """
    # Local import keeps this block self-contained.
    import os

    # Take "today" once so every offset is measured from the same instant.
    today = datetime.today()
    date_range = [(today + timedelta(days=i)).strftime('%Y%m%d') for i in range(start,end)]
    fare_list = []
    for d in date_range:
        try:
            fare_df = read_LJ_1day_fare(dpt,arr,d)
        except Exception as e:
            # Best effort: one failing day must not abort the whole range.
            print('****** Error occured : ',e)
            continue
        if fare_df is not None:
            fare_list.append(fare_df)

    # pd.concat raises ValueError on an empty list; report and bail out instead.
    if not fare_list:
        print('********** No Data collected **********')
        return pd.DataFrame()

    result = pd.concat(fare_list,ignore_index=True)
    print('++++++++++Total : ', len(result))
    ## Save to file
    os.makedirs('excel', exist_ok=True)  # to_excel does not create the folder
    # %M is minutes; the previous %m repeated the month in the timestamp.
    # NOTE(review): writing the legacy .xls format depends on an engine that
    # recent pandas releases no longer ship -- confirm the target environment.
    result.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','LJ',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result

In [27]:
## Read the fare data for a single day
dpt, arr, dpt_date = 'GMP','CJU','20170514'
read_LJ_1day_fare(dpt,arr,dpt_date)

Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170514</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling


Unnamed: 0,date,flt,dpt,arr,fare1,fare2,seat,tax1,tax2
0,20170514,LJ0301,0605,0710,0.0,80000.0,8.0,4000.0,2200.0
1,20170514,LJ0303,0625,0730,74900.0,80000.0,36.0,4000.0,2200.0
2,20170514,LJ0305,0800,0910,74900.0,80000.0,17.0,4000.0,2200.0
3,20170514,LJ0307,0845,0955,0.0,80000.0,17.0,4000.0,2200.0
4,20170514,LJ0309,0955,1100,0.0,80000.0,9.0,4000.0,2200.0
5,20170514,LJ0311,1025,1135,74900.0,80000.0,32.0,4000.0,2200.0
6,20170514,LJ0313,1055,1205,69900.0,80000.0,19.0,4000.0,2200.0
7,20170514,LJ0319,1340,1445,69900.0,80000.0,26.0,4000.0,2200.0
8,20170514,LJ0321,1425,1530,43900.0,80000.0,162.0,4000.0,2200.0
9,20170514,LJ0323,1505,1615,39900.0,80000.0,98.0,4000.0,2200.0


In [30]:
## Read the fare data for a fixed date range
dpt, arr = 'GMP','CJU'
start,end =0, 31 ## Day-offset window to fetch (offsets are counted from today)
read_LJ_date_range_fare(dpt,arr,start,end)

Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170426</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170427</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170428</OWDepDa

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,seat,tax1,tax2
0,20170426,LJ0301,0605,0710,0,65600,17,4000,2200
1,20170426,LJ0303,0625,0730,0,65600,0,4000,2200
2,20170426,LJ0305,0800,0910,0,65600,0,4000,2200
3,20170426,LJ0307,0845,0955,0,65600,1,4000,2200
4,20170426,LJ0309,0955,1100,0,65600,-2,4000,2200
5,20170426,LJ0311,1025,1135,0,65600,-1,4000,2200
6,20170426,LJ0313,1055,1205,0,65600,0,4000,2200
7,20170426,LJ0319,1340,1445,0,65600,0,4000,2200
8,20170426,LJ0321,1425,1530,0,65600,0,4000,2200
9,20170426,LJ0323,1505,1615,0,65600,5,4000,2200


In [31]:
## Read the fare data for a fixed date range
dpt, arr = 'GMP','CJU'
start,end =31, 45 ## Day-offset window to fetch (offsets are counted from today)
read_LJ_date_range_fare(dpt,arr,start,end)

Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170527</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170528</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170529</OWDepDa

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,seat,tax1,tax2
0,20170527,LJ0301,0605,0710,0,97700,292,4000,2200
1,20170527,LJ0303,0625,0730,0,97700,277,4000,2200
2,20170527,LJ0305,0800,0910,0,97700,-4,4000,2200
3,20170527,LJ0351,0830,0940,0,97700,14,4000,2200
4,20170527,LJ0307,0845,0955,0,97700,0,4000,2200
5,20170527,LJ0309,1000,1105,0,97700,1,4000,2200
6,20170527,LJ0311,1025,1135,0,97700,108,4000,2200
7,20170527,LJ0313,1055,1205,0,97700,36,4000,2200
8,20170527,LJ0319,1340,1445,69900,80000,170,4000,2200
9,20170527,LJ0321,1425,1530,59900,80000,244,4000,2200


In [32]:
## Read the fare data for a fixed date range
dpt, arr = 'GMP','CJU'
start,end =45, 90 ## Day-offset window to fetch (offsets are counted from today)
read_LJ_date_range_fare(dpt,arr,start,end)

Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170610</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170611</OWDepDate><OWDep>GMP</OWDep><OWArr>CJU</OWArr><MemberClass>I</MemberClass><DisCode></DisCode><MbrGb>N</MbrGb></REQUEST>
End Payload crawling
Crawling jejuair homepage schedule site
Start Payload crawling :  https://www.jinair.com/RSV/RSV_WebResult.aspx
payload :  <REQUEST><TASK>NormalFare</TASK><OWListId>ctl00_ContentPlaceHolder1_fltlstDownLine</OWListId><OWDepDate>20170612</OWDepDa

Unnamed: 0,date,flt,dpt,arr,fare1,fare2,seat,tax1,tax2
0,20170610,LJ0301,0615,0720,0,97700,169,4000,2200
1,20170610,LJ0303,0625,0730,0,97700,355,4000,2200
2,20170610,LJ0305,0800,0910,0,97700,12,4000,2200
3,20170610,LJ0307,0845,0955,0,97700,0,4000,2200
4,20170610,LJ0311,1025,1135,0,97700,86,4000,2200
5,20170610,LJ0313,1055,1205,0,97700,181,4000,2200
6,20170610,LJ0319,1330,1435,0,80000,141,4000,2200
7,20170610,LJ0321,1425,1530,62400,80000,344,4000,2200
8,20170610,LJ0323,1455,1600,62400,80000,177,4000,2200
9,20170610,LJ0325,1525,1635,51200,80000,157,4000,2200
