## 인터파크 국내 노선 가격 정보 읽어오기
- 검색 기간 : 실시일로부터 1~30일, 31~45일, 46~90일 정보 주기적으로 읽어오기

### 인터파크 사이트 웹 크롤링
    - 인터파크투어 국내(국제)실시간항공예약 페이지를 이용한 크롤링
    - json api 사이트 gep 메서드 이용 정보 읽어오기
    - 필요 데이터 추출 하여 Pandas 의 DataFrame 형태로 생성
    - 생성된 데이터 엑셀 파일로 저장

In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import time
from common.crawling_util import simple_crawling
from common.parsing_util import parsing_json_data_to_dict, stat_fare
from common.util import series_chk_min
from common.chart_util import plot_by_mpld3

## 국내선용 읽어오기
def crawling_InterPark(airline,dpt,arr,dpt_date):
    print('Crawling Interpark domastic schedule site')
    url = 'http://domair.interpark.com/api/booking/airJourney.do'
    param = {
        'format':'json',   # JSON 포맷
        'dep':dpt,         # 출발
        'arr':arr,         # 도착
        'depDate':dpt_date, # 출발일 yyyymmdd
        'airlineCode':airline, # 항공사 코드
        'tripDivi':'0',     #편도 왕복 구분 0 - 편도 1 - 왕복
        'adt':'1',
        'chd':'0',
        'inf':'0'
    }
    return simple_crawling(url,param,method='get',json=True)

## 오늘기준으로 기간의 데이터를 JSON 형태의 raw data 리스트를 가져온다.
## airlines는 리스트 형태로 원하는 항공사 코드 정보를 전달한다.
def crawling_InterPark_from_to(airlines,dpt,arr,start=0,end=30):
    # 기본 30일간 데이터 읽어서 JSON 데이터를 리스트로 저장
    date_range = [ (datetime.today()+timedelta(1)*i).strftime('%Y%m%d') for i in range(start,end)]
    raw_data_list = []
    for d in date_range:
        for acd in airlines:
            try:
                raw_data = crawling_InterPark(acd,dpt,arr,d)
                if raw_data is not None:
                    raw_data_list.append(raw_data)
            except Exception as e:
                print('****** Error occured : ',e)
        time.sleep(2) ## 지연고려 sleep
    print('Crawling Total Result : ', len(raw_data_list))
    return raw_data_list

def crawling_InterPark_from_range(airlines,dpt,arr,start=None,end=30):
    # 기본 30일간 데이터 읽어서 JSON 데이터를 리스트로 저장
    if start is None:
        date_range = [ (datetime.today()+timedelta(1)*i).strftime('%Y%m%d') for i in range(1,end+1)]
    else:
        td = datetime.strptime(start,'%Y%m%d') - datetime.today()
        if td.days < 0:
            date_range = [ (datetime.today()+timedelta(1)*i).strftime('%Y%m%d') for i in range(1,end+1)]
        else:
            if end < 1:
                end = 1
            date_range = [ (datetime.strptime(start,'%Y%m%d')+timedelta(1)*i).strftime('%Y%m%d') for i in range(end)]
    raw_data_list = []
    for d in date_range:
        for acd in airlines:
            try:
                raw_data = crawling_InterPark(acd,dpt,arr,d)
                if raw_data is not None:
                    raw_data_list.append(raw_data)
            except Exception as e:
                print('****** Error occured : ',e)
        time.sleep(2) ## 지연고려 sleep
    print('Crawling Total Result : ', len(raw_data_list))
    return raw_data_list

## 국내선용 파싱 정보, LCC 용 대한항공, 아시아나 적용시 고려
loop_field = "['replyAvailFare']['availFareSet']"
parse_info = {
    'airline':"['replyAvailFare']['availFareSet'][{}]['segFare']['carCode']",#i
    'date':"['replyAvailFare']['availFareSet'][{}]['segFare']['depDate']",#i
    'flt':"['replyAvailFare']['availFareSet'][{}]['segFare']['mainFlt']",#i
    'dpt':"['replyAvailFare']['availFareSet'][{}]['segFare']['depTime']",#i
    'arr':"['replyAvailFare']['availFareSet'][{}]['segFare']['arrTime']",#i
    'fare1':"['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['fare']",#i,j
    'fare2':"['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['fare']",#i,j+1
    'tax1':"['replyAvailFare']['availFareSet'][{}]['segFare']['airTax']",#i
    'tax2':"['replyAvailFare']['availFareSet'][{}]['segFare']['fuelChg']",#i
    'seat1':"['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][0]['noOfAvailSeat']",#i,j
    'seat2':"['replyAvailFare']['availFareSet'][{}]['segFare']['classDetail'][1]['noOfAvailSeat']",#i,j+1
}
data_header = ['airline','date','flt','dpt','arr','fare1','fare2','tax1','tax2','seat1','seat2']
def make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list):
    '''
    1. raw_data 루프 처리
    2. 일간 최소값, 최고값, 평균값 계산
    3. DataFrame 최종 concat
    4. 파일 생성
    5. DataFrame 리턴
    '''
    df_list = []
    for pos,raw_data in enumerate(raw_data_list):
        #print(pos)
        if raw_data['replyHeader']['errorCode'] == '1':
            ## 에러코드 체크 - 데이터 없음
            print('********** Crawling exception! pos - {}**********'.format(pos))
            continue
        df = pd.DataFrame(parsing_json_data_to_dict(raw_data,loop_field, parse_info))
        df = df.fillna('0') ## None field 값 변경
        if len(df) == 0: ## 읽어온 데이터가 없을 경우, 지연현상이나 여러가지 이유로
            print('********** Data Not Found! pos - {}**********'.format(pos))
            continue
        df = df[data_header] ## 헤더 조정
        df = df.fillna(0)
        stat = stat_fare(df,columns=['fare1','fare2'])
        df.ix[len(df)] = [df.ix[0][0],df.ix[0][1],'min','max','mean',str(int(stat[0])),str(int(stat[1])),str(int(stat[2])),'','','']
        df_list.append(df)
    result_df = pd.concat(df_list,ignore_index=True)
    result_df.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format('excel','InterPark',dpt,arr,start,end,datetime.today().strftime('%Y%m%d%H%M')))
    return result_df

def make_chart_data(dataframe):
    plot_df = dataframe[['date','airline','fare1','fare2']]
    plot_df[['fare1','fare2']] = plot_df[['fare1','fare2']].astype('int')
    plot_df['fare']=plot_df[['fare1','fare2']].apply(series_chk_min,axis=1)
    plot_df = plot_df[['date','airline','fare']]
    plot_df = plot_df.groupby(['date','airline']).min()
    plot_df = plot_df.unstack()
    plot_df = plot_df['fare']
    return plot_df.bfill()

In [22]:
## 국내선 1일 데이터 읽어오기
dpt, arr, dpt_date, airline = 'GMP','CJU','20170510','7C'
raw_data = crawling_InterPark(airline,dpt,arr,dpt_date)
parsed_dict = parsing_json_data_to_dict(raw_data,loop_field,parse_info)
df = pd.DataFrame(parsed_dict)
df = df[data_header] ## 헤더 조정
df = df.fillna(0)
df.head()

Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
arr:CJU , dep:GMP , chd:0 , airlineCode:7C , adt:1 , tripDivi:0 , inf:0 , depDate:20170510 , format:json , 
End Simple crawling


Unnamed: 0,airline,date,flt,dpt,arr,fare1,fare2,tax1,tax2,seat1,seat2
0,7C,20170510,151,625,730,46900,65600,4000,2200,9,9
1,7C,20170510,103,700,810,65600,0,4000,2200,1,0
2,7C,20170510,105,750,900,65600,0,4000,2200,1,0
3,7C,20170510,113,1120,1230,65600,0,4000,2200,1,0
4,7C,20170510,115,1225,1335,46900,65600,4000,2200,9,9


In [9]:
## 국내선 기간 데이터 읽어오기
dpt, arr, airlines = 'GMP','CJU',['7C','TW']
start, end = 1,10
raw_data_list = crawling_InterPark_from_to(airlines,dpt,arr,start=start,end=end)
df = make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list)
df.head()

Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
adt:1 , format:json , arr:CJU , airlineCode:7C , tripDivi:0 , inf:0 , chd:0 , depDate:20170504 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
adt:1 , format:json , arr:CJU , airlineCode:TW , tripDivi:0 , inf:0 , chd:0 , depDate:20170504 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
adt:1 , format:json , arr:CJU , airlineCode:7C , tripDivi:0 , inf:0 , chd:0 , depDate:20170505 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
adt:1 , format:json , arr:CJU , airlineCode:TW , tripDivi:0 , inf:0 , chd:0 

Unnamed: 0,airline,date,flt,dpt,arr,fare1,fare2,tax1,tax2,seat1,seat2
0,7C,20170504,161,0620,0725,87900,97700,4000,2200.0,2.0,2.0
1,7C,20170504,115,1225,1335,97700,0,4000,2200.0,1.0,0.0
2,7C,20170504,127,1815,1925,97700,0,4000,2200.0,9.0,0.0
3,7C,20170504,min,max,mean,87900,97700,92800,,,
4,TW,20170504,719,1735,1845,79900,0,4000,2200.0,9.0,0.0


In [3]:
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start = '20170510'
end = 10
raw_data_list = crawling_InterPark_from_range(airlines,dpt,arr,start,end)
df = make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list)
df.head()

Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
chd:0 , airlineCode:7C , adt:1 , tripDivi:0 , inf:0 , arr:CJU , format:json , depDate:20170510 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
chd:0 , airlineCode:TW , adt:1 , tripDivi:0 , inf:0 , arr:CJU , format:json , depDate:20170510 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
chd:0 , airlineCode:ZE , adt:1 , tripDivi:0 , inf:0 , arr:CJU , format:json , depDate:20170510 , dep:GMP , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://domair.interpark.com/api/booking/airJourney.do
>> Parameters
chd:0 , airlineCode:LJ , adt:1 , tripDivi:0 , inf:0 , arr:CJU , format:json 

Unnamed: 0,airline,date,flt,dpt,arr,fare1,fare2,tax1,tax2,seat1,seat2
0,TW,20170510,9741,620,730,46900,0,4000,2200,9,0
1,TW,20170510,9741,620,730,65600,0,4000,2200,9,0
2,TW,20170510,723,1725,1835,19900,0,4000,2200,9,0
3,TW,20170510,723,1725,1835,65600,0,4000,2200,9,0
4,TW,20170510,747,1810,1920,14500,0,4000,2200,3,0


In [4]:
df['airline'].unique()

array(['TW', 'ZE', 'LJ', 'BX'], dtype=object)

In [None]:
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start, end = 31,46
raw_data_list = crawling_InterPark_from_to(airlines,dpt,arr,start=start,end=end)
make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list)

In [None]:
dpt, arr, airlines = 'GMP','CJU',['7C','TW','ZE','LJ','BX']
start, end = 46,91
raw_data_list = crawling_InterPark_from_to(airlines,dpt,arr,start=start,end=end)
make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list)

In [15]:
chart_df = make_chart_data(df)
chart_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


airline,BX,LJ
date,Unnamed: 1_level_1,Unnamed: 2_level_1
20170504,0,0
20170505,0,0
20170506,49400,97700
20170507,38000,97700
20170508,0,97700
20170509,17250,65600
20170510,20700,0
20170511,37950,0
20170512,76000,0
20170513,0,0


In [16]:
plot_by_mpld3(chart_df)

In [2]:
from common.chart_util import plot_by_mpld3
import numpy as np
import pandas as pd
np.random.seed(9615)

N = 100
df = pd.DataFrame((.1 * (np.random.random((N, 5)) - .5)).cumsum(0),
                  columns=['a', 'b', 'c', 'd', 'e'],)
#mpld3.enable_notebook()
plot_by_mpld3(df)

In [7]:
pd.Series([1,0,2,3])
series_chk_min(pd.Series([0,0,2,3]))

2