## 인터파크 국제 노선 가격 정보 읽어오기
- 검색 기간 : 실시일로부터 1~30일, 31~45일, 46~90일 정보 주기적으로 읽어오기

### 인터파크 사이트 웹 크롤링
- 인터파크투어 국제선 실시간 항공 예약 페이지를 이용한 크롤링
- JSON API를 GET 메서드로 호출하여 정보 읽어오기
- 필요한 데이터를 추출하여 Pandas의 DataFrame 형태로 생성
- 생성된 데이터를 엑셀 파일로 저장

In [3]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import time
from common.crawling_util import simple_crawling
from common.parsing_util import parsing_json_data_to_dict

## 국제선용 읽어오기
def crawling_InterPark(airline,dpt,arr,dpt_date):
    """Fetch one Interpark fare-search response and return it as parsed JSON.

    Issues a GET request through ``simple_crawling`` against the Interpark
    ``GetGoodsSmartList.aspx`` endpoint for a single airline, route and date.

    Args:
        airline: Airline code, sent as the ``AirLine`` query parameter.
        dpt: Departure code, sent as ``dep0``.
        arr: Arrival code, sent as ``arr0``.
        dpt_date: Departure date string, sent as ``depdate0`` (callers in
            this file pass ``YYYYMMDD``).

    Returns:
        The object produced by ``json.loads`` on the response text after its
        first and last characters have been removed.
    """
    print('Crawling Interpark domastic schedule site')
    endpoint = "http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx"

    # Search options that do not depend on the arguments.
    fixed_options = {
        'FLEX': 'N',
        'Soto': 'N',
        'ptype': 'I',
        'SeatAvail': 'Y',
        'comp': 'Y',
        'JSON': 'Y',
        'enc': 'u',
        'BEST': 'Y',
        'Change': '',
        'StayLength': '',
        'SeatType': 'A',
        'trip': 'OW',       # trip type (one-way / round trip)
        'adt': '1',
        'chd': '0',
        'inf': '0',
        'SplitNo': '100',   # size of the data to read
    }
    # Per-request values: airline, departure, arrival and departure date.
    query = dict(fixed_options, AirLine=airline, dep0=dpt, arr0=arr, depdate0=dpt_date)

    response_text = simple_crawling(endpoint, query, method='get', json=False)
    # Parsing the untrimmed text fails with a JSON format error, so only the
    # part between the first and last characters is decoded.
    trimmed = response_text[1:-1]
    return json.loads(trimmed)

## 오늘기준으로 기간의 데이터를 JSON 형태의 raw data 리스트를 가져온다.
## airlines는 리스트 형태로 원하는 항공사 코드 정보를 전달한다.
def crawling_InterPark_from_to(airlines,dpt,arr,start=0,end=30):
    """Crawl Interpark for every airline on each day of a date window.

    The window covers the days ``today + start`` up to (not including)
    ``today + end``. Each (day, airline) pair is fetched with
    ``crawling_InterPark``; a failing request is reported and skipped.

    Args:
        airlines: Iterable of airline codes to query.
        dpt: Departure code passed through to ``crawling_InterPark``.
        arr: Arrival code passed through to ``crawling_InterPark``.
        start: First day offset from today (inclusive).
        end: Last day offset from today (exclusive).

    Returns:
        List of the non-``None`` raw responses, in request order.
    """
    # Build all departure dates up front, formatted as YYYYMMDD.
    departure_days = []
    for offset in range(start, end):
        day = datetime.today() + timedelta(days=offset)
        departure_days.append(day.strftime('%Y%m%d'))

    collected = []
    for day in departure_days:
        for code in airlines:
            try:
                payload = crawling_InterPark(code, dpt, arr, day)
            except Exception as err:
                print('****** Error occured : ', err)
            else:
                if payload is not None:
                    collected.append(payload)
        # Pause after each day's batch of requests to allow for delays.
        time.sleep(2)

    print('Crawling Total Result : ', len(collected))
    return collected

## 국제선용 데이터 파싱, 유틸 사용이 어려운 구조로 별도 작성
def make_raw_data_to_list(raw_data):
    """Flatten one Interpark search response into a list of fare rows.

    Args:
        raw_data: Parsed response; ``raw_data['Responses']['GoodsList']`` is
            either a string (no data) or a mapping with a ``'Goods'`` entry.

    Returns:
        ``None`` when ``GoodsList`` is a string. Otherwise a list with one
        row per itinerary of every fare entry, each row being
        ``[car_code, StartDT, main_flt, dep_date_time[8:], arr_date_time[8:],
        SaleFare, Tax, Qcharge]``.

    Raises:
        KeyError: If an expected key is missing from the response.
    """
    goods_list = raw_data['Responses']['GoodsList']
    if isinstance(goods_list, str):  # a string here means there is no data
        return None

    fare_goods = goods_list['Goods']
    if isinstance(fare_goods, dict):  # a single entry arrives as a bare mapping
        fare_goods = [fare_goods]

    raw_list = []
    for fare_set in fare_goods:
        air_itns = fare_set['AirAvail']['StartAvail']['AirItn']
        if isinstance(air_itns, dict):  # a single itinerary arrives as a bare mapping
            air_itns = [air_itns]
        for air_itn in air_itns:
            seg_detail_t = air_itn['seg_detail_t']
            raw_list.append([
                seg_detail_t['car_code'],
                fare_set['StartDT'],
                seg_detail_t['main_flt'],
                # Drop the first 8 characters, keeping only what follows them.
                seg_detail_t['dep_date_time'][8:],
                seg_detail_t['arr_date_time'][8:],
                fare_set['SaleFare'],
                fare_set['Tax'],
                fare_set['Qcharge'],
            ])
    return raw_list

# Column names for the rows built by make_raw_data_to_list, in the same order
# as the values appended there: car_code, StartDT, main_flt,
# dep_date_time[8:], arr_date_time[8:], SaleFare, Tax, Qcharge.
data_header = ['airline','date','flt','dpt','arr','fare','tax1','tax2']

## 기간별로 읽어온 데이터를 DataFrame으로 변경 및 저장 처리
def make_raw_data_list_to_df(dpt,arr,start,end,raw_data_list):
    """Build one DataFrame from crawled responses and save it as an Excel file.

    For each raw response:
      1. Convert it to rows with ``make_raw_data_to_list`` and wrap them in a
         DataFrame using ``data_header`` as columns.
      2. Append a summary row holding the min / max / mean of the distinct
         non-zero fares of that response.
    The per-response frames are then concatenated, written to
    ``excel/InterPark_<dpt>_<arr>_<start>_<end>_<timestamp>.xls`` and returned.

    Args:
        dpt: Departure code, used only in the output file name.
        arr: Arrival code, used only in the output file name.
        start: Start day offset, used only in the output file name.
        end: End day offset, used only in the output file name.
        raw_data_list: List of raw responses as returned by
            ``crawling_InterPark_from_to``.

    Returns:
        The concatenated DataFrame. When no response contained any data an
        empty DataFrame with the ``data_header`` columns is returned and no
        file is written.
    """
    import os  # local import: only needed to create the output directory

    df_list = []
    for pos, raw_data in enumerate(raw_data_list):
        df = pd.DataFrame(make_raw_data_to_list(raw_data), columns=data_header)
        if df.empty:  # nothing was read (delays or other reasons)
            print('********** Data Not Found! pos - {}**********'.format(pos))
            continue
        df = df.fillna(0)

        # Distinct fares as integers; zeros (from fillna) are excluded from
        # the statistics.
        fares = np.unique(df['fare'].values.astype(int))
        fares = fares[fares != 0]
        if fares.size:
            # Summary row: airline and date are copied from the first row,
            # the three time columns hold labels, and the fare/tax columns
            # hold the corresponding statistics as strings.
            df.loc[len(df)] = [
                df.iloc[0, 0], df.iloc[0, 1], 'min', 'max', 'mean',
                str(fares.min()), str(fares.max()), str(int(fares.mean())),
            ]
        df_list.append(df)

    if not df_list:
        # pd.concat raises on an empty list, so return an empty frame instead.
        print('********** Data Not Found! Nothing to save **********')
        return pd.DataFrame(columns=data_header)

    result_df = pd.concat(df_list, ignore_index=True)

    os.makedirs('excel', exist_ok=True)
    # %M is minutes; %m would repeat the month in the timestamp.
    timestamp = datetime.today().strftime('%Y%m%d%H%M')
    # NOTE(review): writing '.xls' requires an Excel writer engine that pandas
    # supports for that extension; newer pandas releases dropped xlwt, so
    # confirm against the installed version (and consider '.xlsx').
    result_df.to_excel('{}/{}_{}_{}_{}_{}_{}.xls'.format(
        'excel', 'InterPark', dpt, arr, start, end, timestamp))
    return result_df

In [5]:
# Single request: one airline, one route, one departure date.
airline = '7C'
dpt = 'ICN'
arr = 'NRT'
dpt_date = '20170615'
crawling_InterPark(airline, dpt, arr, dpt_date)

Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0:ICN , comp:Y , chd:0 , JSON:Y , trip:OW , FLEX:N , ptype:I , arr0:NRT , SeatType:A , enc:u , SeatAvail:Y , AirLine:7C , adt:1 , StayLength: , Soto:N , BEST:Y , Change: , inf:0 , SplitNo:100 , depdate0:20170615 , 
End Simple crawling


{'Responses': {'AirVList': '',
  'FareTypeList': '',
  'GoodsList': {'Goods': [{'APIType': 'AIR',
     'AVailDayChg': {'#cdata-section': '/OneStopAvail/RetrieveAvail.aspx?SGC=J2NRT7C586&RGC=J2NRT7C586&SSCITY=ICN&SECITY=NRT&RSCITY=&RECITY=&SFLT_idx=FIM_7C000009&RFLT_idx=&BookingClass=S&AirLine=7C&WeekAddFare=0&SunDayRule= &SViaCity=&RViaCity=&Type=D&Flex=Y&ViaNo=&OpenFlag=&OneWayFlag=2&StayLength=&adt=1&chd=0&inf=0&TRIP=OW&Agent=H&EventNum=0/0&DiscountNum=0/0'},
     'Adult': '1',
     'AirAvail': {'ReturnAvail': '',
      'StartAvail': {'AirItn': [{'Res_Parameter': {'#cdata-section': '/AirSearch/PrsCreate.aspx?s_Itn=7C%2f%ec%a0%9c%ec%a3%bc%ed%95%ad%ea%b3%b5%2f1104%2fICN%2f%ec%9d%b8%ec%b2%9c%2fNRT%2f%eb%8f%84%ec%bf%84(%eb%82%98%eb%a6%ac%ed%83%80)%2f201706151505%2f%2fTHU%2f%2f201706151730%2f%2fTHU%2f%2f%2f%2f%2f%2f0%2f0225%2f0225%2f0000%2fMIL%2f%2f%2f%2f%2f%2f%2f%2f0%2f0%2f%2f%2f%2f%2f%2f%2fS%2f9&s_Itn_hd=0225/0225//MIL///&depdate0=20170615&trip=OW&scroll=1&row=20&segdt=Y&ctrl=NO&comp=Y&

In [6]:
## Read one day of international fare data and show the first three rows
airline = '7C'
dpt = 'ICN'
arr = 'NRT'
dpt_date = '20170615'
raw_data = crawling_InterPark(airline, dpt, arr, dpt_date)
parsed_rows = make_raw_data_to_list(raw_data)
parsed_rows[:3]

Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0:ICN , comp:Y , chd:0 , JSON:Y , trip:OW , FLEX:N , ptype:I , arr0:NRT , SeatType:A , enc:u , SeatAvail:Y , AirLine:7C , adt:1 , StayLength: , Soto:N , BEST:Y , Change: , inf:0 , SplitNo:100 , depdate0:20170615 , 
End Simple crawling


[['7C', '20170615', '1104', '1505', '1730', '75000', '28000', '0'],
 ['7C', '20170615', '1162', '0710', '0950', '75000', '28000', '0'],
 ['7C', '20170615', '1102', '0830', '1050', '85000', '28000', '0']]

In [133]:
## Read international fare data for a range of days (offsets 1 to 30)
airlines = ['7C','TW','ZE','LJ','BX']
dpt = 'ICN'
arr = 'NRT'
start = 1
end = 31
raw_data_list = crawling_InterPark_from_to(airlines, dpt, arr, start=start, end=end)
make_raw_data_list_to_df(dpt, arr, start, end, raw_data_list)

Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170422 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : 7C , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170422 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : TW , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : IC

Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,7C,20170422,1102,0830,1050,110000,28000,1200
1,7C,20170422,1102,0830,1050,120000,28000,1200
2,7C,20170422,1104,1505,1730,120000,28000,1200
3,7C,20170422,1102,0830,1050,130000,28000,1200
4,7C,20170422,1104,1505,1730,130000,28000,1200
5,7C,20170422,1102,0830,1050,150000,28000,1200
6,7C,20170422,1104,1505,1730,150000,28000,1200
7,7C,20170422,1162,0700,0950,150000,28000,1200
8,7C,20170422,1102,0830,1050,160000,28000,1200
9,7C,20170422,1104,1505,1730,160000,28000,1200


In [136]:
# Day offsets 31 to 45, reusing dpt / arr / airlines from the earlier cell.
start = 31
end = 46
raw_data_list = crawling_InterPark_from_to(airlines, dpt, arr, start=start, end=end)
make_raw_data_list_to_df(dpt, arr, start, end, raw_data_list)

Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170522 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : 7C , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170522 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : TW , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : IC

Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,7C,20170522,1162,0700,0950,75000,28000,1200
1,7C,20170522,1102,0830,1050,85000,28000,1200
2,7C,20170522,1162,0700,0950,85000,28000,1200
3,7C,20170522,1102,0830,1050,110000,28000,1200
4,7C,20170522,1162,0700,0950,110000,28000,1200
5,7C,20170522,1102,0830,1050,120000,28000,1200
6,7C,20170522,1104,1505,1730,120000,28000,1200
7,7C,20170522,1162,0700,0950,120000,28000,1200
8,7C,20170522,1102,0830,1050,130000,28000,1200
9,7C,20170522,1104,1505,1730,130000,28000,1200


In [137]:
# Day offsets 46 to 90, reusing dpt / arr / airlines from the earlier cell.
start = 46
end = 91
raw_data_list = crawling_InterPark_from_to(airlines, dpt, arr, start=start, end=end)
make_raw_data_list_to_df(dpt, arr, start, end, raw_data_list)

Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170606 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : 7C , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : ICN , StayLength :  , arr0 : NRT , depdate0 : 20170606 , BEST : Y , SeatAvail : Y , Soto : N , chd : 0 , enc : u , ptype : I , AirLine : TW , Change :  , FLEX : N , trip : OW , adt : 1 , JSON : Y , comp : Y , SeatType : A , inf : 0 , SplitNo : 100 , 
End Simple crawling
Crawling Interpark domastic schedule site
Start Simple crawling :  http://smartair.interpark.com/HtmlSearch/GetGoodsSmartList.aspx
>> Parameters
dep0 : IC

Unnamed: 0,airline,date,flt,dpt,arr,fare,tax1,tax2
0,7C,20170606,1162,0655,0950,75000,28000,1200
1,7C,20170606,1102,0830,1050,85000,28000,1200
2,7C,20170606,1104,1505,1730,85000,28000,1200
3,7C,20170606,1162,0655,0950,85000,28000,1200
4,7C,20170606,1102,0830,1050,110000,28000,1200
5,7C,20170606,1104,1505,1730,110000,28000,1200
6,7C,20170606,1162,0655,0950,110000,28000,1200
7,7C,20170606,1102,0830,1050,120000,28000,1200
8,7C,20170606,1104,1505,1730,120000,28000,1200
9,7C,20170606,1106,1035,1255,120000,28000,1200
