## 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

# DataFrame 화면 출력                                                # DataFrame 화면 출력 설정이 변경되어 있습니다 #
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None # default = 60

import datetime

# API 호출
# ## https://data.go.kr/tcs/dss/selectApiDataDetailView.do?publicDataPk=15057440
from urllib.request import Request, urlopen
# Request https://docs.python.org/ko/3/library/urllib.request.html#urllib.request.Request
# urlopen https://docs.python.org/ko/3/library/urllib.request.html#urllib.request.urlopen
from urllib.parse import urlencode, quote_plus
# urlencode https://docs.python.org/ko/3/library/urllib.parse.html#urllib.parse.urlencode
# quote_plus https://docs.python.org/ko/3/library/urllib.parse.html#urllib.parse.quote_plus
from urllib.parse import unquote
import requests

# XML 파싱
import xml.etree.ElementTree as ET

## 기본값 설정하기

In [2]:
# Notebook-wide defaults: the route to analyse and the calendar date of the
# vehicle-location log to load. Other cells read these four names.
routeId, year, month, day = 204000046, 2020, 11, 16

## Key Unquote

In [3]:
# Percent-encoded service keys (note the %2F / %3D escapes).
# NOTE(review): these look like real credentials committed to the notebook;
# consider loading them from the environment instead.
KEY = 'yEaR%2F3MDedRSlVJL%2F2pxnVg0yre1N5VF3RZ%2FUAt56MJ7J2mNpfqhUvy05pXV0uhHTVY7DbyCR8xmMaDdYga67Q%3D%3D'  # Jonghyun's key
KEYSW = 'M%2B4%2FqUiadT8X8PhgFjaQLDu%2BIOgPMURfGsOX%2FmVxwHQVJgnVR%2FMPjDYXkuQNwUFbZXlfnX5Lls3SUCiCLIFjgQ%3D%3D'

# Decode both keys once so that the query-string builder can encode them again
# without double-escaping.
apiKey, apiKeySW = (unquote(encodedKey) for encodedKey in (KEY, KEYSW))

## API 호출

In [4]:
# Route station list lookup: busrouteservice/station
#
# Requests the stations served by `routeId`. The XML-parsing cell reads
# stationSeq, stationId and stationName from this response.
# (The previous header described the busrouteservice/info endpoint, which is
# not the one being called here.)
url = 'http://openapi.gbis.go.kr/ws/rest/busrouteservice/station'

# urlencode() percent-encodes both keys and values itself, so the plain
# parameter names and the already-unquoted service key are passed as they are.
queryParams = '?' + urlencode({'serviceKey': apiKey, 'routeId': routeId})
request = Request(url + queryParams, method='GET')

# Call the API and keep the XML payload as a string in `oneLineXML`.
# The context manager closes the HTTP response even if reading/decoding fails.
with urlopen(request) as response:
    oneLineXML = response.read().decode('utf8')

## XML 파싱

In [5]:
# According to the original author, responses from the "openapi" domain always
# carry three top-level children regardless of "headerCd":
# [comMsgHeader, msgHeader, msgBody]. The body is therefore taken by position.
xtree = ET.fromstring(oneLineXML)
msgBody = xtree[2]

# Pull the text of each wanted tag out of every child element of the body.
tagList = ["stationSeq", "stationId", "stationName"]
stationList = [
    [station.find(tag).text for tag in tagList]
    for station in msgBody
]

# Use a 1-based index so that row labels do not start at 0.
# `stationCount` and `rangeIndex` are reused by later cells; keep that in mind
# if this cell is ever wrapped in a function.
stationCount = len(stationList)
rangeIndex = pd.RangeIndex(start=1, stop=stationCount + 1)

# Build the frame and cast the text values to their final dtypes in one go.
station_df = pd.DataFrame(stationList, columns=tagList, index=rangeIndex).astype(
    dict(stationSeq="int32", stationId="int32", stationName="string")
)
station_df

Unnamed: 0,stationSeq,stationId,stationName
1,1,206000010,운중동먹거리촌
2,2,206000009,운중동푸르지오하임
3,3,206000658,한빛교회.월든힐스아파트
4,4,206000558,운중초등학교
5,5,206000007,운중중학교
6,6,206000006,운중동행정복지센터
7,7,206000005,뫼루니육교
8,8,206000004,판교원마을1.2단지.판교도서관
9,9,206000555,판교청소년수련관.판교종합사회복지관
10,10,206000003,한림아파트


## 데이터 불러오기

In [6]:
# Goal: a 2-D table of stop arrival times per routeId.
# - The last station number is needed; it comes from the route station list API.
# - Missing values in the middle or at either end of a run must be filled later.

rootPath = 'C:/Users/jongh/OneDrive/School/석사/1-2/자료구조/프로젝트/5.DATA/Master/dataAPI/buslocationservice/'
# File name pattern: <routeId>/<routeId>_<yy>-<month>-<day>.txt (month/day are not zero-padded).
outputPath = f"{rootPath}{routeId}/{routeId}_{str(year)[-2:]}-{month}-{day}.txt"

# Column names for the space-separated log: a timestamp followed by the item tags.
# The trailing 'Nan' column is a placeholder that is dropped right after loading.
# NOTE(review): an earlier variant of this list omitted 'lowPlate' — presumably
# for log files without that field; confirm before loading other dates.
itemTagList = ['DateTime', 'endBus', 'lowPlate', 'plateNo', 'plateType', 'remainSeatCnt', 'routeId', 'stationId', 'stationSeq', 'Nan']

# Load (skipping the first line), discard the placeholder column, order by
# vehicle and remove incomplete records.
_df = (
    pd.read_csv(outputPath, sep=' ', skiprows=[0], names=itemTagList)
    .drop(columns=["Nan"])
    .sort_values(['plateNo'])
    .dropna()
)

# Parse the timestamp strings into datetime values.
_df['DateTime'] = pd.to_datetime(_df['DateTime'])

# Fix the dtypes of the remaining columns.
_df = _df.astype(dict(
    endBus='int32',
    lowPlate='int32',
    plateNo='string',
    plateType='int32',
    remainSeatCnt='int32',
    routeId='int32',
    stationId='int32',
    stationSeq='int32',
))
_df.dtypes

DateTime         datetime64[ns, pytz.FixedOffset(540)]
endBus                                           int32
lowPlate                                         int32
plateNo                                         string
plateType                                        int32
remainSeatCnt                                    int32
routeId                                          int32
stationId                                        int32
stationSeq                                       int32
dtype: object

In [7]:
# Vehicle records are separated by "plateNo".
# groupby(...).size() has one entry per distinct plate number, so its index is
# the list of vehicles that the next cell iterates over.
plateNoList = _df.groupby(['plateNo']).size().index

In [8]:
# Build `oneDay_df`: one row per station, one column per run of a vehicle,
# each cell holding the time at which that run was observed at that station.
# `rangeIndex` (1..stationCount) comes from the XML-parsing cell.
oneDay_df = pd.DataFrame(index=rangeIndex)

# Columns whose gaps are filled by carrying a neighbouring observation
# (forward first, then backward, at most 3 consecutive rows each way).
carriedColumns = ['endBus', 'lowPlate', 'plateType', 'routeId']

for plateNo in plateNoList:

    # All records of this vehicle in chronological order, re-indexed from 0 so
    # that positions and labels coincide.
    sameBusList = _df[_df['plateNo'] == plateNo].sort_values(['DateTime']).reset_index(drop=True)

    # A bus does not travel backwards, so a drop in "stationSeq" between two
    # consecutive records marks the first record of a new run. The sequence is
    # scanned once as a plain list rather than with per-row .iloc lookups.
    seqValues = sameBusList['stationSeq'].tolist()
    indexList = [
        position
        for position, (previousSeq, currentSeq) in enumerate(zip(seqValues, seqValues[1:]), start=1)
        if previousSeq > currentSeq
    ]
    indexList.append(len(sameBusList))

    # Pair every run start with its exclusive end: (0, i1), (i1, i2), ..., (ik, len).
    tupleList = list(zip([0] + indexList[:-1], indexList))

    for start, stop in tupleList:
        # Within one run keep, per stationSeq, the column-wise minimum — for
        # DateTime that is the earliest observation at the station.
        small_df = sameBusList.iloc[start:stop].groupby(by=['stationSeq']).min()

        # Attach the run to the full station list; stations without a record
        # become rows of missing values.
        merge_df = pd.merge(station_df, small_df, how='outer', on='stationId')
        merge_df = merge_df.set_index(keys='stationSeq')

        # Fill missing values.
        # bfill()/ffill() replace interpolate(method='backfill'/'pad'), which
        # current pandas deprecates; order and limits are unchanged.
        # Time: take the next known time first, then the previous one.
        merge_df['DateTime'] = merge_df['DateTime'].bfill(limit=3).ffill(limit=3)

        # Remaining seats: linear interpolation, up to 3 consecutive gaps,
        # applied in both directions.
        merge_df['remainSeatCnt'] = merge_df['remainSeatCnt'].interpolate(method='linear', limit=3, limit_direction='both')

        for carriedColumn in carriedColumns:
            merge_df[carriedColumn] = merge_df[carriedColumn].ffill(limit=3).bfill(limit=3)

        # One column per run, named "<plateNo>_<start position>"; the
        # assignment aligns on the station sequence index.
        oneDay_df[plateNo + '_' + str(start)] = merge_df['DateTime']

In [9]:
# Order the columns (one per run) by departure time, i.e. by the value in the
# row labelled 1 (the first station). axis=1 makes sort_values reorder columns
# rather than rows. Runs with no time in that row should end up last with the
# pandas default na_position — verify if the ordering of those columns matters.
oneDay_df = oneDay_df.sort_values(by=1, axis=1)

In [10]:
# Number the sorted columns in departure order (run 1, run 2, ...).
columnCount = oneDay_df.shape[1]
oneDay_df.columns = np.arange(columnCount) + 1

In [11]:
# Convert the datetime cells to strings. Any calculation that needs real
# datetime values must be done before this cell runs.
oneDay_df = oneDay_df.astype('string')
for column in oneDay_df.columns:
    # Keep only characters 10..15 of each rendered timestamp (the hour:minute part).
    # NOTE(review): in a 'YYYY-MM-DD HH:MM:SS...' rendering, position 10 is the
    # separator between date and time, so this slice probably keeps a leading
    # space — confirm whether start=11 was intended.
    oneDay_df[column] = oneDay_df[column].str.slice(start=10, stop=16)

In [12]:
# Put the station columns and the per-run time columns side by side. From here
# on the rows mix station metadata with time strings, so further editing of the
# time values is awkward.
finalOneDay_df = pd.concat([station_df, oneDay_df], axis='columns')

# Index the result by station sequence. Note that `_df` is rebound here and no
# longer refers to the raw vehicle log loaded earlier.
_df = finalOneDay_df.set_index('stationSeq')

In [13]:
# Show the final table: one row per station, one numbered column per run.
_df

Unnamed: 0_level_0,stationId,stationName,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
stationSeq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1,206000010,운중동먹거리촌,17:02,17:12,17:25,17:37,17:49,18:00,18:17,18:31,18:47,19:00,19:13,19:27,19:36,19:51,20:06,20:21,20:41,21:01,21:21,21:40,22:01,22:27,22:51,23:11,,,,,,
2,206000009,운중동푸르지오하임,17:02,17:12,17:25,17:37,17:49,18:00,18:17,18:31,18:47,19:00,19:13,19:27,19:36,19:51,20:06,20:21,20:41,21:01,21:21,21:40,22:01,22:27,22:51,23:11,,,,,,
3,206000658,한빛교회.월든힐스아파트,17:02,17:13,17:26,17:37,17:49,18:01,18:17,18:31,18:47,19:01,19:14,19:27,19:37,19:51,20:07,20:21,20:42,21:01,21:22,21:41,22:02,22:27,22:52,23:12,,,,,,
4,206000558,운중초등학교,17:02,17:15,17:27,17:38,17:50,18:03,18:18,18:33,18:48,19:02,19:15,19:27,19:38,19:52,20:08,20:22,20:43,21:03,21:23,21:42,22:02,22:27,22:53,23:13,,,,,,
5,206000007,운중중학교,17:03,17:16,17:28,17:39,17:50,18:03,18:19,18:34,18:48,19:03,19:16,19:28,19:39,19:53,20:09,20:23,20:43,21:04,21:24,21:43,22:03,22:28,22:54,23:14,,,,,,
6,206000006,운중동행정복지센터,17:07,17:17,17:29,17:40,17:52,18:04,18:20,18:34,18:49,19:04,19:18,19:30,19:40,19:55,20:10,20:24,20:44,21:05,21:24,21:44,22:05,22:29,22:55,23:15,,,,,,
7,206000005,뫼루니육교,17:08,17:20,17:32,17:42,17:52,18:07,18:22,18:37,18:51,19:07,19:19,19:32,19:41,19:57,20:11,20:27,20:45,21:05,21:25,21:47,22:07,22:31,22:55,23:17,,,,,,
8,206000004,판교원마을1.2단지.판교도서관,17:10,17:22,17:35,17:43,17:55,18:08,18:22,18:40,18:52,19:08,19:20,19:32,19:43,19:58,20:13,20:28,20:47,21:08,21:27,21:48,22:10,22:32,22:58,23:18,,,17:02,,,
9,206000555,판교청소년수련관.판교종합사회복지관,17:13,17:25,17:36,17:46,17:55,18:10,18:25,18:41,18:55,19:08,19:22,19:35,19:44,20:00,20:14,20:31,20:48,21:08,21:28,21:48,22:10,22:33,22:58,23:18,,,17:02,,,
10,206000003,한림아파트,17:14,17:26,17:36,17:46,17:56,18:11,18:26,18:42,18:56,19:09,19:23,19:36,19:44,20:01,20:14,20:31,20:49,21:09,21:28,21:49,22:11,22:34,22:59,23:19,,,17:02,,,
