## 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np

# DataFrame 화면 출력                                                
# DataFrame 화면 출력 설정이 변경되어 있습니다
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None # default = 60

import datetime

# API 호출
# ## https://data.go.kr/tcs/dss/selectApiDataDetailView.do?publicDataPk=15057440
from urllib.request import Request, urlopen
# Request https://docs.python.org/ko/3/library/urllib.request.html#urllib.request.Request
# urlopen https://docs.python.org/ko/3/library/urllib.request.html#urllib.request.urlopen
from urllib.parse import urlencode, quote_plus
# urlenocde https://docs.python.org/ko/3/library/urllib.parse.html#urllib.parse.urlencode
# quote_plus https://docs.python.org/ko/3/library/urllib.parse.html#urllib.parse.quote_plus
from urllib.parse import unquote
import requests

# XML 파싱
import xml.etree.ElementTree as ET

## 기본값 설정하기

In [2]:
routeId = 204000046
year = 2020
month = 11
day = 17

## Key Unquote

In [3]:
KEY = 'yEaR%2F3MDedRSlVJL%2F2pxnVg0yre1N5VF3RZ%2FUAt56MJ7J2mNpfqhUvy05pXV0uhHTVY7DbyCR8xmMaDdYga67Q%3D%3D' # 종현
# apiKEY = requests.utils.unquote(KEY)
apiKey = unquote(KEY)
# print(apiKey)
KEYSW = 'M%2B4%2FqUiadT8X8PhgFjaQLDu%2BIOgPMURfGsOX%2FmVxwHQVJgnVR%2FMPjDYXkuQNwUFbZXlfnX5Lls3SUCiCLIFjgQ%3D%3D'
apiKeySW = unquote(KEYSW)

## API 호출

In [4]:
# 노선정보항목조회 busrouteservice/info
# routeId의 정류장 목록을 받아옵니다.
url = 'http://openapi.gbis.go.kr/ws/rest/busrouteservice/station'
queryParams = '?' + urlencode({ quote_plus('serviceKey') : apiKey, quote_plus('routeId') : routeId })
request = Request(url + queryParams)
request.get_method = lambda: 'GET'

# API를 호출하여 XML 형식으로 된 string 데이터를 변수 "oneLineXML" 에 저장합니다.
oneLineXML = urlopen(request).read().decode('utf8')
# oneLineXML

## XML 파싱

In [5]:
# xtree는 "openapi" 도메인 API 호출에서 "headerCd"에 상관없이 3개의 태그를 갖습니다.
# [comMsgHeader, msgHeader, msgBody]
xtree = ET.fromstring(oneLineXML)
# for branch in xtree:
#     print(branch)
msgBody = xtree[2]

tagList = ["stationSeq", "stationId", "stationName"]
stationList = []
for branch in msgBody:
    stationList.append([branch.find(tag).text for tag in tagList])

# station_df의 index가 0부터 시작하는 문제를 해결 하기 위해 rangeIndex를 지정하였습니다.
stationCount = len(stationList) # 뒤에서 다시 사용되므로 함수로 선언시 주의
rangeIndex = pd.RangeIndex(start=1, stop=stationCount+1)

station_df = pd.DataFrame(stationList, columns=tagList, index=rangeIndex)
station_df = station_df.astype({
    "stationSeq" : "int32",
    "stationId" : "int32", 
    "stationName" : "string"
    
})
# station_df

## 데이터 불러오기

In [6]:
# routeId 별 정류소 도착시간을 나타내는 2차원 배열을 생성 할 수 있습니다
# 마지막 정류소 번호가 필요합니다 : 경유정류소목록조회 API 서비스를 이용하여 가져옵니다.
# 중간 또는 처음과 끝에 발생하는 결측치를 채워야 합니다.


rootPath = 'C:/Users/jongh/OneDrive/School/석사/1-2/자료구조/프로젝트/5.DATA/Master/dataAPI/buslocationservice/'
outputPath = rootPath + str(routeId) + '/' + str(routeId) + '_' + str(year)[-2:] + '-' + str(month) + '-' + str(day) + '.txt'

itemTagList = ['endBus', 'lowPlate', 'plateNo', 'plateType', 'remainSeatCnt', 'routeId', 'stationId', 'stationSeq', 'Nan']
# itemTagList = ['endBus', 'plateNo', 'plateType', 'remainSeatCnt', 'routeId', 'stationId', 'stationSeq', 'Nan']
itemTagList.insert(0, 'DateTime')
_df = pd.read_csv(outputPath, sep=' ', skiprows=[0], names=itemTagList)
_df = _df.drop(["Nan"], axis=1)
_df = _df.sort_values(['plateNo'])
_df = _df.dropna()
# _df = _df.drop(_df[_df["endBus"]=="API"].index)

# string을 datetime객체로 변환하기
_df['DateTime'] = pd.to_datetime(_df['DateTime'])
# dateTimeObj = datetime.datetime.strptime(_df['DateTime'], '%Y-%m-%dT%H:%M:%S.%f%z')
# _df['DateTime'] = _df.DateTime.str.split('.').str[0]
# _df['DateTime'] = _df.DateTime.str.replace('T', ' ')

_df = _df.astype({
    'endBus': 'int32', 
    'lowPlate': 'int32', 
    'plateNo': 'string', 
    'plateType': 'int32',
    'remainSeatCnt': 'int32', 
    'routeId': 'int32', 
    'stationId': 'int32', 
    'stationSeq': 'int32'
})
# _df.dtypes

In [7]:
# stationSeq 가 뒤바뀌는 사례도 발생합니다. 
# cf) routeId = 204000046, DateTime = 20-11-17, plateNo = 경기78아1147, index = 166
_df[_df['plateNo']=='경기78아1147'].sort_values(by='DateTime').reset_index(drop=True).iloc[160:170] 

Unnamed: 0,DateTime,endBus,lowPlate,plateNo,plateType,remainSeatCnt,routeId,stationId,stationSeq
160,2020-11-17 09:59:59.366742+09:00,0,0,경기78아1147,3,37,204000046,100000001,24
161,2020-11-17 10:00:57.500708+09:00,0,0,경기78아1147,3,41,204000046,101000141,25
162,2020-11-17 10:01:55.394719+09:00,0,0,경기78아1147,3,41,204000046,101000141,25
163,2020-11-17 10:02:53.268866+09:00,0,0,경기78아1147,3,41,204000046,101000148,31
164,2020-11-17 10:03:51.302821+09:00,0,0,경기78아1147,3,41,204000046,101000148,31
165,2020-11-17 10:04:49.286911+09:00,0,0,경기78아1147,3,41,204000046,101000148,31
166,2020-11-17 10:05:47.511093+09:00,0,0,경기78아1147,3,42,204000046,101000264,26
167,2020-11-17 10:06:45.535132+09:00,0,0,경기78아1147,3,42,204000046,101000264,26
168,2020-11-17 10:07:43.579278+09:00,0,0,경기78아1147,3,42,204000046,101000264,26
169,2020-11-17 10:08:41.213157+09:00,0,0,경기78아1147,3,42,204000046,101000264,26


In [8]:
# 차량 운행기록을 "plateNo"를 기준으로 구분합니다.
plateNoList = _df.groupby(['plateNo']).size().index
plateNoList

Index(['강원71바1297', '경기70아7361', '경기70아7474', '경기70아7482', '경기70아8601',
       '경기70아8629', '경기70아8789', '경기70아8880', '경기76사5089', '경기77바1097',
       '경기77바1198', '경기78아1117', '경기78아1147', '경기78아1178'],
      dtype='object', name='plateNo')

In [9]:
# 정류장의 수 만큼 행을 갖는 빈 데이터프레임을 생성합니다. 변수 rangeIndex는 [XML파싱]에서 선언되어 있습니다.
_oneDay_df = pd.DataFrame(index = rangeIndex)

# 각 차량 번호를 기준으로 반복문을 수행합니다.
for plateNo in plateNoList:
    
    # 특정 차량번호와 같은 운행기록을 "DateTime을 기준으로 오름차순으로 정렬합니다."
    sameBusList = _df[_df['plateNo'] == plateNo].sort_values(['DateTime']).reset_index(drop=True)
#     print(sameBusList)
    
    
    # 버스는 뒤로가지 않으므로
    # "stationSeq"가 작아지는 인덱스 i의 리스트를 만듭니다.
    indexList = []
    for i in range(1,len(sameBusList)):
        if(sameBusList.iloc[i-1]['stationSeq'] > sameBusList.iloc[i]['stationSeq']):
            indexList.append(i)
            # print((sameBusList.iloc[i-1]['stationSeq'] , sameBusList.iloc[i]['stationSeq']))
            
    indexList.append(len(sameBusList))
    # print(indexList)
    
    # 첫번재 인덱스, 인덱스의 리스트, 마지막 인덱스를 조합하여 호차별로 운행기록을 구분합니다.
    index = 0
    tupleList = []
    for i in indexList:
        tupleList.append((index, i))
        small_df = sameBusList.iloc[index:i].groupby(by=['stationSeq']).min()
        
        # 정류소 데이터프레임에 합칩니다.
        merge_df = pd.merge(station_df, small_df, how='outer', on='stationId')
        # merge_df = merge_df.set_index(keys='stationSeq')
        
        # 1번 정류장에 대한 DateTime의 결측치가 많으면 1일 시간표에서 column 전체가 정렬이 안되므로 bfill방식으로 두번 결측치를 채웁니다.
        merge_df['DateTime'] = merge_df['DateTime'].interpolate(method='bfill', limit=2)
        
        if(0):
            # 결측치를 채울 경우 문제가 되는 데이터가 잘 드러나지 않으므로 결측치는 가장 마지막에 채우세요.
            # if의 인수가 1이면 결측치를 처리합니다. 결측치 처리를 하지 않으려면 if의 인수를 0으로 수정하세요. 
            # df.interpolate(method='linear' or 'polynomial')
            # 시간에 대한 결측치 처리
            merge_df['DateTime'] = merge_df['DateTime'].interpolate(method='backfill', limit=3)
            merge_df['DateTime'] = merge_df['DateTime'].interpolate(method='pad', limit=3)

            # 빈좌석에 대한 결측치 처리, limit 값이 다름에 주의
            merge_df['remainSeatCnt'] = merge_df['remainSeatCnt'].interpolate(method='linear', limit=3, limit_direction='both')

            merge_df['endBus'] = merge_df['endBus'].interpolate(method='pad', limit=3)        
            merge_df['endBus'] = merge_df['endBus'].interpolate(method='backfill', limit=3)

            merge_df['lowPlate'] = merge_df['lowPlate'].interpolate(method='pad', limit=3)        
            merge_df['lowPlate'] = merge_df['lowPlate'].interpolate(method='backfill', limit=3)

            merge_df['plateType'] = merge_df['plateType'].interpolate(method='pad', limit=3)        
            merge_df['plateType'] = merge_df['plateType'].interpolate(method='backfill', limit=3)

            merge_df['routeId'] = merge_df['routeId'].interpolate(method='pad', limit=3)        
            merge_df['routeId'] = merge_df['routeId'].interpolate(method='backfill', limit=3)

        _oneDay_df[str(plateNo)+'_'+str(index)] = merge_df['DateTime']
        index=i
        #print(str(plateNo)+'_'+str(index))
        #display(merge_df)
#         break;
#     break;

# _oneDay_df

In [10]:
# 출차시간(1행)을 기준으로 column을 정렬합니다.
oneDay_df = _oneDay_df.sort_values(by=1, axis=1)

In [11]:
# datetime 객체를 string 객체로 변환합니다. 시간을 가지고 계산해야 하는 일은 이전에 실행하세요.
oneDay_df = oneDay_df.astype('string')
for column in oneDay_df.columns:
    oneDay_df[column] = oneDay_df[column].str.slice(start=10, stop=16)

In [12]:
# [column명 : 차량번호] station_df 와 oneDay_df를 합칩니다. datetime이 아닌 다른 타입의 오브젝트가 행에 붙게 되므로 지금부터는 편집이 어렵습니다.
_finalOneDay_df = pd.concat([station_df,oneDay_df], axis=1)
_finalOneDay_df.set_index('stationSeq', drop=True, inplace=True)
# _finalOneDay_df

In [13]:
# 정렬된 column에 출차 순서를 배정합니다. (1호차, 2호차, ...)
columnCount = len(oneDay_df.columns)
oneDay_df.columns = np.arange(1,columnCount+1)

In [14]:
# [column명 : n호차] station_df 와 oneDay_df를 합칩니다. datetime이 아닌 다른 타입의 오브젝트가 행에 붙게 되므로 지금부터는 편집이 어렵습니다.
finalOneDay_df = pd.concat([station_df,oneDay_df], axis=1)
df = finalOneDay_df.set_index('stationSeq', drop=True)

In [15]:
df

Unnamed: 0_level_0,stationId,stationName,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
stationSeq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1,206000010,운중동먹거리촌,05:01,05:21,05:42,06:01,06:12,06:25,06:36,06:46,06:56,07:03,07:11,07:16,07:23,07:37,07:46,07:57,08:03,08:13,08:25,08:36,08:49,09:00,09:16,09:31,09:47,10:00,10:16,10:30,10:46,11:01,11:16,11:30,11:46,12:00,12:21,12:41,13:00,13:20,13:45,14:04,14:20,14:41,15:01,15:16,15:31,15:45,16:01,16:15,16:45,17:14,17:25,17:37,17:49,18:01,18:16,,,,
2,206000009,운중동푸르지오하임,05:02,05:21,05:43,06:02,06:13,06:25,06:37,06:46,06:56,07:04,07:11,07:17,07:24,07:39,07:46,07:57,08:04,08:13,08:25,08:37,08:49,09:01,09:17,09:31,09:47,10:01,10:17,10:32,10:47,11:02,11:16,11:31,11:46,12:01,12:21,12:42,13:01,13:21,13:45,14:05,14:21,14:41,15:02,15:17,15:31,15:47,16:02,16:16,16:46,17:15,17:25,17:37,17:50,18:03,18:17,,,,
3,206000658,한빛교회.월든힐스아파트,05:03,05:22,05:44,06:03,06:15,06:28,06:38,06:47,06:58,07:05,07:12,07:18,07:25,07:40,07:47,07:58,08:05,08:15,08:26,08:40,08:50,09:02,09:18,09:32,09:48,10:02,10:18,10:32,10:48,11:02,11:18,11:32,11:48,12:02,12:22,12:43,13:02,13:22,13:47,14:08,14:22,14:42,15:03,15:20,15:33,15:50,16:03,16:17,16:47,17:15,17:25,17:40,17:50,18:05,18:18,17:07,,,
4,206000558,운중초등학교,05:04,05:23,05:46,06:04,06:16,06:28,06:39,06:48,06:59,07:06,07:14,07:19,07:27,07:41,07:49,07:59,08:06,08:16,08:26,08:40,08:51,09:03,09:19,09:33,09:49,10:03,10:18,10:34,10:49,11:04,11:19,11:33,11:49,12:03,12:23,12:44,13:03,13:23,13:48,14:08,14:24,14:43,15:04,15:21,15:33,15:50,16:04,16:19,16:48,17:16,17:26,17:40,17:51,18:05,18:19,17:07,16:38,,
5,206000007,운중중학교,05:05,05:24,05:47,06:05,06:17,06:30,06:39,06:50,07:00,07:07,07:17,07:22,07:28,07:44,07:52,08:00,08:07,08:19,08:28,08:42,08:52,09:05,09:20,09:34,09:49,10:04,10:20,10:35,10:52,11:05,11:20,11:34,11:50,12:05,12:25,12:45,13:05,13:25,13:49,14:10,14:25,14:45,15:05,15:22,15:35,15:52,16:05,16:20,16:49,17:17,17:27,17:42,17:53,18:07,18:20,17:07,16:38,,
6,206000006,운중동행정복지센터,05:07,05:25,05:49,06:05,06:19,06:32,06:42,06:52,07:02,07:09,07:20,07:23,07:30,07:45,07:53,08:02,08:09,08:20,08:30,08:45,08:55,09:07,09:22,09:37,09:52,10:07,10:22,10:37,10:55,11:07,11:22,11:37,11:53,12:07,12:27,12:47,13:07,13:27,13:52,14:12,14:27,14:47,15:08,15:24,15:36,15:53,16:07,16:22,16:50,17:20,17:29,17:44,17:55,18:09,18:22,17:08,16:38,,
7,206000005,뫼루니육교,05:08,05:27,05:50,06:08,06:20,06:33,06:42,06:55,07:03,07:10,07:22,07:25,07:30,07:48,07:55,08:05,08:12,08:22,08:31,08:48,08:58,09:10,09:23,09:38,09:53,10:08,10:25,10:40,10:55,11:10,11:23,11:37,11:53,12:08,12:28,12:48,13:08,13:28,13:55,14:13,14:28,14:48,15:09,15:25,15:38,15:55,16:07,16:23,16:52,17:21,17:32,17:45,17:56,18:10,18:23,17:10,16:40,,
8,206000004,판교원마을1.2단지.판교도서관,05:11,05:28,05:53,06:08,06:23,06:36,06:45,06:58,07:06,07:13,07:26,07:28,07:33,07:51,07:58,08:07,08:15,08:25,08:33,08:50,09:00,09:13,09:26,09:40,09:56,10:10,10:26,10:43,10:57,11:11,11:25,11:38,11:55,12:10,12:30,12:50,13:08,13:30,13:55,14:15,14:30,14:50,15:11,15:28,15:38,15:56,16:10,16:25,16:55,17:23,17:33,17:46,17:56,18:13,18:27,17:13,16:42,,
9,206000555,판교청소년수련관.판교종합사회복지관,05:12,05:29,05:54,06:09,06:23,06:37,06:46,06:59,07:06,07:14,07:27,07:29,07:34,07:53,07:59,08:08,08:16,08:26,08:33,08:52,09:01,09:14,09:27,09:41,09:57,10:11,10:27,10:44,10:58,11:11,11:25,11:39,11:56,12:11,12:31,12:51,13:09,13:31,13:57,14:16,14:31,14:51,15:12,15:29,15:39,15:57,16:11,16:26,16:56,17:24,17:34,17:47,17:58,18:15,18:29,17:14,16:43,,
10,206000003,한림아파트,05:14,05:32,05:56,06:11,06:25,06:39,06:48,07:01,07:08,07:16,07:28,07:31,07:36,07:54,08:01,08:10,08:18,08:28,08:35,08:53,09:03,09:16,09:29,09:43,09:59,10:13,10:28,10:46,11:00,11:13,11:26,11:40,11:58,12:13,12:31,12:53,13:11,13:31,13:58,14:18,14:32,14:51,15:13,15:31,15:40,15:58,16:13,16:28,16:58,17:24,17:34,17:48,17:58,18:16,18:30,17:16,16:45,,
