# 대도시권 광역교통 빅데이터 분석 시각화 및 활용 아이디어 경진대회
##### [데이콘 페이지](https://dacon.io/competitions/official/236009/overview/description)

#### 주제: 대도시권 광역교통 분야 분석 시각화 및 개선 아이디어 제안
###### <예시>
- 교통데이터 활용 광역 비즈니스 모델 수립, 대도시권 교통 문제 개선 제안
- 광역교통서비스 평가지표 신규제안, 기존 평가지표 측정 방안 개선 제안
- 광연 MaaS, M-DRT 도입에 대한 정책 제안
- 광역교통 주차수요 분석, 연계 환승통행체계 방법론 제안

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl

from matplotlib import rc
rc("font", family = "Malgun Gothic")

get_ipython().run_line_magic("matplotlib", "inline")

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

import time
from tqdm.notebook import tqdm

In [3]:
from glob import glob
import warnings
warnings.filterwarnings('ignore')

# 1. 대중교통 이용객 수요 데이터 수집
- [참고 사이트](https://stcis.go.kr/pivotIndi/wpsPivotIndicator.do?siteGb=P&indiClss=IC04)

In [5]:
url = "https://stcis.go.kr/pivotIndi/wpsPivotIndicator.do?siteGb=P&indiClss=IC04"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('C:/Users/gktna/Documents/study/driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

### (1) 하나씩 해보기
#### (1)-1 기간 선택

In [108]:
month_but = driver.find_element(By.XPATH, '//*[@id="divRdoDate"]/li[2]')
month_but.click()

In [110]:
cal_but = driver.find_element(By.XPATH, '//*[@id="date2"]/li[1]/img')
cal_but.click() 

In [111]:
year_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[1]/div/select[1]/option[4]')
year_but.click()

In [112]:
day_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[1]/div/select[2]')
day_but.send_keys(3)

In [113]:
ok_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[2]/button')
ok_but.click()

#### (1)-2 출발지 선택
 - 지역별 구명은 리스트로 따로 저장

In [10]:
sigungu_but = driver.find_element(By.XPATH, '//*[@id="divRdoODAear"]/li[2]')
sigungu_but.click()

In [11]:
sido_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSd"]')
sido_but.send_keys("서울특별시")

In [12]:
ss_list = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSgg"]').text.split('\n')[1:]
ss_list

['강남구',
 '강동구',
 '강북구',
 '강서구',
 '관악구',
 '광진구',
 '구로구',
 '금천구',
 '노원구',
 '도봉구',
 '동대문구',
 '동작구',
 '마포구',
 '서대문구',
 '서초구',
 '성동구',
 '성북구',
 '송파구',
 '양천구',
 '영등포구',
 '용산구',
 '은평구',
 '종로구',
 '중구',
 '중랑구']

In [13]:
sido_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSd"]')
sido_but.send_keys("경기도")

In [14]:
gg_list = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSgg"]').text.split('\n')[1:]
gg_list

['가평군',
 '고양시',
 '과천시',
 '광명시',
 '광주시',
 '구리시',
 '군포시',
 '김포시',
 '남양주시',
 '동두천시',
 '부천시',
 '성남시',
 '수원시',
 '시흥시',
 '안산시',
 '안성시',
 '안양시',
 '양주시',
 '양평군',
 '여주시',
 '연천군',
 '오산시',
 '용인시',
 '의왕시',
 '의정부시',
 '이천시',
 '파주시',
 '평택시',
 '포천시',
 '하남시',
 '화성시']

In [15]:
gg_list[12]

'수원시'

In [16]:
sido_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSd"]')
sido_but.send_keys("인천광역시")

In [17]:
ii_list = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSgg"]').text.split('\n')[1:]
ii_list

['강화군', '계양구', '남동구', '동구', '미추홀구', '부평구', '서구', '연수구', '옹진군', '중구']

#### (1)-3 도착지 선택

In [70]:
select_all_but = driver.find_element(By.XPATH, '//*[@id="OD_end_box"]/div[1]/ul[2]/li/label[2]')
select_all_but.click()

In [71]:
res_but = driver.find_element(By.XPATH, '//*[@id="btnSearch"]/button')
res_but.click()

#### (1)-4 결과 저장

In [7]:
load_but = driver.find_element(By.XPATH, '//*[@id="rgrstyReportResult"]/h2/p/span[1]')
load_but.click()


#### (1)-5 뒤로가기

In [8]:
back_but = driver.find_element(By.XPATH, '//*[@id="tab1"]/div[2]/div[3]/button')
back_but.click()

In [75]:
driver.close()

### (2) 반복문으로 2019-2021 년도의 자료 수집하기
- 서울, 인천, 경기 지역 자료만 수집

In [18]:
si_list = ['서울특별시', '인천광역시', '경기도']
gu_list = [ss_list, ii_list, gg_list]

In [107]:
url = "https://stcis.go.kr/pivotIndi/wpsPivotIndicator.do?siteGb=P&indiClss=IC04"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

In [22]:
for i in tqdm(range(0, 3)):
    for k in range(len(gu_list[i])):
        sigungu_but = driver.find_element(By.XPATH, '//*[@id="divRdoODAear"]/li[2]')
        sigungu_but.click()

        sido_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSd"]')
        sido_but.send_keys(si_list[i])

        sigungu_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSgg"]')
        sigungu_but.send_keys(gu_list[i][k])

        select_all_but = driver.find_element(By.XPATH, '//*[@id="OD_end_box"]/div[1]/ul[2]/li/label[2]')
        select_all_but.click()

        res_but = driver.find_element(By.XPATH, '//*[@id="btnSearch"]/button')
        res_but.click()

        time.sleep(5)

        try:
            driver.find_element(By.XPATH, '//*[@id="rgrstyReportResult"]/h2/p/span[1]').click()  #다운로드 버튼
        except:
            print( gu_list[i][k], ' 다운로드 실패')




        time.sleep(5)

        back_but = driver.find_element(By.XPATH, '//*[@id="tab1"]/div[2]/div[3]/button')
        back_but.click()

        time.sleep(1) 

  0%|          | 0/3 [00:00<?, ?it/s]

고양시  다운로드 실패
성남시  다운로드 실패
수원시  다운로드 실패
용인시  다운로드 실패


In [116]:
#2019년 3월
for i in tqdm(range(0, 3)):

    month_but = driver.find_element(By.XPATH, '//*[@id="divRdoDate"]/li[2]')
    month_but.click()

    cal_but = driver.find_element(By.XPATH, '//*[@id="date2"]/li[1]/img')
    cal_but.click() 

    year_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[1]/div/select[1]/option[2]') 
    year_but.click()

    day_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[1]/div/select[2]')
    day_but.send_keys(3)

    ok_but = driver.find_element(By.XPATH, '//*[@id="ui-datepicker-div"]/div[2]/button')
    ok_but.click()

    for k in range(len(gu_list[i])):

        sigungu_but = driver.find_element(By.XPATH, '//*[@id="divRdoODAear"]/li[2]')
        sigungu_but.click()

        sido_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSd"]')
        sido_but.send_keys(si_list[i])

        sigungu_but = driver.find_element(By.XPATH, '//*[@id="searchStgptZoneSgg"]')
        sigungu_but.send_keys(gu_list[i][k])

        select_all_but = driver.find_element(By.XPATH, '//*[@id="OD_end_box"]/div[1]/ul[2]/li/label[2]')
        select_all_but.click()

        res_but = driver.find_element(By.XPATH, '//*[@id="btnSearch"]/button')
        res_but.click()

        time.sleep(5)

        try:
            driver.find_element(By.XPATH, '//*[@id="rgrstyReportResult"]/h2/p/span[1]').click()  #다운로드 버튼
        except:
            print( gu_list[i][k], ' 다운로드 실패')
            



        time.sleep(5)

        back_but = driver.find_element(By.XPATH, '//*[@id="tab1"]/div[2]/div[3]/button')
        back_but.click()

        time.sleep(1) 

  0%|          | 0/3 [00:00<?, ?it/s]

고양시  다운로드 실패
성남시  다운로드 실패
수원시  다운로드 실패
용인시  다운로드 실패


- 고양, 성남, 수원, 용인시 파일은 수동으로 다운로드!

In [23]:
driver.close()
driver.quit()

## (3) 수집한 데이터 정리
- glob를 사용해 다운받은 파일을 한번에 불러오기

In [24]:
od_files_2019 = glob("./OD_2019_03/이용객*.xlsx")
od_files_2020 = glob("./OD_2020_03/이용객*.xlsx")
od_files_2021 = glob("./OD_2021_03/이용객*.xlsx")
od_files_2019[0]

'./OD_2019_03\\이용객 수요 일반버스·도시철도 이용 OD_20221123 (1).xlsx'

In [25]:
# 하나만 읽어보기
tmp = pd.read_excel(od_files_2019[0], header=0)
tmp

Unnamed: 0,일자,시도(출발),시군구(출발),읍면동(출발),시도(도착),시군구(도착),읍면동(도착),통행량,통행시간,통행거리
0,2019-03-13(수),서울특별시,강동구,-,강원도,춘천시,-,71,-,-
1,2019-03-13(수),서울특별시,강동구,-,경기도,가평군,-,28,-,-
2,2019-03-13(수),서울특별시,강동구,-,경기도,고양시,-,704,-,-
3,2019-03-13(수),서울특별시,강동구,-,경기도,과천시,-,218,-,-
4,2019-03-13(수),서울특별시,강동구,-,경기도,광명시,-,271,-,-
...,...,...,...,...,...,...,...,...,...,...
65,2019-03-13(수),서울특별시,강동구,-,인천광역시,연수구,-,81,-,-
66,2019-03-13(수),서울특별시,강동구,-,인천광역시,옹진군,-,1,-,-
67,2019-03-13(수),서울특별시,강동구,-,인천광역시,중구,-,149,-,-
68,2019-03-13(수),서울특별시,강동구,-,충청남도,아산시,-,13,-,-


In [26]:
# 하나만 읽어보기
tmp = pd.read_excel(od_files_2021[0], header=0)
tmp

Unnamed: 0,일자,시도(출발),시군구(출발),읍면동(출발),시도(도착),시군구(도착),읍면동(도착),통행량,통행시간,통행거리
0,2021-03-17(수),서울특별시,강동구,-,강원도,춘천시,-,65,126.38,88947
1,2021-03-17(수),서울특별시,강동구,-,경기도,가평군,-,31,96.62,51460
2,2021-03-17(수),서울특별시,강동구,-,경기도,고양시,-,561,90.14,38449
3,2021-03-17(수),서울특별시,강동구,-,경기도,과천시,-,183,63.21,23120
4,2021-03-17(수),서울특별시,강동구,-,경기도,광명시,-,211,74.64,31534
...,...,...,...,...,...,...,...,...,...,...
64,2021-03-17(수),서울특별시,강동구,-,인천광역시,연수구,-,37,121.11,58783
65,2021-03-17(수),서울특별시,강동구,-,인천광역시,중구,-,72,108.00,52652
66,2021-03-17(수),서울특별시,강동구,-,충청남도,아산시,-,2,179.94,109883
67,2021-03-17(수),서울특별시,강동구,-,충청남도,천안시,-,15,162.99,100146


In [27]:
tmp_raw = []

for file_name in od_files_2019:
    tmp = pd.read_excel(file_name, header=0)
    tmp_raw.append(tmp)
    
for file_name in od_files_2020:
    tmp = pd.read_excel(file_name, header=0)
    tmp_raw.append(tmp)
    
for file_name in od_files_2021:
    tmp = pd.read_excel(file_name, header=0)
    tmp_raw.append(tmp)

In [28]:
tmp_raw[:2]

[               일자 시도(출발) 시군구(출발) 읍면동(출발) 시도(도착) 시군구(도착) 읍면동(도착)  통행량 통행시간 통행거리
 0   2019-03-13(수)  서울특별시     강동구       -    강원도     춘천시       -   71    -    -
 1   2019-03-13(수)  서울특별시     강동구       -    경기도     가평군       -   28    -    -
 2   2019-03-13(수)  서울특별시     강동구       -    경기도    고양시        -  704    -    -
 3   2019-03-13(수)  서울특별시     강동구       -    경기도     과천시       -  218    -    -
 4   2019-03-13(수)  서울특별시     강동구       -    경기도     광명시       -  271    -    -
 ..            ...    ...     ...     ...    ...     ...     ...  ...  ...  ...
 65  2019-03-13(수)  서울특별시     강동구       -  인천광역시     연수구       -   81    -    -
 66  2019-03-13(수)  서울특별시     강동구       -  인천광역시     옹진군       -    1    -    -
 67  2019-03-13(수)  서울특별시     강동구       -  인천광역시      중구       -  149    -    -
 68  2019-03-13(수)  서울특별시     강동구       -   충청남도     아산시       -   13    -    -
 69  2019-03-13(수)  서울특별시     강동구       -   충청남도    천안시        -   15    -    -
 
 [70 rows x 10 columns],
             

In [29]:
od_raw = pd.concat(tmp_raw)
od_raw

Unnamed: 0,일자,시도(출발),시군구(출발),읍면동(출발),시도(도착),시군구(도착),읍면동(도착),통행량,통행시간,통행거리
0,2019-03-13(수),서울특별시,강동구,-,강원도,춘천시,-,71,-,-
1,2019-03-13(수),서울특별시,강동구,-,경기도,가평군,-,28,-,-
2,2019-03-13(수),서울특별시,강동구,-,경기도,고양시,-,704,-,-
3,2019-03-13(수),서울특별시,강동구,-,경기도,과천시,-,218,-,-
4,2019-03-13(수),서울특별시,강동구,-,경기도,광명시,-,271,-,-
...,...,...,...,...,...,...,...,...,...,...
64,2021-03-17(수),서울특별시,강남구,-,인천광역시,연수구,-,871,93.08,49735
65,2021-03-17(수),서울특별시,강남구,-,인천광역시,옹진군,-,2,141.63,82852
66,2021-03-17(수),서울특별시,강남구,-,인천광역시,중구,-,745,87.64,42374
67,2021-03-17(수),서울특별시,강남구,-,충청남도,아산시,-,22,163.25,105612


In [30]:
od_df = pd.DataFrame({"일자":od_raw["일자"],
                        "시도(출발)":od_raw["시도(출발)"],
                        "시군구(출발)":od_raw["시군구(출발)"],
                        "시도(도착)":od_raw["시도(도착)"],
                        "시군구(도착)":od_raw["시군구(도착)"],
                         "통행량":od_raw["통행량"]})
od_df

Unnamed: 0,일자,시도(출발),시군구(출발),시도(도착),시군구(도착),통행량
0,2019-03-13(수),서울특별시,강동구,강원도,춘천시,71
1,2019-03-13(수),서울특별시,강동구,경기도,가평군,28
2,2019-03-13(수),서울특별시,강동구,경기도,고양시,704
3,2019-03-13(수),서울특별시,강동구,경기도,과천시,218
4,2019-03-13(수),서울특별시,강동구,경기도,광명시,271
...,...,...,...,...,...,...
64,2021-03-17(수),서울특별시,강남구,인천광역시,연수구,871
65,2021-03-17(수),서울특별시,강남구,인천광역시,옹진군,2
66,2021-03-17(수),서울특별시,강남구,인천광역시,중구,745
67,2021-03-17(수),서울특별시,강남구,충청남도,아산시,22


In [35]:
od_df.reset_index(inplace=True, drop=True)

for idx, row in tqdm(od_df.iterrows()):
    od_df.iloc[idx, 0] = row["일자"][:-6]
    
od_df

0it [00:00, ?it/s]

Unnamed: 0,일자,시도(출발),시군구(출발),시도(도착),시군구(도착),통행량
0,2019-03,서울특별시,강동구,강원도,춘천시,71
1,2019-03,서울특별시,강동구,경기도,가평군,28
2,2019-03,서울특별시,강동구,경기도,고양시,704
3,2019-03,서울특별시,강동구,경기도,과천시,218
4,2019-03,서울특별시,강동구,경기도,광명시,271
...,...,...,...,...,...,...
12964,2021-03,서울특별시,강남구,인천광역시,연수구,871
12965,2021-03,서울특별시,강남구,인천광역시,옹진군,2
12966,2021-03,서울특별시,강남구,인천광역시,중구,745
12967,2021-03,서울특별시,강남구,충청남도,아산시,22


In [37]:
od_df.to_excel('./od_df.xlsx', encoding='utf-8')

In [38]:
od_df = pd.read_excel('./od_df.xlsx', index_col=0)
od_df.head()

Unnamed: 0,일자,시도(출발),시군구(출발),시도(도착),시군구(도착),통행량
0,2019-03,서울특별시,강동구,강원도,춘천시,71
1,2019-03,서울특별시,강동구,경기도,가평군,28
2,2019-03,서울특별시,강동구,경기도,고양시,704
3,2019-03,서울특별시,강동구,경기도,과천시,218
4,2019-03,서울특별시,강동구,경기도,광명시,271


In [139]:
od_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13646 entries, 0 to 13645
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   일자       13646 non-null  object
 1   시도(출발)   13646 non-null  object
 2   시군구(출발)  13646 non-null  object
 3   시도(도착)   13646 non-null  object
 4   시군구(도착)  13646 non-null  object
 5   통행량      13646 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 746.3+ KB


In [39]:
si_list = ['서울특별시', '인천광역시', '경기도']

df = od_df[od_df['시도(도착)'].isin(si_list)]
df.sort_values(by=['일자', '시도(출발)', '시군구(출발)'], inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,일자,시도(출발),시군구(출발),시도(도착),시군구(도착),통행량
0,2019-03,경기도,가평군,경기도,가평군,3014
1,2019-03,경기도,가평군,경기도,고양시,16
2,2019-03,경기도,가평군,경기도,과천시,1
3,2019-03,경기도,가평군,경기도,광명시,6
4,2019-03,경기도,가평군,경기도,구리시,142
...,...,...,...,...,...,...
12406,2021-03,인천광역시,중구,인천광역시,부평구,3672
12407,2021-03,인천광역시,중구,인천광역시,서구,3736
12408,2021-03,인천광역시,중구,인천광역시,연수구,3686
12409,2021-03,인천광역시,중구,인천광역시,옹진군,2


In [40]:
df_2019 = df[df['일자'] == '2019-03']
df_2020 = df[df['일자'] == '2020-03']
df_2021 = df[df['일자'] == '2021-03']

In [41]:
pd.pivot_table(data=df, index=['시도(출발)', '시군구(출발)', '일자'],
               columns=['시도(도착)', '시군구(도착)'], values='통행량',
              aggfunc=np.sum, fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,시도(도착),경기도,경기도,경기도,경기도,경기도,경기도,경기도,경기도,경기도,경기도,...,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시,인천광역시
Unnamed: 0_level_1,Unnamed: 1_level_1,시군구(도착),가평군,고양시,과천시,광명시,광주시,구리시,군포시,김포시,남양주시,동두천시,...,강화군,계양구,남동구,동구,미추홀구,부평구,서구,연수구,옹진군,중구
시도(출발),시군구(출발),일자,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
경기도,가평군,2019-03,3014,16,1,6,0,142,1,0,1326,5,...,0,1,3,0,2,8,3,2,0,5
경기도,가평군,2020-03,1770,14,1,6,1,117,1,1,857,2,...,0,2,1,0,2,3,1,0,0,2
경기도,가평군,2021-03,2383,20,3,8,5,116,3,6,1037,2,...,0,2,2,0,6,11,4,1,0,1
경기도,고양시,2019-03,14,247645,328,419,110,256,368,3681,399,62,...,73,290,179,16,180,368,563,172,0,280
경기도,고양시,2020-03,14,159559,179,280,65,161,175,2409,254,32,...,34,191,136,9,120,239,383,91,0,150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
인천광역시,옹진군,2020-03,0,0,0,2,0,0,1,0,0,0,...,0,0,19,0,15,4,0,7,6,4
인천광역시,옹진군,2021-03,0,0,0,0,0,0,0,0,0,0,...,0,3,35,0,3,3,0,3,7,2
인천광역시,중구,2019-03,5,273,38,200,9,43,110,516,65,29,...,9,3600,5688,2429,4305,5755,6722,5470,5,25245
인천광역시,중구,2020-03,4,160,25,130,8,25,63,361,33,10,...,6,2199,3791,1621,7978,3570,4499,3583,3,18771


In [42]:
total_df = pd.DataFrame(pd.pivot_table(data=df, index=['시도(출발)', '시군구(출발)', '일자'],
               columns=['시도(도착)', '시군구(도착)'], values='통행량',
              aggfunc=np.sum, fill_value=0))
total_df.to_excel('./total_df.xlsx', encoding='utf-8')

----
# 2. 버스 정류장 자료
### 1. 자료의 범위
#### <지역>
- 수도권: 서울, 인천, 경기도
- 부울권: 부산, 울산, 양산, 김해, 창원, 경주
- 대구권: 대구, 구미, 경산, 영천, 군위, 청도, 고령, 성주, 칠곡, 창녕
- 광주권: 광주, 나주, 담양, 화순, 함평, 장성
- 대전권: 대전, 세종, 공주, 논산, 계룡, 금산, 청주, 보은, 옥천

#### <시간>
- 2019-2021년 3월 중 평일 1일

### 2. 자료 내용
- Stop : 대중교통 정차지 정보
- Route : 대중교통 노선 기초정보
- RouteStop : 대중교통 노선별 경유정차지 정보
- StopTime : 대중교통 노선별, 운행회차별 정차순번 및 도착/출발시각 정보

In [None]:
Stop_bus_2019 = pd.read_excel("./2019.03/201903_1_버스_1_BaseInfo.xlsx", sheet_name=0)
Route_bus_2019 = pd.read_excel("./2019.03/201903_1_버스_1_BaseInfo.xlsx", sheet_name=1)
RouteStop_bus_2019 = pd.read_excel("./2019.03/201903_1_버스_1_BaseInfo.xlsx", sheet_name=2)
StopTime_bus_2019 = pd.read_csv("./2019.03/201903_1_버스_2_StopTime.txt")

Stop_bus_2020 = pd.read_excel("./2020.03/202003_1_버스_1_BaseInfo.xlsx", sheet_name=0)
Route_bus_2020 = pd.read_excel("./2020.03/202003_1_버스_1_BaseInfo.xlsx", sheet_name=1)
RouteStop_bus_2020 = pd.read_excel("./2020.03/202003_1_버스_1_BaseInfo.xlsx", sheet_name=2)
StopTime_bus_2020 = pd.read_csv("./2020.03/202003_1_버스_2_StopTime.txt")

Stop_bus_2021 = pd.read_excel("./2021.03/202103_1_버스_1_BaseInfo.xlsx", sheet_name=0)
Route_bus_2021 = pd.read_excel("./2021.03/202103_1_버스_1_BaseInfo.xlsx", sheet_name=1)
RouteStop_bus_2021 = pd.read_excel("./2021.03/202103_1_버스_1_BaseInfo.xlsx", sheet_name=2)
StopTime_bus_2021 = pd.read_csv("./2021.03/202103_1_버스_2_StopTime.txt")

In [None]:
Route_bus = pd.concat([Route_bus_2019, Route_bus_2020, Route_bus_2021])
Route_bus.tail()

In [6]:
stt = list(Route_bus['Start'].unique())
rtn = list(Route_bus['Return'].unique())
end = list(Route_bus['End'].unique())

In [7]:
stop_bus_list_raw = stt + rtn + end
stop_bus_list = list(set(stop_bus_list_raw))
len(stop_bus_list)

7131

- set() 을 이용하면 리스트 내 중복을 제거할 수 있다.
- 반환값은 list형이 아니라서 list()로 감싸야 한다.

### 버스정류장 위치 구하기

In [97]:
url = "https://map.naver.com/v5/bus/bus-station/?c=14136568.1676528,4516193.4217655,18,0,0,0,dh"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

#### 검색창-검색어 입력

In [99]:
search_but = driver.find_element(By.XPATH, '//*[@id="input_search1667622936460"]')
search_but.send_keys(stop_bus_list[0])

#### Enter 버튼
[참고](https://stackoverflow.com/questions/1629053/typing-the-enter-return-key-in-selenium)

In [100]:
from selenium.webdriver.common.keys import Keys

driver.find_element(By.ID,"input_search1667622936460").send_keys(Keys.RETURN)

#### 결과 가져오기

In [101]:
res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/bus-home/div[1]/div/fusion-search-list/fusion-bus-station-list/div/fusion-bus-station-item[1]/div/div/div/div[3]/span').text.split()
res[:2]

['서울특별시', '관악구']

#### 검색창 비우기

In [103]:
search_but = driver.find_element(By.XPATH, '//*[@id="input_search1667622936460"]')
search_but.clear()

### 반복문으로 모든 정류장의 위치 구하기

In [3]:
url = "https://map.naver.com/v5/bus/bus-station/?c=14136568.1676528,4516193.4217655,18,0,0,0,dh"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

In [104]:
import time
from tqdm.notebook import tqdm
from selenium.webdriver.common.keys import Keys

res_si = []
res_gu = []

for st in tqdm(stop_bus_list):
    search_but = driver.find_element(By.XPATH, '//*[@id="input_search1667622936460"]')
    search_but.send_keys(st)
   
    driver.find_element(By.ID,"input_search1667622936460").send_keys(Keys.RETURN) # enter 버튼
    time.sleep(3)
    
    try:
        res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/bus-home/div[1]/div/fusion-search-list/fusion-bus-station-list/div/fusion-bus-station-item[1]/div/div/div/div[3]/span').text.split()
    except:
        res = ['', '']
    
    res_si.append(res[0])
    res_gu.append(res[1])
    
    search_but = driver.find_element(By.XPATH, '//*[@id="input_search1667622936460"]')
    search_but.clear()
    time.sleep(0.5)
    
driver.close()
driver.quit()

  0%|          | 0/7131 [00:00<?, ?it/s]

#### bus_stop_add 으로 저장

In [105]:
bus_stop_add = pd.DataFrame({"stop":stop_bus_list, "si":res_si, "gu":res_gu})
bus_stop_add.to_excel('./bus_stop_add.xlsx', encoding='utf-8')
bus_stop_add

Unnamed: 0,stop,si,gu
0,서울대입구역,서울특별시,관악구
1,송암 종점,충청북도,청주시
2,위례중앙중학교,경기도,성남시
3,사기마을,전라남도,함평군
4,천태사,경상남도,양산시
...,...,...,...
7126,감포종점,경상북도,경주시
7127,오산1리.오미마을,충청남도,논산시
7128,여주대하차장,경기도,여주시
7129,신현마을,부산광역시,금정구


In [8]:
bus_stop_add = pd.read_excel('./bus_stop_add.xlsx',index_col=0)

si_list = ['서울특별시', '인천광역시', '경기도']

bus_stop_add[bus_stop_add['si'].isin(si_list)]

Unnamed: 0,stop,si,gu
2,위례중앙중학교,경기도,성남시
5,현대사원아파트,경기도,이천시
10,하장경로당,경기도,안성시
12,뇌조리,경기도,파주시
15,오이도역,경기도,시흥시
...,...,...,...
7121,광남중학교.광남고등학교.청소년수련관.태전지웰,경기도,광주시
7122,솔빛마을.서해.쌍용아파트,경기도,화성시
7123,한화리조트.종점,경기도,양평군
7125,신도아크라티움.농협,경기도,의정부시


### NULL값 다시 채우기

In [9]:
df_null = bus_stop_add[bus_stop_add["si"].isnull()]
df_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 9 to 7120
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   stop    986 non-null    object
 1   si      0 non-null      object
 2   gu      0 non-null      object
dtypes: object(3)
memory usage: 30.8+ KB


#### Stop_bus의 위도, 경도 정보를 활용할 예정, merge를 사용해 파일 합치기

In [10]:
Stop_bus = pd.concat([Stop_bus_2019, Stop_bus_2020, Stop_bus_2021])

st_df_merge = pd.merge(left=Stop_bus, right=df_null, left_on='Stop_NM', right_on='stop')
st_df_merge = st_df_merge.loc[:, ['ARS_ID', 'Lati', 'Long', 'stop', 'si', 'gu']]
st_df_merge.head()

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
0,1504,37.58765,126.9967,명륜새마을금고,,
1,1504,37.587653,126.996705,명륜새마을금고,,
2,1504,37.587653,126.996705,명륜새마을금고,,
3,2001,37.56498,126.98753,남대문세무서.서울백병원,,
4,2001,37.564985,126.98753,남대문세무서.서울백병원,,


In [11]:
search_stop = st_df_merge.drop_duplicates(['stop'], keep='first', inplace=False, ignore_index=True)
search_stop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844 entries, 0 to 843
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ARS_ID  844 non-null    object 
 1   Lati    844 non-null    float64
 2   Long    844 non-null    float64
 3   stop    844 non-null    object 
 4   si      0 non-null      object 
 5   gu      0 non-null      object 
dtypes: float64(2), object(4)
memory usage: 39.7+ KB


#### 검색은 문자열만 가능. 데이터타입 변환

In [12]:
search_stop['Lati'] = search_stop['Lati'].astype(str)
search_stop['Long'] = search_stop['Long'].astype(str)

search_stop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844 entries, 0 to 843
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ARS_ID  844 non-null    object
 1   Lati    844 non-null    object
 2   Long    844 non-null    object
 3   stop    844 non-null    object
 4   si      0 non-null      object
 5   gu      0 non-null      object
dtypes: object(6)
memory usage: 39.7+ KB


#### 구글지도에 검색해 위치정보 추출

In [13]:
url = "https://www.google.co.kr/maps?hl=ko&tab=rl"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

In [17]:
a = str(search_stop.loc[0, 'Lati'] + ", " +search_stop.loc[0, 'Long'])
a

'37.58765, 126.9967'

In [18]:
search_but = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
search_but.send_keys(a)

driver.find_element(By.ID,"searchboxinput").send_keys(Keys.RETURN) 

In [19]:
res = driver.find_element(By.XPATH, '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[10]/div/div[1]/span[3]/span[3]').text.split()
res[:2]

['서울특별시', '종로구']

In [24]:
search_but = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
search_but.clear()

In [25]:
si_list = []
gu_list = []

for idx, row in tqdm(search_stop[563:].iterrows()):
    search_text = str(row["Lati"] + ', ' + row["Long"])
    
    search_but = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
    search_but.send_keys(search_text)

    enter_but = driver.find_element(By.XPATH, '//*[@id="searchbox-searchbutton"]') 
    enter_but.click()
    time.sleep(5)
    
    res = driver.find_element(By.XPATH, '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[10]/div/div[1]/span[3]/span[3]').text.split()
    time.sleep(1)

    si_list.append(res[0])
            
    try:  
        gu_list.append(res[1])
    except:
        gu_list.append("")
    
    print(res)
  

    search_but = driver.find_element(By.XPATH, '//*[@id="searchboxinput"]')
    search_but.clear()
    
driver.close()
driver.quit()

0it [00:00, ?it/s]

['창원중앙역(종점)']
['경상남도', '창원시', '진해구', '장천동', '480-13']
['팡팡게임랜드']
['경상남도', '창원시', '의창구', '소답동', '141-20']
['경상북도', '경산시', '진량읍', '신상리', '1097-19']
['경산시']
['대구광역시']
['대구광역시', '동구', '해안동', '41-12']
['대구광역시']
['대구광역시']
['대구광역시']
['대구광역시', '북구', '학정동', '472-2']
['대구광역시', '북구', '검단동', '1393-11']
['대구광역시', '달서구', '장기동', '660-22']
['대구광역시', '수성구', '황금1동', '산72-1']
['대구광역시', '수성구', '황금1동', '984']
['대구광역시', '달성군', '화원읍', '설화리', '804-1']
['경상북도', '칠곡군', '지천면', '송정리', '1085-1']
['경상북도', '구미시', '산동면', '인덕리', '51-2']
['영천시']
['경상북도', '영천시', '금호읍', '냉천리', '300-11']
['계룡시']
['충청남도', '연기군', '금남면', '신촌리', '215-6']
['대전광역시']
['대전광역시', '서구', '기성동', '1019']
['경상남도', '김해시', '외동', '1264']
['부산광역시', '강서구', '가락동', '204-4']
['경상북도', '구미시', '선주원남동', '146-12']
['경상북도', '김천시', '평화남산동', '442-7']
['경상북도', '구미시', '원평1동', '964-193']
['구미시', '공단1동']
['경상북도', '구미시', '공단1동', '110-15']
['공주시']
['공주시']
['공주시']
['충청남도', '공주시', '유구읍', '녹천리', '295-5']
['공주시']
['공주시']
['충청남도', '공주시', '탄천면', '삼각리', '502-20']
['공주시']
['공주시']
['

#### 결과 저장

In [26]:
sigu_df = pd.DataFrame({'si':si_list, 'gu':gu_list})
sigu_df.to_csv('./sigu_df_2.csv')

In [30]:
sigu_df_1 = pd.read_csv('./sigu_df.csv', index_col=0)
sigu_df_2 = pd.read_csv('./sigu_df_2.csv', index_col=0)

search_sigu = pd.concat([sigu_df_1, sigu_df_2])
si_list = list(search_sigu['si'])
gu_list = list(search_sigu['gu'])

#### search_stop에 붙여넣기

In [35]:
search_stop['si'] = si_list
search_stop['gu'] = gu_list

search_stop

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
0,01504,37.58765,126.9967,명륜새마을금고,서울특별시,종로구
1,02001,37.56498,126.98753,남대문세무서.서울백병원,서울특별시,중구
2,02507,37.56736,126.97743,프레스센터,서울특별시,
3,02522,37.5517,127.01212,중구청소년수련관.남산타운2상가,서울특별시,중구
4,03183,37.53758,126.96708,문배동등기소앞,서울특별시,용산구
...,...,...,...,...,...,...
839,366000490,35.7465113,128.8602433,갈고개,경상북도,청도군
840,367000131,35.7300641,128.2710261,고령시외버스정류장,경상북도,고령군
841,368000003,35.8371216,128.3376229,용암-용정,경상북도,성주군
842,368000009,35.7936444,128.1631228,백운1리학발,성주군,


In [90]:
search_stop.to_csv('./search_stop.csv', encoding='utf-8')

### 오류 체크

In [38]:
search_stop['si'].unique()

array(['서울특별시', '성남시', '안양시', '부천시', '경기도', '고양시', '하남시', '부산광역시',
       '남포동6가', '당리동', '주례동', '경상남도', '동해', '인천광역시', '부평동', '수원시', '의정부시',
       '#111', '평택시', '동두천시', '안산시', '황해', '덕양구', '구리시', '남양주시', '와부읍',
       '오산시', '시흥시', '의왕시', '용인시', '파주시', '이천시', '김포시', '화성시', '광주시',
       '양주시', '포천시', '여주군', '가평군', '양평군', '광주광역시', '전라남도', '나주시', '담양군',
       '화순군', '대구광역시', '경상북도', '영천시', '경주시', '서면', '칠곡군', '대전광역시', '충청북도',
       '청주시', '청원군', '천안시', '연기군', '환경시설관리사업소(후문건너편)', '창원중앙역(종점)',
       '팡팡게임랜드', '경산시', '계룡시', '충청남도', '구미시', '공주시', '포항시', '울산광역시',
       '보은군', '옥천군', '논산시', '증평군', '세종,', '자여입구(회차)', '용원종점(회차)', '군위군',
       '의성군', '청도군', '성주군'], dtype=object)

#### 경기도 지역은 si="경기도", gu="시명" 으로 변경

In [40]:
search_stop[search_stop['gu']=='하남시']

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
249,28293,37.53898,127.22968,BRT환승센터.검단산입구,경기도,하남시
251,28452,37.51063,127.14745,효죽동교회,경기도,하남시
437,-,37.53895,127.23043,BRT공영버스차고지(경유),경기도,하남시
720,-,37.5361667,127.1952333,기점(경유),경기도,하남시


- 경기도와 시명이 제대로 입력된 것도 있다!

In [65]:
search_stop['gu'] = search_stop['gu'].fillna(0)
search_stop

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
0,01504,37.58765,126.9967,명륜새마을금고,서울특별시,종로구
1,02001,37.56498,126.98753,남대문세무서.서울백병원,서울특별시,중구
2,02507,37.56736,126.97743,프레스센터,서울특별시,0
3,02522,37.5517,127.01212,중구청소년수련관.남산타운2상가,서울특별시,중구
4,03183,37.53758,126.96708,문배동등기소앞,서울특별시,용산구
...,...,...,...,...,...,...
839,366000490,35.7465113,128.8602433,갈고개,경상북도,청도군
840,367000131,35.7300641,128.2710261,고령시외버스정류장,경상북도,고령군
841,368000003,35.8371216,128.3376229,용암-용정,경상북도,성주군
842,368000009,35.7936444,128.1631228,백운1리학발,성주군,0


In [72]:
for i, r in search_stop.iterrows():
    if r['si'] in (gg_list):
        search_stop.loc[i, ['si','gu']] = ['경기도' , r['si']]

#### 자잘한 것들은 직접 수정
##### (1) search_stop['si'] == '부평동'

In [73]:
search_stop['si'].unique()

array(['서울특별시', '경기도', '부산광역시', '남포동6가', '당리동', '주례동', '경상남도', '동해',
       '인천광역시', '부평동', '#111', '황해', '덕양구', '와부읍', '광주광역시', '전라남도', '나주시',
       '담양군', '화순군', '대구광역시', '경상북도', '영천시', '경주시', '서면', '칠곡군', '대전광역시',
       '충청북도', '청주시', '청원군', '천안시', '연기군', '환경시설관리사업소(후문건너편)',
       '창원중앙역(종점)', '팡팡게임랜드', '경산시', '계룡시', '충청남도', '구미시', '공주시', '포항시',
       '울산광역시', '보은군', '옥천군', '논산시', '증평군', '세종,', '자여입구(회차)', '용원종점(회차)',
       '군위군', '의성군', '청도군', '성주군'], dtype=object)

In [74]:
search_stop[search_stop['si'] == '부평동']

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
137,97050,37.49147,126.72532,부평역(구보건소),부평동,159-40번지


In [75]:
search_stop.loc[137, ['si', 'gu']] = ['인천광역시', '부평동']

##### (2) search_stop['si'] == '#111'

In [77]:
search_stop[search_stop['si'] == '#111']

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
188,13326,37.52262,126.76735,쌍용3차.부천테크노파크,#111,Unit


In [92]:
search_stop.loc[188, ['si', 'gu']] = ['경기도', '부천시']

##### (3) search_stop['si'] == '덕양구'

In [80]:
search_stop[search_stop['si'] == '덕양구']

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
216,57439,37.65377,126.89567,삼송역7번출구,덕양구,삼송동


In [81]:
search_stop.loc[216, ['si', 'gu']] = ['경기도', '고양시']

##### (3) search_stop['si'] == '황해'

In [87]:
search_stop[search_stop['si'] == '황해']

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
209,18100,37.2978,126.78438,계양전기,황해,0
212,18099,37.29773,126.7841,태화상운차고,황해,0
314,-,37.27377,126.67167,형도종점,황해,0
319,-,37.14378,126.6796,백미리항,황해,0
323,-,37.2743,126.82783,송산고,황해,0
638,38565,37.38903,126.63948,복합환승센타투모로우시티,황해,0
649,42104,37.5062,126.6482,율도선착장입구,황해,0
650,42961,37.52909,126.62334,청라동문굿모닝힐,황해,0
743,38640,37.397237,126.621446,힐스테이트레이크송도2차(207동),황해,0
745,89334,37.536366,126.627346,레이크블루,황해,0


In [88]:
search_stop.loc[209, ['si', 'gu']] = ['경기도', '안산시']
search_stop.loc[212, ['si', 'gu']] = ['경기도', '안산시']
search_stop.loc[314, ['si', 'gu']] = ['경기도', '화성시']
search_stop.loc[319, ['si', 'gu']] = ['경기도', '화성시']
search_stop.loc[323, ['si', 'gu']] = ['경기도', '화성시']
search_stop.loc[638, ['si', 'gu']] = ['인천광역시', '연수구']
search_stop.loc[649, ['si', 'gu']] = ['인천광역시', '서구']
search_stop.loc[650, ['si', 'gu']] = ['인천광역시', '서구']
search_stop.loc[743, ['si', 'gu']] = ['인천광역시', '연수구']
search_stop.loc[745, ['si', 'gu']] = ['인천광역시', '서구']
search_stop.loc[827, ['si', 'gu']] = ['인천광역시', '연수구']

### 서울, 경기, 인천 지역만 추출

In [99]:
search_stop_sgi = search_stop[search_stop['si'].isin(['서울특별시', '인천광역시', '경기도'])]
search_stop_sgi.reset_index(drop=True, inplace=True)
search_stop_sgi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ARS_ID  496 non-null    object
 1   Lati    496 non-null    object
 2   Long    496 non-null    object
 3   stop    496 non-null    object
 4   si      496 non-null    object
 5   gu      496 non-null    object
dtypes: object(6)
memory usage: 23.4+ KB


#### 구명 없는 데이터

In [109]:
search_stop_sgi = pd.read_csv('./search_stop_sgi.csv', index_col=0)
search_stop_sgi

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
0,01504,37.587650,126.996700,명륜새마을금고,서울특별시,종로구
1,02001,37.564980,126.987530,남대문세무서.서울백병원,서울특별시,중구
2,02507,37.567360,126.977430,프레스센터,서울특별시,0
3,02522,37.551700,127.012120,중구청소년수련관.남산타운2상가,서울특별시,중구
4,03183,37.537580,126.967080,문배동등기소앞,서울특별시,용산구
...,...,...,...,...,...,...
491,-,38.037400,126.910867,명진여객차고지(경유),경기도,연천군
492,36182,37.484009,126.618956,만석어린이공원,인천광역시,동구
493,37655,37.465944,126.667593,도화역광장,인천광역시,남구
494,38642,37.413821,126.616998,송도호반베르디움3차에듀시티,인천광역시,연수구


In [113]:
url = "https://map.naver.com/v5/?c=14141309.6655158,4506633.6114111,16,0,0,0,dh"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

In [116]:
text = str('37.567360' + ',' + '126.977430')
driver.find_element(By.XPATH, '//*[@id="input_search1667716168062"]').send_keys(text)
    
driver.find_element(By.ID,"input_search1667716168062").send_keys(Keys.RETURN)

In [129]:
res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/app-base/entry-layout/entry-address/div/div[2]/div/div[1]/div[1]/div[2]').text.split()
res[0]

'서울특별시'

In [145]:
for i, r in search_stop_sgi[search_stop_sgi['gu'] == '0'].iterrows():
    text = str(r['Lati']+ ", " + r['Long'])
    driver.find_element(By.XPATH, '//*[@id="input_search1667716168062"]').send_keys(text)
    
    driver.find_element(By.ID,"input_search1667716168062").send_keys(Keys.RETURN)
    time.sleep(3)
    
    try:
        res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/app-base/entry-layout/entry-address/div/div[2]/div/div[1]/div[1]/div[2]').text.split()
    except:
        continue
        
    if len(res) == 0:
        continue
    search_stop_sgi.loc[i, ['si', 'gu']] = [res[0], res[1]]

In [147]:
driver.close()
driver.quit()

In [148]:
search_stop_sgi

Unnamed: 0,ARS_ID,Lati,Long,stop,si,gu
0,01504,37.58765,126.9967,명륜새마을금고,서울특별시,종로구
1,02001,37.56498,126.98753,남대문세무서.서울백병원,서울특별시,중구
2,02507,37.56736,126.97743,프레스센터,서울특별시,중구
3,02522,37.5517,127.01212,중구청소년수련관.남산타운2상가,서울특별시,중구
4,03183,37.53758,126.96708,문배동등기소앞,서울특별시,용산구
...,...,...,...,...,...,...
491,-,38.0374,126.9108667,명진여객차고지(경유),경기도,연천군
492,36182,37.484009,126.618956,만석어린이공원,인천광역시,동구
493,37655,37.465944,126.667593,도화역광장,인천광역시,남구
494,38642,37.413821,126.616998,송도호반베르디움3차에듀시티,인천광역시,연수구


In [149]:
search_stop_sgi.to_csv('./search_stop_sgi.csv', encoding='utf-8')

In [None]:
search_stop_sgi.rename()

### bus_stop_add 에 search_stop_sgi 추가하기

In [151]:
stop_sgi = search_stop_sgi.loc[:, ['stop', 'si', 'gu']]
bus_stop_info = pd.merge(left=bus_stop_add, right=search_stop_sgi, on = 'stop')
bus_stop_info = bus_stop_info.loc[:, ['stop', 'si_y', 'gu_y']]
bus_stop_info.rename({'si_y':"si", "gu_y":"gu"})
bus_stop_info

Unnamed: 0,stop,si_y,gu_y
0,(구)현대백화점,인천광역시,부평구
1,판교세븐벤처벨리,경기도,성남시
2,기점대기(경유),경기도,고양시
3,소요산차고지(경유),경기도,동두천시
4,열우물보조경기장주차장,인천광역시,부평구
...,...,...,...
491,천호역.현대백화점,서울특별시,강동구
492,경복대학,경기도,남양주시
493,광릉내출발지(경유),경기도,남양주시
494,홍대입구역(중),서울특별시,마포구


In [152]:
bus_stop_info.to_excel('./bus_stop_info.xlsx', encoding='utf-8')

----
# 3 전철역 자료
### 1. 자료의 범위
#### <지역>
- 수도권: 서울, 인천, 경기도
- 부울권: 부산, 울산, 양산, 김해, 창원, 경주
- 대구권: 대구, 구미, 경산, 영천, 군위, 청도, 고령, 성주, 칠곡, 창녕
- 광주권: 광주, 나주, 담양, 화순, 함평, 장성
- 대전권: 대전, 세종, 공주, 논산, 계룡, 금산, 청주, 보은, 옥천

#### <시간>
- 2019-2021년 3월 중 평일 1일

### 2. 자료 내용
- Stop : 대중교통 정차지 정보
- Xfer : 도시철도 환승 정보
- Route : 대중교통 노선 기초정보
- RouteStop : 대중교통 노선별 경유정차지 정보
- StopTime : 대중교통 노선별, 운행회차별 정차순번 및 도착/출발시각 정보

In [7]:
Stop_rail_2019 = pd.read_excel("./2019.03/201903_2_도시철도.xlsx", sheet_name=0)
Xfer_rail_2019 = pd.read_excel("./2019.03/201903_2_도시철도.xlsx", sheet_name=1)
Route_rail_2019 = pd.read_excel("./2019.03/201903_2_도시철도.xlsx", sheet_name=2)
RouteStop_rail_2019 = pd.read_excel("./2019.03/201903_2_도시철도.xlsx", sheet_name=3)
StopTime_rail_2019 = pd.read_excel("./2019.03/201903_2_도시철도.xlsx", sheet_name=4)

Stop_rail_2020 = pd.read_excel("./2020.03/202003_2_도시철도.xlsx", sheet_name=0)
Xfer_rail_2020 = pd.read_excel("./2020.03/202003_2_도시철도.xlsx", sheet_name=1)
Route_rail_2020 = pd.read_excel("./2020.03/202003_2_도시철도.xlsx", sheet_name=2)
RouteStop_rail_2020 = pd.read_excel("./2020.03/202003_2_도시철도.xlsx", sheet_name=3)
StopTime_rail_2020 = pd.read_excel("./2020.03/202003_2_도시철도.xlsx", sheet_name=4)

Stop_rail_2021 = pd.read_excel("./2021.03/202103_2_도시철도.xlsx", sheet_name=0)
Xfer_rail_2021 = pd.read_excel("./2021.03/202103_2_도시철도.xlsx", sheet_name=1)
Route_rail_2021 = pd.read_excel("./2021.03/202103_2_도시철도.xlsx", sheet_name=2)
RouteStop_rail_2021 = pd.read_excel("./2021.03/202103_2_도시철도.xlsx", sheet_name=3)
StopTime_rail_2021 = pd.read_excel("./2021.03/202103_2_도시철도.xlsx", sheet_name=4)

### (1) 위도, 경도를 검색해 역의 위치정보 수집

#### (1)-1 하나씩 해보기

In [37]:
Stop_rail = pd.concat([Stop_rail_2019, Stop_rail_2020, Stop_rail_2021])

In [12]:
Stop_rail['Lati'] = Stop_rail['Lati'].astype(str)
Stop_rail['Long'] = Stop_rail['Long'].astype(str)

In [11]:
url = "https://map.naver.com/v5/?c=14141309.6655158,4506633.6114111,16,0,0,0,dh"
chrome_options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=Service('../driver/chromedriver.exe'), options=chrome_options)
driver.get(url)

In [23]:
t = str('37.55587'+ ", " + '126.9721')
driver.find_element(By.XPATH, '//*[@id="input_search1667796607440"]').send_keys(t)
driver.find_element(By.ID,"input_search1667796607440").send_keys(Keys.RETURN)

In [24]:
res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/app-base/entry-layout/entry-address/div/div[2]/div/div[1]/div[1]/div[2]').text
res

'서울특별시 중구 봉래동2가 122-25'

In [None]:
driver.find_element(By.ID,"input_search1667796607440").clear()

#### (1)-2 반복문으로 모든 역의 위치 수집

In [27]:
Stop_rail['si'] = "0"
Stop_rail['gu'] = "0"

for idx, row in tqdm(Stop_rail.iterrows()):
    t = str(row['Lati'] + ", " + row['Long'])
    driver.find_element(By.XPATH, '//*[@id="input_search1667796607440"]').send_keys(t)
    driver.find_element(By.ID,"input_search1667796607440").send_keys(Keys.RETURN)
    time.sleep(2)
    
    try:
        res = driver.find_element(By.XPATH, '//*[@id="container"]/shrinkable-layout/div/app-base/entry-layout/entry-address/div/div[2]/div/div[1]/div[1]/div[2]').text.split()
     
    except:
        res = ["1", "1"]
        
    Stop_rail.loc[idx, 'si'] = res[0]
    Stop_rail.loc[idx, 'gu'] = res[1]
        
        
    driver.find_element(By.ID,"input_search1667796607440").clear()

0it [00:00, ?it/s]

In [None]:
driver.close()
driver.quit()

### (2) 결과를 Stop_rail 로 저장

In [36]:
Stop_rail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3022 entries, 0 to 1016
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Data_SEQ  3022 non-null   int64 
 1   Month     3022 non-null   int64 
 2   Stop_ID   3022 non-null   object
 3   Stop_NM   3022 non-null   object
 4   Gate      3022 non-null   object
 5   Lati      3022 non-null   object
 6   Long      3022 non-null   object
 7   si        3022 non-null   object
 8   gu        3022 non-null   object
dtypes: int64(2), object(7)
memory usage: 300.6+ KB


In [29]:
Stop_rail.to_excel('./Stop_rail.xlsx', encoding='utf-8')

### (3) 중복값 제거 & 지역정보 없는 데이터 수정
- 손으로 직접 수정

In [63]:
Stop_rail = pd.read_excel('./Stop_rail.xlsx', index_col=0)

In [65]:
Stop_rail.sort_values(by='Stop_NM', inplace=True)
Stop_rail.reset_index(drop=True, inplace=True)

Stop_rail_d = Stop_rail.drop_duplicates(['Lati', 'Long'], keep='first', inplace=False, ignore_index=True)
Stop_rail_d[Stop_rail_d['si'] == '1']

Unnamed: 0,Data_SEQ,Month,Stop_ID,Stop_NM,Gate,Lati,Long,si,gu
133,62,201903,RS_ACC1_S-1-0310,구파발,Y,37.63676,126.91882,1,1
206,141,202003,RS_ACC1_S-1-1021,녹천,Y,37.6448,127.05127,1,1
439,615,202003,RS_ACC1_S-1-4129,봉은사,Y,37.514219,127.060245,1,1
471,36,202103,RS_ACC1_S-1-0226,사당,Y,37.47653,126.98162,1,1
811,959,202103,RS_ACC1_S-3-0325,원대,Y,35.88784,128.57436,1,1
911,615,201903,RS_ACC1_S-1-4130,종합운동장,Y,37.511699,127.076879,1,1
985,53,201903,RS_ACC1_S-1-0243,충정로(경기대입구),Y,37.55974,126.96447,1,1


In [48]:
Stop_rail.loc[34, ['si', 'gu']] = ['서울특별시', '동작구']
Stop_rail.loc[52, ['si', 'gu']] = ['서울특별시', '서대문구']
Stop_rail.loc[59, ['si', 'gu']] = ['서울특별시', '은평구']
Stop_rail.loc[126, ['si', 'gu']] = ['서울특별시', '노원구']
Stop_rail.loc[850, ['si', 'gu']] = ['대구광역시', '수성동']

### (4) 서울, 경기, 인천에 위치한 역만 추출하여 Stop_rail_d 로 저장

In [51]:
si_list = ['서울특별시', '인천광역시', '경기도']
Stop_rail_d = Stop_rail_d[Stop_rail_d['si'].isin(si_list)]
Stop_rail_d.drop('Data_SEQ', axis=1, inplace=True)
Stop_rail_d.reset_index(drop=True, inplace=True)
Stop_rail_d

Unnamed: 0,Month,Stop_ID,Stop_NM,Gate,Lati,Long,si,gu
0,201903,RS_ACC1_S-1-0150,서울역(지하),Y,37.555870,126.972100,서울특별시,중구
1,201903,RS_ACC1_S-1-0151,시청,Y,37.565710,126.977120,서울특별시,중구
2,201903,RS_ACC1_S-1-0152,종각,Y,37.570160,126.982920,서울특별시,종로구
3,201903,RS_ACC1_S-1-0153,종로3가,Y,37.570420,126.992110,서울특별시,종로구
4,201903,RS_ACC1_S-1-0154,종로5가,Y,37.570930,127.001850,서울특별시,종로구
...,...,...,...,...,...,...,...,...
624,202103,RS_ACC1_S-1-2563,미사,Y,37.563053,127.192935,경기도,하남시
625,202103,RS_ACC1_S-1-2564,하남풍산,Y,37.552150,127.203927,경기도,하남시
626,202103,RS_ACC1_S-1-2565,하남시청,Y,37.541913,127.206404,경기도,하남시
627,202103,RS_ACC1_S-1-2566,하남검단산,Y,37.539746,127.223271,경기도,하남시


In [53]:
Stop_rail_d.to_excel('./Stop_rail_d.xlsx', encoding='utf-8')