In [1]:
# 山的基本資訊:
# 名稱(name)、海拔(elevation)、介紹(introduction)、地址or區域(region)
# 匯入模組
from time import sleep
from datetime import datetime, timedelta
from tqdm import tqdm
import random
import os
import json
import requests
from bs4 import BeautifulSoup
import pygsheets
import pandas as pd
from openpyxl import load_workbook

# HTTP headers sent with every request below: a desktop-Chrome User-Agent string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}

In [2]:
# Authorize pygsheets with the service-account key file.
gc = pygsheets.authorize(service_file='ccclub_api_key.json')

# Open the target Google spreadsheet by its URL; `sht` is used by the cells below.
sht = gc.open_by_url('https://docs.google.com/spreadsheets/d/1t7U4pvptpD_cfTMD4tUwDeCffAopPQF2hJEt386J18A/')

In [3]:
# Upsert rows into a Google Sheets worksheet (de-duplicate + sort).
def append_to_excel(new_data, sheet_name, pk):
    """Merge ``new_data`` into the worksheet titled ``sheet_name`` of ``sht``.

    Despite the name, this writes to the Google spreadsheet held in the
    module-level ``sht`` object, not to an Excel file.

    Args:
        new_data: pandas DataFrame with the rows to add. ``None`` or an empty
            frame is a no-op (nothing is read from or written to the sheet).
        sheet_name: Title of the worksheet; it is created when it does not exist.
        pk: Name of the primary-key column. When the merged data has this
            column, rows are de-duplicated on it (the newest row wins) and
            sorted by it; otherwise the rows are written as-is.

    Returns:
        None.

    Raises:
        Any pygsheets error other than "worksheet not found" is propagated
        instead of being mistaken for a missing worksheet.
    """
    # Nothing to merge: skip the remote read/write round-trip entirely.
    if new_data is None or new_data.empty:
        return

    try:
        ws = sht.worksheet_by_title(sheet_name)
    except pygsheets.WorksheetNotFound:
        # Only a genuinely missing worksheet is created; other failures
        # (network, auth, quota) must surface to the caller.
        ws = sht.add_worksheet(sheet_name)
        existing_df = pd.DataFrame()
    else:
        existing_df = pd.DataFrame(ws.get_all_records())

    # Existing rows first so that keep='last' prefers the freshly scraped rows.
    combined_df = pd.concat([existing_df, new_data], ignore_index=True)

    # De-duplicate and sort only when the key column is actually present;
    # drop_duplicates(subset=[pk]) raises KeyError otherwise.
    if pk in combined_df.columns:
        combined_df = (
            combined_df
            .drop_duplicates(subset=[pk], keep='last')
            .sort_values(by=pk)
        )

    # Write the merged table back starting at cell A1, blanking out NaN values.
    ws.set_dataframe(combined_df, (1, 1), nan='')

In [4]:
# Worksheet titles in the Google spreadsheet, each paired with its primary-key column.
sheet_name, pk = 'mountain_information', '山岳ID'
sheet_name2, pk2 = 'trail_id', '步道ID'

In [5]:
# Build the mountain-ID index: fetch the first page of the mountain listing.
allMid_url = "https://hiking.biji.co/index.php?q=mountain"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as a listing.
idx_r = requests.get(allMid_url, headers=header, timeout=30)
idx_r.raise_for_status()
idx_soup = BeautifulSoup(idx_r.text, 'html.parser')

# Find the highest page number among the pagination links.
allPage_num = idx_soup.select('a[class="page-item grid place-content-center p-1.5 bg-white text-gray-600 border border-gray-300 rounded"]')
# Start at 1: a listing without pagination links still has its first page.
max_page = 1
for allPage in allPage_num:
    # An anchor without an href yields None; treat it as "no page number".
    allPage_href = allPage.get('href') or ''
    if "page=" not in allPage_href:
        continue
    # Cut at '&' so a trailing query parameter cannot break the int() parse.
    page_token = allPage_href.split('page=')[1].split('&')[0]
    if page_token.isdecimal():
        max_page = max(max_page, int(page_token))
print(max_page)

154


In [7]:
# Crawl every listing page, scrape each linked mountain's detail page, and
# upsert the results into the Google Sheet once per listing page.
allMid_base_url = "https://hiking.biji.co/index.php?q=mountain&page="
base_url = "https://hiking.biji.co/index.php?q=mountain&act=detail&id="
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the crawl


def _id_from_href(href):
    """Return the integer that follows ``id=`` in ``href``, or None if there is none."""
    if not href or 'id=' not in href:
        return None
    # Cut at '&' so a trailing query parameter cannot break the int() parse.
    token = href.split('id=')[1].split('&')[0]
    return int(token) if token.isdecimal() else None


def _text_of(node, selector):
    """Return the text of the first ``selector`` match under ``node``, or '' if absent."""
    found = node.select_one(selector)
    return found.text if found is not None else ''


def _parse_mountain_page(soup, mountain_id):
    """Extract one mountain record and its linked trails from a parsed detail page.

    Args:
        soup: BeautifulSoup document of the mountain detail page.
        mountain_id: Integer ID of the mountain (stored under '山岳ID').

    Returns:
        A ``(record, trails)`` tuple: ``record`` is a flat dict of the mountain's
        fields, ``trails`` is a list of ``{'步道ID': int, '步道名稱': str}`` dicts.
    """
    # Part 1: name, vote counters and category from the header board.
    mountain_name = {'山岳ID': mountain_id}
    for board0 in soup.select('div[class="space-y-2.5"]'):
        mountain_name['山岳名稱'] = _text_of(board0, 'h1')

        for vote in board0.select('li[class = "px-2.5"]'):
            # Each counter is expected to read "<number> <label>"; skip anything else.
            parts = vote.text.split()
            if len(parts) < 2:
                continue
            count = parts[0].replace(',', '')
            if count.isdecimal():
                mountain_name[parts[1]] = int(count)

        # Category is the text of the first link in the board ('' when absent).
        mountain_name['類別'] = _text_of(board0, 'a')

    # Part 2: introduction text.
    mountain_intro = {'簡介': _text_of(soup, 'div[class="leading-relaxed"]')}

    # Part 3: basic facts given as <dt>/<dd> pairs.
    mountain_info = {}
    for board2 in soup.select('div[class="flex-1 flex"]'):
        title_node = board2.select_one('dt')
        value_node = board2.select_one('dd')
        if title_node is None or value_node is None:
            continue
        mountain_info[title_node.text] = value_node.text.replace('\n', '').strip()

    # Part 4: multi-line facts, joined into one comma-separated string.
    all_boards3 = soup.select('div[class="flex"]')
    mountain_info2 = {}
    for board3 in all_boards3:
        title_node = board3.select_one('dt')
        value_node = board3.select_one('dd')
        if title_node is None or value_node is None:
            continue
        pieces = [line.strip() for line in value_node.text.split('\n') if line]
        mountain_info2[title_node.text] = ','.join(pieces)

    # Extra data: trails linked from the same boards. Anchors without an ID are
    # skipped because a row without its primary key cannot be upserted correctly.
    trails = []
    for board3 in all_boards3:
        for anchor in board3.select('a'):
            trail_id = _id_from_href(anchor.get('href'))
            if trail_id is None:
                continue
            trails.append({'步道ID': trail_id, '步道名稱': anchor.text})

    # Merge in a fixed order so the column order stays stable.
    record = {**mountain_name, **mountain_info, **mountain_info2, **mountain_intro}
    return record, trails


all_idx_id = []
seen_ids = set()  # O(1) membership test; all_idx_id keeps discovery order
for p in tqdm(range(1, max_page + 1)):

    sleep(1)  # be polite to the server between listing pages

    allMid_page_url = allMid_base_url + str(p)
    P_idx_r = requests.get(allMid_page_url, headers=header, timeout=REQUEST_TIMEOUT)
    P_idx_r.raise_for_status()
    P_idx_soup = BeautifulSoup(P_idx_r.text, 'html.parser')

    page_mountains = []
    page_trails = []
    for idx_content in P_idx_soup.select('a[class="text-current"]'):
        idx_id = _id_from_href(idx_content.get("href"))
        # Skip non-mountain links and IDs already scraped (the same mountain
        # can be linked more than once), so each detail page is fetched once.
        if idx_id is None or idx_id in seen_ids:
            continue
        seen_ids.add(idx_id)
        all_idx_id.append(idx_id)

        sleep(1)  # be polite to the server between detail pages

        # Fetch the mountain's detail page.
        url = base_url + str(idx_id)
        r = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        if not r.ok:
            # Do not parse an error page as a mountain; report it and move on.
            tqdm.write(f"skip mountain {idx_id}: HTTP {r.status_code}")
            continue
        soup = BeautifulSoup(r.text, 'html.parser')

        record, trails = _parse_mountain_page(soup, idx_id)
        if not record.get('山岳名稱'):
            # No name found: the page is not a usable detail page, and writing
            # it could overwrite a good row already in the sheet.
            tqdm.write(f"skip mountain {idx_id}: no mountain name found")
            continue
        page_mountains.append(record)
        page_trails.extend(trails)

    # One sheet round-trip per listing page instead of per mountain: each
    # append_to_excel call re-reads and rewrites the whole worksheet.
    mountain_df = pd.DataFrame(page_mountains)
    trail_df = pd.DataFrame(page_trails)
    if not mountain_df.empty:
        append_to_excel(mountain_df, sheet_name, pk)
    if not trail_df.empty:
        append_to_excel(trail_df, sheet_name2, pk2)

print(all_idx_id)
print(len(all_idx_id))

100%|██████████| 1/1 [01:23<00:00, 83.74s/it]

[103, 106, 684, 685, 686, 687, 688, 7054, 7055, 7207, 7057, 701, 6555, 6554, 7056, 7208, 7209, 102, 6569, 6570]
20



