#### Packages

In [3]:
import os
import re
import sys
import time  # 強制等待
import requests
import numpy as np
import pandas as pd  # 載入 pandas
from bs4 import BeautifulSoup

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options  # 設定 driver 的行為
from selenium.webdriver.support.ui import Select  # 選擇＂下拉式選單＂
from selenium.webdriver.common.keys import Keys  # 鍵盤操作
from selenium.common.exceptions import NoSuchElementException, TimeoutException  # 載入常見錯誤
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # 更改載入策略
from selenium.webdriver.support.ui import WebDriverWait  # 等待機制
from selenium.webdriver.support import expected_conditions as EC  # 預期事件
from selenium.webdriver.common.by import By  # 找尋元素的方法

#### 環境設置

In [39]:
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get('GOOGLE_CHROME_BIN')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')

# chrome_options.add_argument('disable-dev-shm-usage')
# chrome_options.add_argument('--no-sandbox')
chrome_capabilities = DesiredCapabilities.CHROME
chrome_capabilities['pageLoadStrategy'] = 'eager'  # 頁面加載策略：HTML 解析成 DOM
def get_chrome():
    return webdriver.Chrome(options=chrome_options,
                            desired_capabilities=chrome_capabilities)

In [40]:

def wait_for_element_clickable(driver, element_position, waiting_time=5, by=By.LINK_TEXT):
    function = sys._getframe().f_code.co_name
    alert_execution_report(function)
    print_all_arguments(locals())
    try:
        node_off()
        element = WebDriverWait(driver, waiting_time).until(EC.element_to_be_clickable((by, element_position)))
    except Exception as e:
        alert_exception_report(function, e)
        return False
    else:
        return element

def accurately_find_table_and_read_it(driver, table_position, table_index=0):
    function = sys._getframe().f_code.co_name
    alert_execution_report(function)
    print_all_arguments(locals())
    try:
        node_off()
        if not wait_for_element_present(driver, table_position):
            alert_exception_report(function, 'not found table')
            return
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table_innerHTML = soup.select(table_position)[table_index]
        tgt = pd.read_html(str(table_innerHTML), encoding='utf-8')[0]
        # tgt['圖書館'], tgt['連結'] = org, driver.current_url
    except Exception as e:
        alert_exception_report(function, e)
        return
    else:
        return tgt

def get_all_tgt_urls(driver, link_text):
    function = sys._getframe().f_code.co_name
    alert_execution_report(function)
    print_all_arguments(locals())
    
    tgt_urls = []

    anchors = driver.find_elements_by_link_text(link_text)
    for anchor in anchors:
        tgt_urls.append(anchor.get_attribute('href'))
    
    return tgt_urls

### 開始爬蟲

##### 上市公司代號

In [11]:
id_Ltd = [1101, 1102, 1103, 1104, 1108, 1109, 1110, 1201, 1203, 1210, 
          1213, 1215, 1216, 1217 ,1218 ,1219, 1220, 1225, 1227, 1229, 
          1231, 1232, 1233, 1234, 1235, 1236, 1256, 1301, 1303, 1304, 
          1305, 1307, 1308, 1309, 1310, 1312, 1313, 1314, 1315, 1316, 
          1319, 1321, 1323, 1324, 1325, 1326, 1339, 1341, 1342, 1402, 
          1409, 1410, 1413, 1414, 1416, 1417, 1418, 1419, 1423, 1432, 
          1434, 1435, 1436, 1437, 1438, 1439, 1440, 1441, 1442, 1443, 
          1444, 1445, 1446, 1447, 1449, 1451, 1452, 1453, 1454, 1455, 
          1456, 1457, 1459, 1460, 1463, 1464, 1465, 1466, 1467, 1468, 
          1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1503, 1504, 
          1506, 1512, 1513, 1514, 1515, 1516, 1517, 1519, 1521, 1522, 
          1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 
          1535, 1536, 1537, 1538, 1539, 1540, 1541, 1558, 1560, 1568, 
          1582, 1583, 1587, 1589, 1590, 1597, 1598, 1603, 1604, 1605, 
          1608, 1609, 1611, 1612, 1614, 1615, 1616, 1617, 1618, 1701, 
          1702, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1717, 
          1718, 1720, 1721, 1722, 1723, 1725, 1726, 1727, 1730, 1731, 
          1732, 1733, 1734, 1735, 1736, 1737, 1752, 1760, 1762, 1773, 
          1776, 1783, 1786, 1789, 1795, 1802, 1805, 1806, 1808, 1809, 
          1810, 1817, 1903, 1904, 1905, 1906, 1907, 1909, 2002, 2006, 
          2007, 2008, 2009, 2010, 2012, 2013, 2014, 2015, 2017, 2020,
          2022, 2023, 2024, 2025, 2027, 2028, 2029, 2030, 2031, 2032, 
          2033, 2034, 2038, 2049, 2059, 2062, 2069, 2101, 2102, 2103, 
          2104, 2105, 2106, 2107, 2108, 2109, 2114, 2115, 2201, 2204, 
          2206, 2207, 2208, 2211, 2227, 2228, 2231, 2233, 2239, 2241, 
          2247, 2250, 2301, 2303, 2305, 2308, 2312, 2313, 2314, 2316, 
          2317, 2321, 2323, 2324, 2327, 2328, 2329, 2330, 2331, 2332, 
          2337, 2338, 2340, 2342, 2344, 2345, 2347, 2348, 2349, 2351, 
          2352, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2362, 2363, 
          2364, 2365, 2367, 2368, 2369, 2371, 2373, 2374, 2375, 2376, 
          2377, 2379, 2380, 2382, 2383, 2385, 2387, 2388, 2390, 2392, 
          2393, 2395, 2397, 2399, 2401, 2402, 2404, 2405, 2406, 2408, 
          2409, 2412, 2413, 2414, 2415, 2417, 2419, 2420, 2421, 2423, 
          2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2433, 2434, 
          2436, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2449, 2450, 
          2451, 2453, 2454, 2455, 2457, 2458, 2459, 2460, 2461, 2462, 
          2464, 2465, 2466, 2467, 2471, 2472, 2474, 2476, 2477, 2478, 
          2480, 2481, 2482, 2483, 2484, 2485, 2486, 2488, 2489, 2491, 
          2492, 2493, 2495, 2496, 2497, 2498, 2501, 2504, 2505, 2506, 
          2509, 2511, 2514, 2515, 2516, 2520, 2524, 2527, 2528, 2530, 
          2534, 2535, 2536, 2537, 2538, 2539, 2540, 2542, 2543, 2545, 
          2546, 2547, 2548, 2597, 2601, 2603, 2605, 2606, 2607, 2608, 
          2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 
          2630, 2633, 2634, 2636, 2637, 2642, 2701, 2702, 2704, 2705, 
          2706, 2707, 2712, 2722, 2723, 2727, 2731, 2739, 2748, 2753, 
          2801, 2809, 2812, 2816, 2820, 2832, 2834, 2836, 2838, 2845, 
          2849, 2850, 2851, 2852, 2867, 2880, 2881, 2882, 2883, 2884, 
          2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2897, 2901, 
          2903, 2904, 2905, 2906, 2908, 2910, 2911, 2912, 2913, 2915, 
          2923, 2939, 2945, 3002, 3003, 3004, 3005, 3006, 3008, 3010, 
          3011, 3013, 3014, 3015, 3016, 3017, 3018, 3019, 3021, 3022, 
          3023, 3024, 3025, 3026, 3027, 3028, 3029, 3030, 3031, 3032, 
          3033, 3034, 3035, 3036, 3037, 3038, 3040, 3041, 3042, 3043, 
          3044, 3045, 3046, 3047, 3048, 3049, 3050, 3051, 3052, 3054, 
          3055, 3056, 3057, 3058, 3059, 3060, 3062, 3090, 3092, 3094, 
          3130, 3138, 3149, 3164, 3167, 3189, 3209, 3229, 3231, 3257, 
          3266, 3296, 3305, 3308, 3311, 3312, 3321, 3338, 3346, 3356, 
          3376, 3380, 3406, 3413, 3416, 3419, 3432, 3437, 3443, 3450, 
          3454, 3481, 3494, 3501, 3504, 3515, 3518, 3528, 3530, 3532, 
          3533, 3535, 3543, 3545, 3550, 3557, 3563, 3576, 3583, 3588, 
          3591, 3592, 3593, 3596, 3605, 3607, 3617, 3622, 3645, 3652, 
          3653, 3661, 3665, 3669, 3673, 3679, 3682, 3686, 3694, 3701, 
          3702, 3703, 3704, 3705, 3706, 3708, 3711, 3712, 3714, 3990, 
          4104, 4106, 4108, 4119, 4133, 4137, 4142, 4148, 4155, 4164, 
          4306, 4414, 4426, 4438, 4440, 4526, 4532, 4536, 4540, 4545, 
          4551, 4555, 4560, 4562, 4564, 4566, 4572, 4576, 4581, 4720, 
          4722, 4736, 4737, 4739, 4746, 4755, 4764, 4766, 4770, 4807, 
          4904, 4906, 4912, 4915, 4916, 4919, 4927, 4930, 4934, 4935, 
          4938, 4942, 4952, 4956, 4960, 4961, 4967, 4968, 4976, 4989, 
          4994, 4999, 5007, 5203, 5215, 5222, 5225, 5234, 5243, 5244, 
          5258, 5269, 5283, 5284, 5285, 5306, 5388, 5434, 5469, 5471, 
          5484, 5515, 5519, 5521, 5522, 5525, 5531, 5533, 5534, 5538, 
          5607, 5608, 5706, 5876, 5880, 5907, 6005, 6024, 6108, 6112, 
          6115, 6116, 6117, 6120, 6128, 6133, 6136, 6139, 6141, 6142, 
          6152, 6153, 6155, 6164, 6165, 6166, 6168, 6176, 6177, 6183, 
          6184, 6189, 6191, 6192, 6196, 6197, 6201, 6202, 6205, 6206, 
          6209, 6213, 6214, 6215, 6216, 6224, 6225, 6226, 6230, 6235, 
          6239, 6243, 6257, 6269, 6271, 6277, 6278, 6281, 6282, 6283, 
          6285, 6288, 6405, 6409, 6412, 6414, 6416, 6426, 6431, 6438, 
          6442, 6443, 6446, 6449, 6456, 6464, 6472, 6477, 6491, 6504, 
          6505, 6515, 6525, 6531, 6533, 6552, 6558, 6573, 6579, 6581, 
          6582, 6591, 6592, 6605, 6625, 6641, 6655, 6666, 6668, 6669, 
          6670, 6671, 6672, 6674, 6691, 6698, 6706, 6715, 6719, 6743, 
          6754, 6756, 6768, 6770, 6776, 6790, 6792, 6806, 8011, 8016, 
          8021, 8028, 8033, 8039, 8046, 8070, 8072, 8081, 8101, 8103, 
          8104, 8105, 8110, 8112, 8114, 8131, 8150, 8163, 8201, 8210, 
          8213, 8215, 8222, 8249, 8261, 8271, 8341, 8367, 8374, 8404, 
          8411, 8422, 8429, 8438, 8442, 8443, 8454, 8462, 8463, 8464, 
          8467, 8473, 8476, 8478, 8926, 8940, 8996, 9802, 9902, 9904, 
          9905, 9906, 9907, 9908, 9910, 9911, 9912, 9914, 9917, 9918, 
          9919, 9921, 9924, 9925, 9926, 9927, 9928, 9929, 9930, 9931, 
          9933, 9934, 9935, 9937, 9938, 9939, 9940, 9941, 9942, 9943, 
          9944, 9945, 9946, 9955, 9958]

id_Ltd = [str(id) for id in id_Ltd]

#### 整理爬到的資料

In [32]:
def clean_company_data(company_data):
    """
    Remove "\n" characters from the keys and values in the given company data dictionary.

    Args:
    - company_data (dict): A dictionary containing company data where values may contain "\n" characters.

    Returns:
    - dict: A new dictionary with "\n" characters removed from the values.
    """
    cleaned_data = {key.replace("\n", ""): value.replace("\n", "") for key, value in company_data.items()}
    return cleaned_data

#### 分析公司的表面功夫(?)

In [66]:
speech_keywords = ["勞資協調", "勞資溝通", "意見信箱", "意見箱", "員工意見", "言論自由", "資料隱私", "商業機密保護", "客戶隱私權", "資料隱私", "NDA", "保密協議", "營業秘密", "機密資訊"]
train_keywords = ["外訓", "外部訓練", "員工培訓", "教育訓練", "進修", "內訓"]
occuhealth_keywords = ["45001", "18001", "18000", "職業安全衛生管理系統"]
salary_keywords = ['員工分紅', "員工貢獻程度", "員工獎金", "年度獎金", "工作績效"]
consumer_neg_keywords = ["產品責任險", "產品責任保險", "產品標示", "產品行銷與標示", "RoHS", "如實標註", "SGS"]
consumer_pos_keywords = ["有害物質流程管理", "輻射檢測"]
supplier_esg_keywords = ["供應商", "EICC", "責任商業聯盟行為準則", "RBA準則", "供應鏈"]
risk_keywords = ["風險管理", "財務風險", "資訊風險", "風險管控", "匯率風險", "31000"]
nocorrupt_keywords = ["貪腐", "反貪", "廉潔", "舞弊", "37001"]
conservation_keywords = ["保育", "淨灘", "生態", "生物多樣性", "種樹", "動物保護"]
social_keywords = ["婦女保障", "鄰里關懷", "兩性平等", "生育補助", "兩性工作平等"]

In [65]:
def policy_info_df(data_dict, id_Ltd, 
                speech_keywords, train_keywords, occuhealth_keywords, consumer_neg_keywords, social_keywords, salary_keywords, 
                consumer_pos_keywords, supplier_esg_keywords, risk_keywords, nocorrupt_keywords, conservation_keywords):
    datalist = []
    for idx, company_data in enumerate(data_dict, start=1):
        free_speech = 1 if any(word in value for value in company_data.values() for word in speech_keywords) else 0
        staff_train = 1 if any(word in value for value in company_data.values() for word in train_keywords) else 0
        occupation_health = 1 if any(word in value for value in company_data.values() for word in occuhealth_keywords) else 0
        consumer_neg = 1 if any(word in value for value in company_data.values() for word in consumer_neg_keywords) else 0    
        consumer_pos = 1 if any(word in value for value in company_data.values() for word in consumer_pos_keywords) else 0
        supplier_esg = 1 if any(word in value for value in company_data.values() for word in supplier_esg_keywords) else 0 
        risk_management = 1 if any(word in value for value in company_data.values() for word in risk_keywords) else 0
        social = 1 if any(word in value for value in company_data.values() for word in social_keywords) else 0
        salary = 1 if any(word in value for value in company_data.values() for word in salary_keywords) else 0
        nocorrupt = 1 if any(word in value for value in company_data.values() for word in nocorrupt_keywords) else 0
        conservation = 1 if any(word in value for value in company_data.values() for word in conservation_keywords) else 0          
        datalist.append({'代號': id_Ltd[idx - 1], 
                    'free_speech': free_speech, 'staff_train': staff_train, 
                    'occupation_health': occupation_health, 'consumer_neg': consumer_neg, 'salary': salary, 
                    'consumer_pos': consumer_pos, 'supplier_esg': supplier_esg, 'social': social,
                    'risk_management': risk_management, 'nocorrupt': nocorrupt, 'conservation': conservation
                    })    
    return pd.DataFrame(datalist)

##### 110

In [45]:
# Set up Selenium WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://mops.twse.com.tw/mops/web/t100sb12'

driver.get(url)
# 申報年度=111, 揭露年度=110
year_input = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.NAME, "year")))
year_input.clear()
year_input.send_keys("111")

# List to store data for all companies
all_company_data = []

for company_id in id_Ltd:
    try:
        print("Processing company ID:", company_id)
        
        # 公司代號或簡稱
        id_input = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "co_id")))   
        id_input.clear()
        id_input.send_keys(str(company_id))
        time.sleep(2)
                
        search_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "/html/body/center/table/tbody/tr/td/div[4]/table/tbody/tr/td/div/table/tbody/tr/td[3]/div/div[3]/form/table/tbody/tr/td[4]/table/tbody/tr/td[2]/div/div/input")))
        search_button.click()

        # Wait for the search results to load
        time.sleep(1)  # Adjust sleep time as needed
        
        # Find the table containing the data (assuming the table reloads with new data)
        center_table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='table01']/center/table")))

        # Extract data from the table
        company_data = {}
        rows = center_table.find_elements(By.TAG_NAME, "tr")
        for row in rows:
            header = row.find_element(By.TAG_NAME, "th").text.strip()  # Extract the header
            try:
                content_cell = row.find_element(By.TAG_NAME, "td")  # Find the <td> cell in the row
                content = content_cell.find_element(By.TAG_NAME, "pre").text.strip()  # Extract the content from <pre>
            except NoSuchElementException:
                content = company_id  # Placeholder value if <td> or <pre> not found
            company_data[header] = content
    
        # Append the data for this company to the list
        all_company_data.append(company_data)
        
        id_input.clear() 
        
        print("Data collected for company ID:", company_id)
        
    except Exception as e:
        print("Error processing company ID:", company_id)
        print("Error message:", str(e))
                  
driver.quit()

Processing company ID: 1101
Data collected for company ID: 1101
Processing company ID: 1102
Data collected for company ID: 1102
Processing company ID: 1103
Data collected for company ID: 1103
Processing company ID: 1104
Data collected for company ID: 1104
Processing company ID: 1108
Data collected for company ID: 1108
Processing company ID: 1109
Data collected for company ID: 1109
Processing company ID: 1110
Data collected for company ID: 1110
Processing company ID: 1201
Data collected for company ID: 1201
Processing company ID: 1203
Data collected for company ID: 1203
Processing company ID: 1210
Data collected for company ID: 1210
Processing company ID: 1213
Data collected for company ID: 1213
Processing company ID: 1215
Error processing company ID: 1215
Error message: Message: 

Processing company ID: 1216
Data collected for company ID: 1216
Processing company ID: 1217
Data collected for company ID: 1217
Processing company ID: 1218
Data collected for company ID: 1218
Processing compa

In [46]:
cleaned_all_company_data = [clean_company_data(company_data) for company_data in all_company_data]

# Print the cleaned data for all companies
for idx, cleaned_company_data in enumerate(cleaned_all_company_data, start=1):
    print("Company", idx, "cleaned data:", cleaned_company_data)

Company 1 cleaned data: {'項目': '1101', '員工福利政策': '台泥自成立以來一向重視員工權益及福利，針對有關勞資關係之制度與措施，皆遵照並落實勞工法令規定辦理，以確保公司員工權益保障之透明度。透過勞資會議、部門會議、員工大會、意見箱等管道，建立員工溝通之機制，了解員工需求，達到有效溝通及維持和諧勞資關係。本公司提除供優質工作場域及具市場競爭力之薪酬福利制度，更積極於培育人才；對於員工各項福利、進修、訓練、退休制度等，皆以優於或符合法令之前提下進行規劃執行，致力於打造勞資和諧之工作環境。', '員工權益維護措施': '1)台泥提供具競爭力的薪資與獎酬，並規劃完整職等、職級制度，不因性別、種族、宗教、政治立場、婚姻狀況而有差別待遇，以保障員工權益，並吸引及保留人才。2)本公司恪守全球各營運據點所在地之勞動相關法規，保障員工之合法權益，並遵循《聯合國全球盟約》、《聯合國世界人權宣言》及《國際勞工組織工作基本原則與權利宣言》等各項國際人權公約所揭櫫之人權保護精神與基本原則，體現尊重與保護人權之責任。3)本公司依法設置「勞工退休準備金監督委員會」定期提撥退休金提存於台灣銀行股份有限公司(原中央信託局)，且定期召開委員會，審核退休金提撥及運用情形，以保障同仁之權益。另針對選擇採用勞工退休金新制之員工，依法令規定每月提繳至員工在勞保局之個人帳戶,以維護員工權益。4)為維護友善安心的工作環境，設立員工安心平台及意見信箱，透過設置多元溝通管道，建立良好的溝通及諮詢交流，使員工之想法與意見得以及時獲得反應與處理，促進勞資雙向溝通。', '勞資糾紛情形': '截至110年度及年報刊印日止,本公司並無因勞資糾紛;遭受重大損失。', '履行社會責任': '1)本公司依勞基法及相關法令規定進行員工管理規則之制定。2)依「性別工作平等法」設立性騷擾申訴專用信箱，提供員工申訴管道。3)設置員工餐廳及休閒健身房，定期進行辦公大樓安檢相關作業，以提供安全及健康之工作環境。4)不定期舉行職能教育訓練，提昇員工職涯競爭力。5)訂定及定期執行台泥人權盡職調查程序，透過人權議題風險辨識與評估、依評估結果設計管理措施及風險減緩措施，進行改善及後續追蹤，以利有效降低人權風險之影響與衝擊。6)本公司對社會公益參與不遺餘力，與社福團體合作，物資募捐、關懷偏鄉弱勢學童及獨居長輩居家

In [None]:
df110_raw = pd.DataFrame(cleaned_all_company_data)
df110_raw.rename(columns={'項目': '代號'})
df110_raw.to_csv('database\\110_raw_policyInfo.csv', index=False, encoding='utf-8-sig')

In [64]:
df_110 = policy_info_df(cleaned_all_company_data, id_Ltd, 
                     speech_keywords, train_keywords, occuhealth_keywords, consumer_neg_keywords, social_keywords, salary_keywords, 
                     consumer_pos_keywords, supplier_esg_keywords, risk_keywords, nocorrupt_keywords, conservation_keywords)
df_110

AttributeError: 'int' object has no attribute 'values'

In [49]:
df_110.to_csv('database\\110_policyInfo.csv', index=False, encoding='utf-8-sig')

##### 111

In [50]:
# Set up Selenium WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://mops.twse.com.tw/mops/web/t100sb12'

driver.get(url)
# 申報年度=112, 揭露年度=111
year_input = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.NAME, "year")))
year_input.clear()
year_input.send_keys("112")

# List to store data for all companies
all_company_data = []

for company_id in id_Ltd:
    try:
        print("Processing company ID:", company_id)
        
        # 公司代號或簡稱
        id_input = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, "co_id")))   
        id_input.clear()
        id_input.send_keys(str(company_id))
        
        time.sleep(2)
        
        search_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "/html/body/center/table/tbody/tr/td/div[4]/table/tbody/tr/td/div/table/tbody/tr/td[3]/div/div[3]/form/table/tbody/tr/td[4]/table/tbody/tr/td[2]/div/div/input")))
        search_button.click()

        # Wait for the search results to load
        time.sleep(2)  # Adjust sleep time as needed
        
        # Find the table containing the data (assuming the table reloads with new data)
        center_table = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, "//*[@id='table01']/center/table")))

        # Extract data from the table
        company_data = {}
        rows = center_table.find_elements(By.TAG_NAME, "tr")
        for row in rows:
            header = row.find_element(By.TAG_NAME, "th").text.strip()  # Extract the header
            try:
                content_cell = row.find_element(By.TAG_NAME, "td")  # Find the <td> cell in the row
                content = content_cell.find_element(By.TAG_NAME, "pre").text.strip()  # Extract the content from <pre>
            except NoSuchElementException:
                content = company_id  # Placeholder value if <td> or <pre> not found
            company_data[header] = content
    
        # Append the data for this company to the list
        all_company_data.append(company_data)
        
        id_input.clear() 
        
        print("Data collected for company ID:", company_id)
        
    except Exception as e:
        print("Error processing company ID:", company_id)
        print("Error message:", str(e))
                  
driver.quit()

Processing company ID: 1101
Data collected for company ID: 1101
Processing company ID: 1102
Data collected for company ID: 1102
Processing company ID: 1103
Data collected for company ID: 1103
Processing company ID: 1104
Data collected for company ID: 1104
Processing company ID: 1108
Data collected for company ID: 1108
Processing company ID: 1109
Data collected for company ID: 1109
Processing company ID: 1110
Data collected for company ID: 1110
Processing company ID: 1201
Data collected for company ID: 1201
Processing company ID: 1203
Data collected for company ID: 1203
Processing company ID: 1210
Data collected for company ID: 1210
Processing company ID: 1213
Data collected for company ID: 1213
Processing company ID: 1215
Data collected for company ID: 1215
Processing company ID: 1216
Data collected for company ID: 1216
Processing company ID: 1217
Data collected for company ID: 1217
Processing company ID: 1218
Data collected for company ID: 1218
Processing company ID: 1219
Data collect

In [51]:
cleaned_all_company_data = [clean_company_data(company_data) for company_data in all_company_data]

# Print the cleaned data for all companies
for idx, cleaned_company_data in enumerate(cleaned_all_company_data, start=1):
    print("Company", idx, "cleaned data:", cleaned_company_data)

Company 1 cleaned data: {'項目': '1101', '員工福利政策': '台泥自成立以來一向重視員工權益及福利，針對有關勞資關係之制度與措施，皆遵照並落實勞工法令規定辦理，以確保公司員工權益保障之透明度。透過勞資會議、部門會議、員工大會、意見箱等管道，建立員工溝通之機制，了解員工需求，達到有效溝通及維持和諧勞資關係。本公司提除供優質工作場域及具市場競爭力之薪酬福利制度，更積極於培育人才；對於員工各項福利、進修、訓練、退休制度等，皆以優於或符合法令之前提下進行規劃執行，致力於打造勞資和諧之工作環境。', '員工權益維護措施': '1)台泥提供具競爭力的薪資與獎酬，並規劃完整職等、職級制度，不因性別、種族、宗教、政治立場、婚姻狀況而有差別待遇，以保障員工權益，並吸引及保留人才。2)本公司恪守全球各營運據點所在地之勞動相關法規，保障員工之合法權益，並遵循《聯合國全球盟約》、《聯合國世界人權宣言》及《國際勞工組織工作基本原則與權利宣言》等各項國際人權公約所揭櫫之人權保護精神與基本原則，體現尊重與保護人權之責任。3)本公司依法設置「勞工退休準備金監督委員會」定期提撥退休金提存於台灣銀行股份有限公司(原中央信託局)，且定期召開委員會，審核退休金提撥及運用情形，以保障同仁之權益。另針對選擇採用勞工退休金新制之員工，依法令規定每月提繳至員工在勞保局之個人帳戶,以維護員工權益。4)為維護友善安心的工作環境，設立員工安心平台及意見信箱，透過設置多元溝通管道，建立良好的溝通及諮詢交流，使員工之想法與意見得以及時獲得反應與處理，促進勞資雙向溝通。', '勞資糾紛情形': '民國一百一十一年及截至年報刊印日止，本公司並無因勞資糾紛而遭受損失。然而，本公司有因勞動檢查結果被裁定違反法令事項，被處以罰鍰如下：《111年7月22日北市勞動字第11160281571號函》裁定因正常工作時間連同延長工作時間超過12小時，違反勞動基準法第32條第2項，被處以罰鍰新台幣5萬元。本公司針對員工的工時管理除既有的每日下班提醒予全體員工進行宣導，已再次進行內部傳達，亦保持暢通的溝通管道。若因工作需求，須得延長工時，提醒員工可事先提出加班申請，並依其意願選擇加班費或補休假。台泥公司將持續加強勞雇關係，致力於支持員工兼顧工作與家庭照顧。', '履行社會責任': '1)本公司依勞基法及相關法

In [52]:
df111_raw = pd.DataFrame(cleaned_all_company_data)
df111_raw.rename(columns={'項目': '代號'})
df111_raw.to_csv('database\\111_raw_policyInfo.csv', index=False, encoding='utf-8-sig')

In [53]:
df_111 = policy_info_df(cleaned_all_company_data, id_Ltd, 
                     speech_keywords, train_keywords, occuhealth_keywords, consumer_neg_keywords, social_keywords, salary_keywords, 
                     consumer_pos_keywords, supplier_esg_keywords, risk_keywords, nocorrupt_keywords, conservation_keywords)
df_111

Unnamed: 0,代號,free_speech,staff_train,occupation_health,consumer_neg,consumer_pos,supplier_esg,social,risk_management,nocorrupt,conservation
0,1101,1,1,0,0,0,0,0,0,0,0
1,1102,0,1,0,0,0,0,1,0,0,0
2,1103,0,1,0,0,0,0,0,0,0,0
3,1104,1,1,0,0,0,0,0,0,0,0
4,1108,1,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
897,9914,0,1,0,0,0,0,0,0,0,0
898,9917,0,1,0,0,0,0,0,0,0,0
899,9918,0,1,0,0,0,0,0,0,0,0
900,9919,0,1,0,0,0,0,1,0,0,0


In [54]:
df_111.to_csv('database\\111_policyInfo.csv', index=False, encoding='utf-8-sig')