In [None]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import csv
import time
import random

# urllib3 is imported, and its InsecureRequestWarning silenced, BEFORE the
# first request made with verify=False; otherwise that request still warns.
import urllib3
from urllib.parse import urljoin

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0'
}
base_url = "https://www.tdcj.texas.gov"
list_url = f"{base_url}/death_row/dr_executed_offenders.html"

# 1. Collect the links to every detail page.
# verify=False skips TLS certificate verification for this request.
response = requests.get(list_url, headers=headers, timeout=1000, verify=False)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty link list.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

links = []

table_rows = soup.select("table tr")[1:]  # skip the header row
# Example: https://www.tdcj.texas.gov/death_row/dr_info/mullistravis.html
for row in table_rows:
    cells = row.find_all("td")
    if len(cells) < 2:
        # A row without a second column has no link to read; indexing it
        # would raise IndexError.
        continue
    link_tag = cells[1].find("a")  # second column
    if link_tag and "href" in link_tag.attrs:
        # Resolve against the listing page URL so relative, root-relative and
        # absolute hrefs all produce a valid link.
        links.append(urljoin(list_url, link_tag['href']))

In [None]:
# Display the list of detail-page URLs.
links

In [None]:
import urllib3
# Silence urllib3's InsecureRequestWarning (requests in this notebook are made with verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def create_session_with_retries(retries=30, backoff_factor=0.3, status_forcelist=(500, 502, 503, 504)):
    """Build a ``requests.Session`` whose HTTP and HTTPS requests are retried.

    Args:
        retries: Used as the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to ``Retry``.
        status_forcelist: HTTP status codes handed to ``Retry`` as the
            statuses that force a retry.

    Returns:
        The configured ``requests.Session``.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    # The same adapter instance is mounted for both URL schemes.
    for scheme_prefix in ("http://", "https://"):
        http_session.mount(scheme_prefix, retrying_adapter)
    return http_session

# Module-level session with retries; extract_info sends its requests through it.
session = create_session_with_retries()

def extract_info(url):
    """Fetch one page and return its visible text.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"URL"`` and ``"text"`` (the page text with
        newlines between text nodes), or ``None`` if any exception occurred
        while fetching or parsing; the error is printed in that case.
    """
    try:
        # SSL verification is disabled for this request.
        page = session.get(url, headers=headers, timeout=10, verify=False)
        page_text = BeautifulSoup(page.content, "html.parser").get_text(separator="\n")
    except Exception as err:
        print(f"错误处理 {url} ：{err}")
        return None

    return {"URL": url, "text": page_text}

# 3. Crawl every detail page and keep the ones that were fetched successfully.
results = []
total_links = len(links)
for position, url in enumerate(links, start=1):
    print(f"正在处理第 {position}/{total_links} 个: {url}")
    data = extract_info(url)
    if data:
        results.append(data)
    time.sleep(1)  # pause between consecutive requests

# 4. Save as CSV.
# The column names are spelled out instead of being read from results[0], so
# an empty result list produces a header-only file rather than an IndexError.
with open("tdcj_on_death_row.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["URL", "text"])
    writer.writeheader()
    writer.writerows(results)

In [None]:
# Collect the results whose URL contains '.jpg' or whose text contains 'Error 404'.
not_found = [
    entry
    for entry in results
    if '.jpg' in entry['URL'] or 'Error 404' in entry['text']
]
not_found

In [None]:
not_found_urls = [item['URL'] for item in not_found]
# Membership is tested against a set so the filter stays linear in the number
# of results; the list itself is kept because it is written to a file later.
_not_found_url_set = set(not_found_urls)
finresults = [item for item in results if item['URL'] not in _not_found_url_set]
finresults

In [None]:
# Number of entries flagged as not found (`len(not)` was a SyntaxError: `not` is a keyword).
len(not_found)

In [None]:
import re

def parse_death_row_info(text):
    """
    Parses the death row information from the provided text and returns a dictionary of key fields.

    Each field is located by its label followed by whitespace (so the value
    may sit on the line after the label); the first match in ``text`` wins.

    Args:
        text (str): The raw text containing death row information. ``None``
            or an empty string yields a dictionary whose values are all ``None``.

    Returns:
        dict: Maps every field name to its stripped value, or to ``None``
        when the field was not found.
    """
    patterns = {
        # Apostrophes, hyphens and periods are allowed so names such as
        # "O'Brien" or "Smith-Jones, Jr." are not cut short.
        "Name": r"Name\s+([\w,.'\- ]+)",
        "TDCJ Number": r"TDCJ Number\s+(\d+)",
        "Date of Birth": r"Date of Birth\s+([\d/]+)",
        "Date Received": r"Date Received\s+([\d/]+)",
        "Education Level": r"Education Level \(Highest Grade Completed\)\s+(\d+)",
        "Date of Offense": r"Date of Offense\s+([\d/]+)",
        "County": r"County\s+([\w ]+)",
        # The negative lookaheads keep the "Race" / "Gender" labels from
        # matching inside "Race and Gender of Victim", which would otherwise
        # yield the bogus values "and" / "of" when the real label is absent.
        "Race": r"Race(?!\s+and\s+Gender)\s+(\w+)",
        "Gender": r"Gender(?!\s+of\s+Victim)\s+(\w+)",
        # Accept typographic primes as well as ASCII ' and " marks.
        "Height": r"Height \(in Feet and Inches\)\s+([\d′″'\" ]+)",
        "Weight": r"Weight \(in Pounds\)\s+(\d+)",
        "Eye Color": r"Eye Color\s+([\w ]+)",
        # Lazily capture everything up to the next section label.
        "Summary of Incident": r"Summary of Incident\s+([\s\S]+?)(?=Co-Defendants|Race and Gender of Victim)",
        "Co-Defendants": r"Co-Defendants\s+([\w ]+)",
        "Race and Gender of Victim": r"Race and Gender of Victim\s+([\w ]+)",
    }

    # Treat a missing page text like an empty one instead of raising TypeError.
    text = text or ""

    parsed = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        parsed[field] = match.group(1).strip() if match else None
    return parsed

In [None]:
# Parsed fields for every entry of finresults, keyed by the entry's URL.
new_dict = {page['URL']: parse_death_row_info(page['text']) for page in finresults}
new_dict

In [None]:
import pandas as pd

# Flatten the nested dict into a list of records: a copy of each info dict
# with the URL added as an extra field.
records = [{**info, 'URL': url} for url, info in new_dict.items()]

# Save as CSV with pandas.
output_csv = "death_row_lastwords_more_info.csv"
df = pd.DataFrame(records)
df.to_csv(output_csv, index=False, encoding="utf-8")

# Report the file that was actually written; the message previously named a
# different file than the one passed to to_csv.
print(f"已成功保存{len(records)}条死刑犯信息记录到{output_csv}文件")

In [None]:
# Write the not-found URLs to a text file, one URL per line.
with open('not_found_urls.txt', 'w') as url_file:
    url_file.writelines(url + '\n' for url in not_found_urls)