In [1]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Step 1: URL of the main page with links to all pinyin categories
main_url = 'https://www.chinese-tools.com/chinese/chengyu/dictionary/all.html'

# Browser-like request headers, reused by the later scraping cell as well.
# 'Accept-Encoding' is deliberately not set: requests advertises the encodings
# it can actually decode on its own, whereas hard-coding 'br' can produce an
# undecodable body when no Brotli decoder is installed in the environment.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# Step 2: Send a request to fetch the HTML content of the main page
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(main_url, headers=headers, timeout=30)

# Fail loudly on a 4xx/5xx answer so later cells never parse an error page.
response.raise_for_status()

print(response.ok)


True


In [2]:
soup = BeautifulSoup(response.text, 'lxml')

# Sanity check: show only the beginning of the parsed document. Printing the
# whole page floods the notebook output and bloats the saved file.
print(str(soup)[:1000])

# Step 3: Extract all links to Chengyu list pages from the pinyin section
# - The `[href]` filter skips anchors without an href attribute, which would
#   otherwise raise KeyError on `link['href']`.
# - urljoin resolves relative hrefs against the page URL and leaves absolute
#   URLs unchanged, so every entry can be passed straight to requests.
pinyin_links = [
    urljoin(response.url, link['href'])
    for link in soup.select('div.ctCyPinyin a.ctCyPinyinA[href]')
]

print(pinyin_links)


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head><!-- VSTags Header -->
<script type="text/javascript">
			var vsCFTagsEUFunctions = vsCFTagsEUFunctions || [];
			var vsCFTagsNonEuFunctions = vsCFTagsNonEuFunctions || [];
			function vsQueueAd(adunit) {
				googletag.cmd.push(function() { 
				if (window.deployads) {
				deployads.push(function() {
					deployads.gpt.display(adunit); 
					});
				} else {
					googletag.display(adunit); 
				}
				});
			}
			</script>
<!-- /VSTags Header -->
<!-- Sortable -->
<script type="text/javascript">
			// Initialize the deployads array, for asynchronous use
			window.deployads = window.deployads || [];
			</script>
<script async="" src="//c.amazon-adsystem.com/aax2/apstag.js"></script>
<script>!function(a9,a,p,s,t,A,g){if(a[a9])return;function q(c,r){a[a9]._Q.push([c,r])}a[a9]={unLoaded:1,init:function(){q("i",arguments)},fetchBids:function(){q("f",arguments)},setDispla

In [4]:
# Step 4: Initialize a list to store the extracted data
chengyu_data = []

# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30
# Pause after each detail-page request to avoid overloading the server.
# Set to 0 to disable the throttling.
REQUEST_DELAY_SECONDS = 0.5


def _fetch_soup(session, url):
    """Download `url` and return the parsed page, or None if the request failed.

    Connection errors, timeouts and HTTP 4xx/5xx answers are reported on
    stdout and turned into None so that a single bad page does not abort the
    whole crawl and discard the rows collected so far.
    """
    try:
        page = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        page.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url}: {error}")
        return None
    return BeautifulSoup(page.text, 'lxml')


def _extract_text(page_soup, selector):
    """Return the stripped text of the first node matching `selector`, or None."""
    node = page_soup.select_one(selector)
    return node.get_text(strip=True) if node else None


# One session for the whole crawl so the connection can be reused between
# requests. Local names (`list_soup`, `detail_soup`) are used on purpose: the
# notebook-level `response` and `soup` from the earlier cells stay untouched,
# so those cells can be re-run without picking up state from this loop.
with requests.Session() as session:
    # Step 5: Loop through each pinyin link to access the Chengyu list pages
    for pinyin_link in pinyin_links:
        list_soup = _fetch_soup(session, pinyin_link)
        if list_soup is None:
            continue

        # Step 6: Extract all Chengyu links from the Chengyu list page
        # (anchors without href are skipped; relative hrefs are resolved
        # against the list page URL, absolute ones are left unchanged)
        chengyu_links = [
            urljoin(pinyin_link, link['href'])
            for link in list_soup.select('div.ctCyChengyu a.ctCyChengyuA[href]')
        ]

        # Step 7: Loop through each Chengyu link to extract detailed information
        for chengyu_link in chengyu_links:
            detail_soup = _fetch_soup(session, chengyu_link)

            # Add a delay to avoid overloading the server
            time.sleep(REQUEST_DELAY_SECONDS)

            if detail_soup is None:
                continue

            # Extract the Chengyu, the Pinyin and the Explanation
            # (meaning, context, example)
            chengyu = _extract_text(detail_soup, 'div.ctCyC1')
            pinyin = _extract_text(detail_soup, 'div.ctCyC2')
            explanation = _extract_text(detail_soup, 'div.ctCyC4')

            # Entries without an explanation are skipped. A page that lacks
            # the Chengyu or Pinyin node is skipped too instead of raising
            # AttributeError on a missing element.
            if chengyu is None or pinyin is None or not explanation:
                continue

            # Append the data to the list
            chengyu_data.append({
                'Chengyu': chengyu,
                'Pinyin': pinyin,
                'Explanation': explanation
            })


In [5]:
# Step 8: Build a DataFrame from the collected records and write it to CSV.
# Each dict in `chengyu_data` becomes one row.
chengyu_df = pd.DataFrame(data=chengyu_data)

# 'utf-8-sig' prepends a byte-order mark to the file (presumably so that
# spreadsheet tools detect the encoding of the Chinese text -- TODO confirm);
# the row index is not written.
chengyu_df.to_csv(
    path_or_buf='chengyu_data.csv',
    encoding='utf-8-sig',
    index=False,
)

print("Data collection complete. Saved to 'chengyu_data.csv'.")

Data collection complete. Saved to 'chengyu_data.csv'.
