## ctrip

In [None]:
import requests
import json
import time
# import tqdm

def scrape_ctrip_comments(total_pages=300):
    # Store all collected data
    all_comments = []
    
    # Base URL and headers
    url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
    
    # Headers that stay consistent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json",
        "_fxpcqlniredt": "09031109310692966540"
        # No need to include x-traceID as it's generated per request
    }
    
    # Customize request payload 

    payload = {"arg":{"channelType":2,"collapseType":0,"commentTagId":0,"pageIndex":1,"pageSize":10,"poiId":10542436,"sourceType":1,"sortType":3,"starType":0},"head":{"cid":"09031109310692966540","ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":"","xsid":"","extension":[]}}
    
    # Loop through pages
    for page in range(1, total_pages + 1):
        # Update page number in payload
        payload["arg"]["pageIndex"] = page
        
        # Make the request
        response = requests.post(url, headers=headers, json=payload)
        
        # Check if request was successful
        if response.status_code == 200:
            # Parse the JSON directly
            data = response.json()
            
            # Extract the comments from the response
            if "result" in data:
                # Check if the result contains items
                if "items" not in data["result"]:
                    print(f"No items found in result for page {page}")
                    break
                comments = data["result"]['items']
                all_comments.extend(comments)
                print(f"Collected {len(comments)} comments from page {page}")
                
                # Handle pagination info if available
                if "totalCount" in data['result']:
                    total_pages_available = data["result"]["totalCount"]
                    if page >= total_pages_available:
                        print("Reached last page")
                        break
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        
        # Add delay to avoid rate limiting
        time.sleep(2)
    
    return all_comments

# Execute the function
comments = scrape_ctrip_comments()

In [6]:
import json

# Save the comments to a JSON file
with open("ctrip_comments.json", "w", encoding="utf-8") as f:
    json.dump(comments, f, ensure_ascii=False, indent=2)
print(f"Total comments collected: {len(comments)}")

Total comments collected: 3000


## cenews

In [8]:
def scrape_cenews(total_pages=300):
    articles = []
    url = "https://cmsapi.cenews.com.cn/cmsapi/searchArticle"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json"
    }
    payload = {
        "keyWord": "生态文明",
        "searchLocation": "0",
        "searchType": "0",
        "beginTime": "2008-01-01 00:00:00",
        "endTime": "2025-05-01 23:59:59",
        "orderType": "1",
        "pageNumber": "1",
        "pageSize": "20",
        "columnID": "-1",
        "subSiteID": "1",
        "siteID": "-1",
        "includeSubNode": "1",
        "total": "39589"
    }

    for page in range(1, total_pages + 1):
        payload["pageNumber"] = str(page)
        
        response = requests.post(url, headers=headers, json=payload)
        
        if response.status_code == 200:
            data = response.json()
            if "list" in data:
                articles.extend(data["list"])
                print(f"Collected {len(data['list'])} articles from page {page}")
                
                # Check if we have reached the last page
                if len(data["list"]) < 20:
                    print("Reached last page")
                    break
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        
        time.sleep(2)
    return articles

# Execute the function
articles = scrape_cenews()
    

Collected 20 articles from page 1
Collected 20 articles from page 2
Collected 20 articles from page 3
Collected 20 articles from page 4
Collected 20 articles from page 5
Collected 20 articles from page 6
Collected 20 articles from page 7
Collected 20 articles from page 8
Collected 20 articles from page 9
Collected 20 articles from page 10
Collected 20 articles from page 11
Collected 20 articles from page 12
Collected 20 articles from page 13
Collected 20 articles from page 14
Collected 20 articles from page 15
Collected 20 articles from page 16
Collected 20 articles from page 17
Collected 20 articles from page 18
Collected 20 articles from page 19
Collected 20 articles from page 20
Collected 20 articles from page 21
Collected 20 articles from page 22
Collected 20 articles from page 23
Collected 20 articles from page 24
Collected 20 articles from page 25
Collected 20 articles from page 26
Collected 20 articles from page 27
Collected 20 articles from page 28
Collected 20 articles from pa

In [9]:
# Save the articles to a JSON file
with open("cenews_articles.json", "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(articles)}")

Total articles collected: 6000


## people

In [14]:
def scrape_people(total_pages=100):
    articles = []
    url = "http://search.people.cn/search-platform/front/search"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json"
    }
    payload = {
        "endTime": 0,
        "hasContent": 'true',
        "hasTitle": 'true',
        "isFuzzy": 'true',
        "key": "生态文明",
        "limit": 10,
        "page": 1,
        "sortType": 2,
        "startTime": 0,
        "type": 0,
    }
    
    for page in range(1, total_pages + 1):
        payload["page"] = page
        response = requests.post(url, headers=headers,json=payload)
        
        if response.status_code == 200:
            data = response.json()
            records = data["data"]["records"]
            if not records:
                print(f"No records found for page {page}")
                break
            articles.extend(records)
            print(f"Collected {len(records)} articles from page {page}")
            
            # Check if we have reached the last page
            if len(records) < 10:
                print("Reached last page")
                break
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        
        time.sleep(2)
    return articles

In [15]:
people_articles = scrape_people()

Collected 10 articles from page 1
Failed to fetch page 2: 403
Collected 10 articles from page 3
Failed to fetch page 4: 403
Collected 10 articles from page 5
Failed to fetch page 6: 403
Collected 10 articles from page 7
Failed to fetch page 8: 403
Failed to fetch page 9: 403
Collected 10 articles from page 10
Collected 10 articles from page 11
Collected 10 articles from page 12
Failed to fetch page 13: 403
Collected 10 articles from page 14
Failed to fetch page 15: 403
Collected 10 articles from page 16
Collected 10 articles from page 17
Collected 10 articles from page 18
Collected 10 articles from page 19
Failed to fetch page 20: 403
Collected 10 articles from page 21
Collected 10 articles from page 22
Collected 10 articles from page 23
Collected 10 articles from page 24
Collected 10 articles from page 25
Collected 10 articles from page 26
Collected 10 articles from page 27
Collected 10 articles from page 28
Collected 10 articles from page 29
Collected 10 articles from page 30
Collect

In [16]:
# Save the articles to a JSON file
with open("people_articles.json", "w", encoding="utf-8") as f:
    json.dump(people_articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(people_articles)}")

Total articles collected: 920


## mee

In [4]:
!pip install tqdm selectolax



In [7]:
from tqdm import tqdm
from selectolax.parser import HTMLParser
import requests
import time
import json

In [20]:
def scrape_mee(total_pages=30):
    articles = []
    base_url = "https://www.mee.gov.cn/was5/web/search?"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0",
        "Content-Type": 'text/html; charset=UTF-8'
    }

    params = {
        "channelid": "270514",
        "searchword": "",
        "page": "1",
        "orderby": "-docreltime",
        "searchscope": "",
        "timestart": "",
        "timeend": "",
        "period": "",
        "chnls": "3",
        "andsen": "",
        "total": "生态文明",
        "orsen": "",
        "exclude": ""
	}

    with tqdm(total=total_pages, desc="Scraping MEE", unit='page') as pbar:
        for page in range(1, total_pages + 1):
            try:
                response = requests.get(base_url, headers=headers, params={**params, "page": page})
                if response.status_code == 200:
                    tree = HTMLParser(response.text)
                    items = tree.css('li.li')
                    page_articles = []
                    for item in items:
                        url = item.css_first('a').attrs['href']
                        category = item.css_first('em').text(strip=True)
                        title = item.css_first('h2').text().replace(category, '').strip()
                        date = item.css_first('span').text(strip=True)
                        page_articles.append({
                            "title": title,
                            "url": url,
                            "category": category,
                            "date": date
                        })
                    articles.extend(page_articles)

                    pbar.set_postfix({
                        'Collected': len(articles),
                        'Current': len(page_articles),
                    })
                time.sleep(2)  # Respectful delay between requests
            
            except Exception as e:
                tqdm.write(f"Error on page {page}: {str(e)}")
            
            finally:
                pbar.update(1)
    
    return articles

In [21]:
mee_articles = scrape_mee()

Scraping MEE: 100%|██████████| 30/30 [01:31<00:00,  3.06s/page, Collected=261, Current=0] 


In [22]:
# Save the articles to a JSON file
with open("mee_政策文件.json", "w", encoding="utf-8") as f:
    json.dump(mee_articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(mee_articles)}")

Total articles collected: 261
