- [ctrip](#ctrip)
- [cenews](#cenews)
- [people](#people)
- [mee](#mee)

## ctrip

In [5]:
import requests
import json
import time
from tqdm import tqdm

In [3]:
parks = {
    # 福建
    "武夷山-test": 126481,
    # 海南
    # "吊罗山": 21835,
    # "尖峰岭": 3233,
    # "黎母山": 18195,
    # "鹦哥岭": 145248513,
    # "五指山": 3211,
    # "五指山水满河": 110178,

    # "霸王岭": 1730760,
    # "霸王岭雅加": 143727908,
    # "霸王岭白石潭": 143748445,
    # "maorui": ,
}

In [None]:
def scrape_ctrip(
        total_pages: int = 302, 
        spot_name: str = "武夷山",
        spot_id: int = 126481,
    ) -> list:
    # Store all collected data
    all_comments = []
    
    # Ctrip API
    url = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
    
    # Headers that stay consistent ACROSS scenic spots
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0",
        "Content-Type": "application/json",
        "_fxpcqlniredt": "09031109310692966540"
        # No need to include x-traceID as it's generated per request
    }
    
    # Customize request payload 
    payload = {     # 手机端参数
        "arg": {
            "channelType": 7,
            "collapseType": 1,
            "commentTagId": 0,
            "pageIndex": 1,
            "pageSize": 10,
            "pageType": 1,
            "poiId": 'null',
            "resourceId": spot_id,
            "resourceType": 11,
            "sortType": 6,
            "sourceType": 1,
            "starType": 0,
            "videoImageSize": "700_392"
        },
        "contentType": "json",
        "head": {
            "auth": "",
            "cid": "09031109310692966540",
            "ctok": "",
            "cver": "1.0",
            "extension": [
                {
                    "name": "source",
                    "value": "web"
                },
                {
                    "name": "technology",
                    "value": "H5"
                },
                {
                    "name": "os",
                    "value": "PC"
                },
                {
                    "name": "application",
                    "value": ""
                }
            ],
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "xsid": ""
        }
    }

    with tqdm(total=total_pages, desc=f"Scraping {spot_name}", unit='page') as pbar:
        for page in range(1, total_pages + 1):
            # Update page number in payload
            payload["arg"]["pageIndex"] = page
            response = requests.post(url, headers=headers, json=payload)
            try:
                # Parse the JSON directly
                data = response.json()
                comments = data["result"]['items']
                if not isinstance(comments, list):
                    # print(
                    #     f"Expected a list of comments, \
                    #     got {type(comments)} for page {page}"
                    # )
                    break
                # Append comments to the list
                all_comments.extend(comments)

                pbar.set_postfix({
                    'Collected': len(all_comments),
                    'Current page': page,
                    'Comments': len(comments),
                })
                # Add delay to avoid rate limiting
                # time.sleep(2)

            except Exception as e:
                print(f"Request failed for page {page}: {e}\n")
                print(f"Response: \n{response.status_code}\n{response.json()}")
                break  # Exit loop on error
            
            finally:
                pbar.update(1)

    print(f"Collected {len(all_comments)} for {spot_name} (ID: {spot_id})")
    return all_comments

In [None]:
for park_name, park_id in parks.items():
    comments = scrape_ctrip(
        spot_name=park_name, 
        spot_id=park_id
    )
    # Save comments to a JSON file
    with open(f"ctrip_{park_name}.json", "w", encoding="utf-8") as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(comments)} comments to ctrip_{park_name}.json")


Scraping 武夷山-test:   0%|          | 0/301 [00:00<?, ?page/s]

Scraping 武夷山-test: 100%|██████████| 301/301 [02:32<00:00,  1.97page/s, Collected=3010, Current page=301, Comments=10]


Collected 3010 for 武夷山-test (ID: 126481)
Saved 3010 comments to ctrip_武夷山-test.json


## cenews

In [8]:
def scrape_cenews(total_pages=300):
    articles = []
    url = "https://cmsapi.cenews.com.cn/cmsapi/searchArticle"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json"
    }
    payload = {
        "keyWord": "生态文明",
        "searchLocation": "0",
        "searchType": "0",
        "beginTime": "2008-01-01 00:00:00",
        "endTime": "2025-05-01 23:59:59",
        "orderType": "1",
        "pageNumber": "1",
        "pageSize": "20",
        "columnID": "-1",
        "subSiteID": "1",
        "siteID": "-1",
        "includeSubNode": "1",
        "total": "39589"
    }

    for page in range(1, total_pages + 1):
        payload["pageNumber"] = str(page)
        
        response = requests.post(url, headers=headers, json=payload)
        
        if response.status_code == 200:
            data = response.json()
            if "list" in data:
                articles.extend(data["list"])
                print(f"Collected {len(data['list'])} articles from page {page}")
                
                # Check if we have reached the last page
                if len(data["list"]) < 20:
                    print("Reached last page")
                    break
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        
        time.sleep(2)
    return articles

# Execute the function
articles = scrape_cenews()
    

Collected 20 articles from page 1
Collected 20 articles from page 2
Collected 20 articles from page 3
Collected 20 articles from page 4
Collected 20 articles from page 5
Collected 20 articles from page 6
Collected 20 articles from page 7
Collected 20 articles from page 8
Collected 20 articles from page 9
Collected 20 articles from page 10
Collected 20 articles from page 11
Collected 20 articles from page 12
Collected 20 articles from page 13
Collected 20 articles from page 14
Collected 20 articles from page 15
Collected 20 articles from page 16
Collected 20 articles from page 17
Collected 20 articles from page 18
Collected 20 articles from page 19
Collected 20 articles from page 20
Collected 20 articles from page 21
Collected 20 articles from page 22
Collected 20 articles from page 23
Collected 20 articles from page 24
Collected 20 articles from page 25
Collected 20 articles from page 26
Collected 20 articles from page 27
Collected 20 articles from page 28
Collected 20 articles from pa

In [9]:
# Save the articles to a JSON file
with open("cenews_articles.json", "w", encoding="utf-8") as f:
    json.dump(articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(articles)}")

Total articles collected: 6000


## people

In [14]:
def scrape_people(total_pages=100):
    articles = []
    url = "http://search.people.cn/search-platform/front/search"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Content-Type": "application/json"
    }
    payload = {
        "endTime": 0,
        "hasContent": 'true',
        "hasTitle": 'true',
        "isFuzzy": 'true',
        "key": "生态文明",
        "limit": 10,
        "page": 1,
        "sortType": 2,
        "startTime": 0,
        "type": 0,
    }
    
    for page in range(1, total_pages + 1):
        payload["page"] = page
        response = requests.post(url, headers=headers,json=payload)
        
        if response.status_code == 200:
            data = response.json()
            records = data["data"]["records"]
            if not records:
                print(f"No records found for page {page}")
                break
            articles.extend(records)
            print(f"Collected {len(records)} articles from page {page}")
            
            # Check if we have reached the last page
            if len(records) < 10:
                print("Reached last page")
                break
        else:
            print(f"Failed to fetch page {page}: {response.status_code}")
        
        time.sleep(2)
    return articles

In [15]:
people_articles = scrape_people()

Collected 10 articles from page 1
Failed to fetch page 2: 403
Collected 10 articles from page 3
Failed to fetch page 4: 403
Collected 10 articles from page 5
Failed to fetch page 6: 403
Collected 10 articles from page 7
Failed to fetch page 8: 403
Failed to fetch page 9: 403
Collected 10 articles from page 10
Collected 10 articles from page 11
Collected 10 articles from page 12
Failed to fetch page 13: 403
Collected 10 articles from page 14
Failed to fetch page 15: 403
Collected 10 articles from page 16
Collected 10 articles from page 17
Collected 10 articles from page 18
Collected 10 articles from page 19
Failed to fetch page 20: 403
Collected 10 articles from page 21
Collected 10 articles from page 22
Collected 10 articles from page 23
Collected 10 articles from page 24
Collected 10 articles from page 25
Collected 10 articles from page 26
Collected 10 articles from page 27
Collected 10 articles from page 28
Collected 10 articles from page 29
Collected 10 articles from page 30
Collect

In [16]:
# Save the articles to a JSON file
with open("people_articles.json", "w", encoding="utf-8") as f:
    json.dump(people_articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(people_articles)}")

Total articles collected: 920


## mee

In [4]:
!pip install tqdm selectolax



In [7]:
from tqdm import tqdm
from selectolax.parser import HTMLParser
import requests
import time
import json

In [20]:
def scrape_mee(total_pages=30):
    articles = []
    base_url = "https://www.mee.gov.cn/was5/web/search?"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0",
        "Content-Type": 'text/html; charset=UTF-8'
    }

    params = {
        "channelid": "270514",
        "searchword": "",
        "page": "1",
        "orderby": "-docreltime",
        "searchscope": "",
        "timestart": "",
        "timeend": "",
        "period": "",
        "chnls": "3",
        "andsen": "",
        "total": "生态文明",
        "orsen": "",
        "exclude": ""
	}

    with tqdm(total=total_pages, desc="Scraping MEE", unit='page') as pbar:
        for page in range(1, total_pages + 1):
            try:
                response = requests.get(base_url, headers=headers, params={**params, "page": page})
                if response.status_code == 200:
                    tree = HTMLParser(response.text)
                    items = tree.css('li.li')
                    page_articles = []
                    for item in items:
                        url = item.css_first('a').attrs['href']
                        category = item.css_first('em').text(strip=True)
                        title = item.css_first('h2').text().replace(category, '').strip()
                        date = item.css_first('span').text(strip=True)
                        page_articles.append({
                            "title": title,
                            "url": url,
                            "category": category,
                            "date": date
                        })
                    articles.extend(page_articles)

                    pbar.set_postfix({
                        'Collected': len(articles),
                        'Current': len(page_articles),
                    })
                time.sleep(2)  # Respectful delay between requests
            
            except Exception as e:
                tqdm.write(f"Error on page {page}: {str(e)}")
            
            finally:
                pbar.update(1)
    
    return articles

In [21]:
mee_articles = scrape_mee()

Scraping MEE: 100%|██████████| 30/30 [01:31<00:00,  3.06s/page, Collected=261, Current=0] 


In [22]:
# Save the articles to a JSON file
with open("mee_政策文件.json", "w", encoding="utf-8") as f:
    json.dump(mee_articles, f, ensure_ascii=False, indent=2)
print(f"Total articles collected: {len(mee_articles)}")

Total articles collected: 261
