In [1]:
import requests
import urllib
import os
import re
import time
import random
import glob
import pandas as pd
from tqdm import tqdm_notebook
from pathlib import Path

from bs4 import BeautifulSoup as soup

In [3]:
# HTTP headers sent with every `requests` call: a desktop Chrome User-Agent
# string, assembled from adjacent literals instead of a backslash continuation.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.149 Safari/537.36"
    ),
}

In [157]:
# Search keywords iterated by the crawl loop further down. No visible cell
# defined `words`, so a fresh kernel failed here with NameError; the list is
# reproduced from this cell's recorded output so the notebook runs top to bottom.
words = [
    '一带一路', '习近平', '李克强', '杨洁篪', '王沪宁', '栗战书', '张高丽', '张德江',
    '汪洋', '杨晶', '郭声琨', '王晨', '孟建柱', '王岐山', '陈雷', '刘奇葆',
    '刘鹤', '丁薛祥', '周文重', '高燕', '楼继伟', '高宝玉', '王毅', '何立峰',
    '孙春兰', '郝明金', '韩正', '肖捷', '夏宝龙', '彭丽媛', '李希', '崔世安',
]
print(words)

['一带一路', '习近平', '李克强', '杨洁篪', '王沪宁', '栗战书', '张高丽', '张德江', '汪洋', '杨晶', '郭声琨', '王晨', '孟建柱', '王岐山', '陈雷', '刘奇葆', '刘鹤', '丁薛祥', '周文重', '高燕', '楼继伟', '高宝玉', '王毅', '何立峰', '孙春兰', '郝明金', '韩正', '肖捷', '夏宝龙', '彭丽媛', '李希', '崔世安']


In [156]:
from requests.exceptions import RequestException, Timeout

def get_search_result(word, page):
    """Fetch one page of people.com.cn search results for a keyword.

    Note: a later cell in this notebook defines another function with the
    same name; once that cell runs, it shadows this definition.

    Args:
        word: Search keyword; it is GBK-encoded and percent-quoted for the URL.
        page: Value placed in the ``pageNum`` query parameter.

    Returns:
        A list of ``(href, link_text)`` tuples taken from the
        ``.page2_list h2 a`` anchors, or an empty list when the request
        fails or the response status is not 200.
    """
    keyword = urllib.parse.quote(word.encode('GBK'))
    url = (
        "http://search.people.com.cn/cnpeople/searchForChannel.do?totalPage=999"
        f"&pageNum={page}&keyword={keyword}&siteName=people&channelName=politics"
    )

    try:
        r = requests.get(url, timeout=7, headers=headers)
    except (RequestException, Timeout) as e:
        print(f'Error {e}')
        return []

    print(r.status_code)
    if r.status_code != 200:
        return []

    # Decode the body as GBK before handing the text to the parser.
    r.encoding = "GBK"
    doc = soup(r.text, "html5lib")
    anchors = doc.select('.page2_list h2 a')
    return [(a['href'], a.get_text()) for a in anchors]

In [41]:
def extract_date(x, fmt=r'.*(\d{4})/(\d{2})(\d{2})'):
    """Pull a date out of a URL and return it as a dash-joined string.

    Args:
        x: The URL (or any string) to inspect.
        fmt: Regular expression applied with ``re.match``; its capture
            groups are joined with ``'-'`` in order. The default captures
            ``YYYY/MMDD`` as three groups, yielding ``'YYYY-MM-DD'``.

    Returns:
        The joined capture groups, or ``None`` when ``x`` is not a string or
        the pattern does not match. Previously a non-matching URL raised
        ``AttributeError`` on ``None.groups()``.
    """
    if not isinstance(x, str):
        return None
    found = re.match(fmt, x)
    if found is None:
        return None
    return '-'.join(found.groups())

In [42]:
def build_data(titles, cur_dir=None, word=None):
    """Save the 2019 search results for one keyword as ``<word>.csv``.

    Args:
        titles: Iterable of ``(url, title)`` pairs.
        cur_dir: Directory to write the CSV into; created if missing.
            Defaults to the current working directory. (This parameter was
            previously accepted but ignored.)
        word: Keyword used as the CSV file stem. When omitted, the
            module-level ``word`` variable is used, which is what the
            function read implicitly before this parameter existed.

    Returns:
        A status message naming the file and the directory it was saved in.

    Raises:
        NameError: If ``word`` is not given and no global ``word`` exists.
    """
    if word is None:
        # Backward-compatible fallback for callers that rely on the global.
        try:
            word = globals()['word']
        except KeyError:
            raise NameError("name 'word' is not defined") from None

    out_dir = os.getcwd() if cur_dir is None else cur_dir
    os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(titles, columns=['url', 'title']).drop_duplicates()
    # The date is read from a ".../YYYY-MM/DD/..." fragment of each URL.
    df = df.assign(
        date=pd.to_datetime(
            df['url'].apply(extract_date, fmt=r'.*(\d{4}-\d{2})/(\d{2})')
        )
    )

    only_2019 = df[df.date.dt.year == 2019].sort_values(by='date', ascending=False)
    only_2019.to_csv(os.path.join(out_dir, '%s.csv' % word), index=False)
    return f'{word}.csv have been saved in {out_dir}'

## Настройка selenium.webdriver

In [9]:
from selenium import webdriver

# Location of the chromedriver binary. The previously hard-coded
# "/usr/loca/bin/chromedriver" looks like a typo for "/usr/local/..."; the path
# can now also be overridden through the CHROMEDRIVER_PATH environment variable.
CHROMEDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver")

# Headless, incognito Chrome session that ignores certificate errors.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# NOTE(review): the driver path is passed positionally, as in the original
# cell; newer Selenium releases may expect a Service object instead - confirm
# against the installed version.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)

## Получаем результаты поиска постранично

In [104]:
def get_search_result(word, page, size=50, date_from='2019-01-01',
                      date_to='2019-12-31', delay=10):
    """Load one page of southcn.com search results and return its HTML.

    The page is opened through the module-level selenium ``driver``.

    Args:
        word: Search keyword. It is percent-encoded (UTF-8) before being put
            into the URL, so keywords containing spaces, ``&`` or ``#`` no
            longer corrupt the query string.
        page: Value of the ``page`` query parameter.
        size: Results per page (default 50, the previously hard-coded value).
        date_from: Start of the date filter (default ``'2019-01-01'``).
        date_to: End of the date filter (default ``'2019-12-31'``).
        delay: Seconds to wait after navigation so the driver can receive
            the full page content (default 10).

    Returns:
        ``driver.page_source`` after the wait.
    """
    # Local import: the notebook only does `import urllib`, which does not by
    # itself guarantee that the `urllib.parse` submodule is loaded.
    from urllib.parse import quote

    keyword = quote(word, safe='')
    url = (
        'http://www.southcn.com/search/pc/advresult.html'
        f'?keyword={keyword}'
        f'&size={size}&o=asc&category=南方网pc端'
        f'&from={date_from}&to={date_to}&page={page}'
    )
    driver.get(url)
    time.sleep(delay)  # give the driver time to receive the whole page
    return driver.page_source

In [37]:
def extract_titles(source):
    """Parse a search-results page into result links and the "next" control.

    Args:
        source: HTML text of a results page.

    Returns:
        A 2-tuple ``(pairs, next_controls)``: ``pairs`` is a list of
        ``(href, link_text)`` tuples for every ``.result-box .result-title a``
        anchor, and ``next_controls`` is the list of elements matching
        ``.next`` (empty when the page has none).
    """
    parsed = soup(source, "html5lib")
    result_links = parsed.select('.result-box .result-title a')
    pairs = [(link['href'], link.get_text()) for link in result_links]
    next_controls = parsed.select('.next')
    return pairs, next_controls

## Перебор списка запросов и получение результатов

In [152]:
# Crawl up to 50 result pages per keyword and write one CSV per keyword.
titles = []
for word in words:
    # Start from an empty list for every keyword; previously the list was
    # shared, so each CSV also contained the results of all earlier keywords.
    titles = []
    # `tqdm_notebook` is the name this notebook imports; the former
    # `tqdm.notebook.tqdm` raised NameError because `tqdm` itself is not imported.
    for num_page in tqdm_notebook(range(1, 51)):
        source = get_search_result(word, num_page)
        # Extra pause on top of the wait get_search_result already performs.
        time.sleep(10)
        data, isnext = extract_titles(source)
        titles += data
        if not isnext:
            # No ".next" element on the page: this was the last page.
            break
        # Random pause between consecutive page requests.
        time.sleep(random.randint(1, 6))
    # build_data reads the keyword from the global `word` set by this loop.
    build_data(titles)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

## Экспорт данных в Excel

In [159]:
# All CSV files in the working directory, sorted so that the order of the
# Excel sheets is reproducible (glob itself returns files in arbitrary order).
csv_path = sorted(glob.glob('*.csv'))

In [160]:
# Name of the Excel workbook that collects every per-keyword CSV.
excel_path = "southcn.com.xlsx"

In [161]:
# Map each CSV's file stem (name without extension) to its loaded DataFrame.
df_dict = dict(
    (Path(csv_file).stem, pd.read_csv(csv_file))
    for csv_file in csv_path
)

In [162]:
# Write every DataFrame to its own sheet of the workbook. The context manager
# saves and closes the file on exit, so no explicit writer.save() is needed
# (that method is also absent from recent pandas versions).
with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
    for name, df in df_dict.items():
        # Excel limits sheet names to 31 characters.
        df.to_excel(writer, sheet_name=name[:31], index=False)