In [0]:
!pip install beautifulsoup4



In [0]:
import urllib.request
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup,Comment
import re

In [0]:
# Download the index page that the next cell parses for school names and links.
url = "http://www.hanban.org/hanbancn/template/ciotab_cn1.htm?v1"
# The context manager closes the HTTP response even if read()/decode() raises.
with urllib.request.urlopen(url) as response:
  # Decoded explicitly as UTF-8 rather than trusting the charset in the response headers.
  webContent = response.read().decode("utf-8")
print(len(webContent))

233918


In [0]:
# info_dict maps each school name to its metadata:
#   {name: {'type', 'country', 'status', 'url'}}
# Later cells add 'city' and 'date' to every entry.
info_dict = {}


def _iter_links(node):
  """Yield (name, url) for every <a> under `node` that has non-blank text.

  Both values are whitespace-stripped (hrefs on the page can carry stray
  spaces). A missing or blank href is reported as the placeholder "NaN".
  """
  for link in node.find_all("a"):
    # .string is None unless the anchor wraps exactly one piece of text.
    link_name = (link.string or "").strip()
    if not link_name:
      continue
    link_url = (link.get("href") or "").strip() or "NaN"
    yield link_name, link_url


def _collect_schools(box, school_type, country, target):
  """Record every school linked inside `box` into the dict `target`.

  Links that sit inside HTML comments are stored with status 'hide', links in
  the live markup with status 'show'. Live links are written last, so a name
  that appears in both places ends up as 'show'.
  Does nothing when `box` is None (the country tab has no such section).
  """
  if box is None:
    return

  # Comment text is not part of the parsed tree, so each comment is re-parsed
  # on its own to reach the anchors inside it.
  # NOTE(review): no parser is named here, so bs4 picks whichever one is
  # installed; results may differ between environments.
  for hidden in box.find_all(string=lambda text: isinstance(text, Comment)):
    for link_name, link_url in _iter_links(BeautifulSoup(hidden)):
      target[link_name] = {'type': school_type, 'country': country, 'status': 'hide', 'url': link_url}

  for link_name, link_url in _iter_links(box):
    target[link_name] = {'type': school_type, 'country': country, 'status': 'show', 'url': link_url}


tabContent = BeautifulSoup(webContent).find("div", class_="tabContent")
continents = tabContent.find_all("div", class_="tcon")

for con in continents:
  # Box holding the country names of this continent
  nations = con.find("div", class_=re.compile(r"nation\d*"))

  # Remove commented-out countries
  for comment_country in nations(string=lambda text: isinstance(text, Comment)):
    comment_country.extract()
  # Country list of this continent
  counties = [c.string for c in nations.find_all("a")]

  # Box holding the schools of this continent
  schools = con.find("div", class_=re.compile(r"tcon_nationBox\d*"))
  # One tab per country; find_all() never descends into comments, so
  # commented-out tabs need no extra filtering here.
  schools_nation = schools.find_all("div", class_="tcon_nation")

  # Countries and school tabs are paired by position, so the counts must agree
  if len(schools_nation) != len(counties):
    print("ERROR: schools tab no match the country.")
    break

  # Harvest both school kinds for every country
  for country, nation_tab in zip(counties, schools_nation):
    _collect_schools(nation_tab.find("div", class_="KY"), "孔子学院", country, info_dict)
    _collect_schools(nation_tab.find("div", class_="coures"), "孔子课堂", country, info_dict)

print(len(info_dict))

1356


In [0]:
# Spot-check two entries gathered from the index page.
for school_name in ("伊利诺伊大学香槟分校孔子学院", "北佛罗里达大学孔子学院"):
  print(info_dict[school_name])

{'type': '孔子学院', 'country': '美国', 'status': 'hide', 'url': 'http://www.hanban.org/confuciousinstitutes/node_40583.htm'}
{'type': '孔子学院', 'country': '美国', 'status': 'hide', 'url': 'http://www.hanban.org/confuciousinstitutes/node_45557.htm '}


In [0]:
# Holds the crawled content of every sub-page, keyed by school name.
# The crawl cell skips names already present here, so re-running this cell
# discards everything fetched so far.
subsite_dict = dict()

In [0]:
def is_url(string_url):
  """Return every http(s) URL found in `string_url`.

  The result is a list of matched substrings, so it is empty (falsy) when the
  text contains no URL, e.g. for the "NaN" placeholder.
  The pattern is a raw string so that `\\(` and `\\)` reach the regex engine
  without triggering Python's invalid-escape-sequence warning.
  """
  return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', string_url)

# Fetch every school's detail page into subsite_dict.
# The cell is resumable: names that already have content are skipped, so it can
# simply be re-run after an interruption.
for idx, name in enumerate(info_dict.keys()):
  # hrefs scraped from the index page can carry stray whitespace
  page_url = info_dict[name]['url'].strip()
  # Skip placeholders such as "NaN" and pages that were already fetched;
  # entries left empty by an earlier failure are tried again.
  if not is_url(page_url) or subsite_dict.get(name):
    continue

  print(idx, "\tHandling:", name, "\turl:", page_url)
  try:
    # The timeout keeps one dead host from hanging the crawl; the context
    # manager closes the connection.
    with urllib.request.urlopen(page_url, timeout=30) as response:
      subsite_dict[name] = response.read().decode("utf-8")
  except (OSError, UnicodeDecodeError) as err:
    # URLError/HTTPError/timeouts are OSError subclasses. Store an empty page
    # so one bad link neither aborts the crawl nor leaves the name missing.
    print(idx, "\tFailed:", name, "\terror:", err)
    subsite_dict[name] = ""

In [0]:
# Extract "city" and "founding date" for every school from its crawled detail page.
for idx, name in enumerate(info_dict.keys()):

  print(idx, "\tHandling:", name)

  # Default to "NaN" so every entry has both keys
  info_dict[name]['city'] = "NaN"
  info_dict[name]['date'] = "NaN"

  # .get(): names whose url was not crawlable have no subsite_dict entry at all
  page_html = subsite_dict.get(name)
  if not page_html:
    print("Skip1", name)
    continue

  # Build the parser
  bs = BeautifulSoup(page_html)

  # The details come in two layouts inside the main box: <table> or <p>.
  # Look for both inside that box only, so a table elsewhere on the page
  # cannot hide the <p> layout.
  main_box = bs.find("div", class_="main_leftCon")
  all_info = (main_box.find_all("table") or main_box.find_all("p")) if main_box else []

  # Skip pages without the target content
  if not all_info:
    print("Skip2", name)
    continue

  # Parse entry by entry
  for line in all_info:
    info = list(line.stripped_strings)
    if not info:
      continue

    # The city label must be followed by a value
    if "城市" in info[0] and len(info) >= 2:
      info_dict[name]['city'] = info[1]

    if "时间" in info[0]:
      # Match dates such as ****年**月**日 (also -, /, . separators)
      date_string = re.findall(r'\d{4}[-/.|年]\d{1,2}[-\/.|月]\d{1,2}[-/.|日]*', info[-1])
      if date_string:
        # The match always holds exactly three digit groups; zero-pad month
        # and day so the result is uniformly ****-**-**
        year, month, day = re.findall(r'\d+', date_string[0])
        info_dict[name]['date'] = '-'.join([year, month.zfill(2), day.zfill(2)])

In [0]:
# Spot-check a few entries now that 'city' and 'date' have been filled in.
for school_name in (
    "伊利诺伊大学香槟分校孔子学院",
    "北佛罗里达大学孔子学院",
    "南太平洋大学孔子学院",
    "斯科奇•欧克伯恩学院孔子课堂",
):
  print(info_dict[school_name])

{'type': '孔子学院', 'country': '美国', 'status': 'hide', 'url': 'http://www.hanban.org/confuciousinstitutes/node_40583.htm', 'city': 'NaN', 'date': 'NaN'}
{'type': '孔子学院', 'country': '美国', 'status': 'hide', 'url': 'http://www.hanban.org/confuciousinstitutes/node_45557.htm ', 'city': 'NaN', 'date': 'NaN'}
{'type': '孔子学院', 'country': '斐济', 'status': 'show', 'url': 'http://www.hanban.org/confuciousinstitutes/node_38667.htm', 'city': '苏瓦', 'date': '2011-02-18'}
{'type': '孔子课堂', 'country': '澳大利亚', 'status': 'show', 'url': 'http://zhuanti.hanban.org/videolist/?cat=98&tag=cn', 'city': '朗赛斯顿', 'date': '2015-09-15'}


In [0]:
# Save the information to file: one row per school name, one column per field.
df = pd.DataFrame.from_dict(info_dict, orient='index')
# to_excel() no longer accepts `encoding` (removed in pandas 2.0, where passing
# it raises TypeError); the xlsx writers handle Unicode natively.
df.to_excel("./hanban.xlsx")