# 分析

[空气质量指数(http://www.tianqihoubao.com/aqi/)](http://www.tianqihoubao.com/aqi/)  

HTML基础  
超链接 `<a href='链接地址'>链接的文字</a>`  
在网页中按 F12（或右键选择“审查元素”）可以打开开发者工具（控制台），查看网页源代码。

爬虫项目整体代码：  
[高民权_中国城市空气质量数据抓取_Github](https://github.com/fortyMiles/ChineseAirConditionCrawler)

**处理城市编码**  
将网页中 `<div class="citychk">` 这一段的源代码复制下来，保存为 `citychk.txt`，再做进一步处理

In [34]:
# Load the saved <div class="citychk"> markup in one read; the context manager
# closes the file handle, which the bare open() call never did.
with open('./citychk.txt') as f:
    html = f.read()

In [36]:
import re

# Each match looks like: href="/aqi/<code>.html"><up to 5 characters><space>
# Raw string so that \w reaches the regex engine unchanged, and the dot before
# "html" is escaped so that it only matches a literal '.'.
m = re.findall(r'href="/aqi/\w*\.html">.{0,5} ', html)

# lambd = lambda x: (x.split('>')[1].strip(), x[11:x.index('.')])
# or use function
def generate_code(x):
    """Split one regex match into a ``(name, code)`` pair.

    ``x`` is expected to look like ``href="/aqi/<code>.html"><name> ``.
    The code is the text between the fixed-length prefix and the first '.',
    and the name is the stripped text that follows the first '>'.
    """
    prefix_len = len('href="/aqi/')  # 11 characters precede the city code
    dot_at = x.index('.')
    label = x.split('>')[1]
    return label.strip(), x[prefix_len:dot_at]
    
# Turn every regex match into a (name, code) pair.
city_coding = [generate_code(match) for match in m]

# Remove duplicate data; the two counts show how many entries were duplicates.
print(len(city_coding), len(set(city_coding)))
city_coding = set(city_coding)

# Save one "name<TAB>code" record per line.
# NOTE(review): get_city_coding() further down defaults to './city_coding'
# (no '.txt' suffix) — confirm which file name is intended.
with open('./city_coding.txt', 'w') as f:
    f.writelines('\t'.join(pair) + '\n' for pair in city_coding)
print('Saved!')

371 367
Saved!


Github中的`get_location_info.py`文件对应city_coding的生成

# 抓取

**首先安装包**  

``` bash
# beautifulsoup4 is the real package (bs4 on PyPI is only a shim for it);
# requests is needed for the HTTP calls used later in this notebook.
pip install beautifulsoup4 requests
```

读取city_coding

In [41]:
def get_city_coding(file='./city_coding'):
    """Load the city-name -> city-code mapping from a tab-separated file.

    Each useful line holds ``name<TAB>code``. Lines that do not split into
    exactly two tab-separated fields (blank lines, malformed records) are
    skipped.

    :param file: path of the mapping file to read
    :return: dict mapping the stripped city name to the stripped city code
    """
    city_coding = {}
    with open(file) as f:
        # Iterate the file directly instead of materialising readlines().
        for line in f:
            try:
                city, coding = line.strip().split('\t')
            except ValueError:
                # Raised by the unpacking when the line does not contain
                # exactly two fields; such lines are ignored.
                continue
            city_coding[city.strip()] = coding.strip()
    return city_coding

# Build the name -> code lookup once, reading the default './city_coding' file.
city_coding = get_city_coding()

拼接成自己想要的URL地址  

如果是当前月份可以看到直接使用城市名称即可，如 http://www.tianqihoubao.com/aqi/hangzhou.html  
如果查询的是历史月份，可以看到是这种格式 http://www.tianqihoubao.com/aqi/hangzhou-201702.html

In [45]:
def build_url(city_coding, year=None, month=None):
    """Return the tianqihoubao AQI page URL for a city.

    When both ``year`` and ``month`` are given, the URL of that month's
    history page (``<code>-<year><MM>.html``) is returned. If either one is
    missing, the city's current page (``<code>.html``) is returned instead.

    :param city_coding: the city's code as used in the site's URLs, e.g. 'hangzhou'
    :param year: integer year, e.g. 2018
    :param month: integer month, e.g. 5
    :return: the URL as a string
    """
    BASE = 'http://www.tianqihoubao.com/aqi/'

    if year is None or month is None:
        return BASE + '{}.html'.format(city_coding)

    # {:02d} zero-pads single-digit months (5 -> '05') without manual branching.
    return BASE + '{}-{}{:02d}.html'.format(city_coding, year, month)
    
# Look up the URL code for Hangzhou, then show both URL flavours:
# the current page and the page for a specific month.
hangzhou = city_coding['杭州']
current_url = build_url(hangzhou)
history_url = build_url(hangzhou, 2018, 5)
print(current_url)
print(history_url)

http://www.tianqihoubao.com/aqi/hangzhou.html
http://www.tianqihoubao.com/aqi/hangzhou-201805.html


使用python进行数据抓取

[HTTP请求状态](https://www.runoob.com/http/http-status-codes.html)  
了解200 404 503

首先，通过 F12 查看 hangzhou-201805.html 的请求，可以看到响应头 `Content-Type: text/html; charset=gb2312`，说明页面使用的是 GB2312 编码（GBK 是它的超集，因此可以按 GBK 解码）

然后进行HTML解析  
参考：[Beautiful Soup 4.2.0 文档](https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html)

In [62]:
import requests
from bs4 import BeautifulSoup

hangzhou = city_coding['杭州']
url = build_url(hangzhou, 2018, 5)

# Send the request
# HTTP methods: get / post

response = requests.get(url)

# Inspect the response object if needed
# help(response)

print(response.status_code, response.ok)

# Print the encoding of the returned result
print(response.encoding)
html = response.text
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results can vary by machine.
soup = BeautifulSoup(html, 'html.parser')

# A few attributes
# The page title
print(soup.title)
# The page text
# print(soup.text)

# Find elements
data_table = soup.find_all('table')
print(len(data_table))
# print(data_table)

# Since there is only one table,
# the shortcut below can be used
data_table = soup.table

# Next: a more detailed analysis of the data

200 True
gb2312
<title>
	2018年5月杭州空气质量指数查询(AQI)_5月份杭州PM2.5历史数据查询_天气后报
</title>
1


In [87]:
# Take a look at data_table if needed
# print(data_table)

# .contents returns all child elements of the object as a list.
# The first row is the table header,
# and every other child is a '\n' element.

name_index = 1
content = data_table.contents[name_index:]

# Keep every second child (the rows); label the header row 'city' and every
# data row with the city code.
result = [
    tuple([('city' if row_no == 0 else hangzhou)] + row.text.split())
    for row_no, row in enumerate(content[::2])
]

print(len(result), result)

32 [('city', '日期', '质量等级', 'AQI指数', '当天AQI排名', 'PM2.5', 'PM10', 'So2', 'No2', 'Co', 'O3'), ('hangzhou', '2018-05-01', '优', '39', '35', '21', '36', '6', '28', '0.82', '52'), ('hangzhou', '2018-05-02', '良', '57', '143', '35', '57', '6', '21', '0.85', '100'), ('hangzhou', '2018-05-03', '良', '59', '124', '18', '41', '7', '31', '0.61', '104'), ('hangzhou', '2018-05-04', '良', '78', '239', '33', '66', '10', '47', '0.71', '99'), ('hangzhou', '2018-05-05', '优', '48', '73', '29', '48', '7', '43', '0.76', '60'), ('hangzhou', '2018-05-06', '优', '50', '139', '30', '50', '7', '40', '0.91', '43'), ('hangzhou', '2018-05-07', '优', '42', '65', '25', '41', '6', '30', '0.81', '51'), ('hangzhou', '2018-05-08', '良', '51', '103', '24', '43', '7', '31', '0.91', '78'), ('hangzhou', '2018-05-09', '良', '83', '258', '31', '70', '12', '44', '0.76', '102'), ('hangzhou', '2018-05-10', '良', '67', '167', '29', '62', '9', '29', '0.65', '100'), ('hangzhou', '2018-05-11', '良', '73', '263', '36', '69', '10', '47', '0.77',

**完整代码（供参考）**  
[高民权_中国城市空气质量数据抓取_Github](https://github.com/fortyMiles/ChineseAirConditionCrawler)

In [1]:
import requests
from bs4 import BeautifulSoup


def get_city_coding():
    """Read './city_coding' and return a dict of city name -> city code.

    Every line is expected to be ``name<TAB>code``; lines that do not split
    into exactly two tab-separated fields are skipped.
    """
    mapping = {}
    with open('./city_coding', 'r') as handle:
        for raw in handle:
            fields = raw.strip().split('\t')
            if len(fields) != 2:
                continue
            name, code = fields
            mapping[name.strip()] = code.strip()

    return mapping


def build_url(city_coding, year=None, month=None):
    """Build the AQI page URL for one city.

    :param city_coding: the city's code as it appears in the site's URLs, e.g. 'hangzhou'
    :param year: integer year, e.g. 2016; optional
    :param month: integer month, e.g. 5; optional
    :return: the month-specific URL ``<code>-<year><MM>.html`` when both year
             and month are given, otherwise the city's ``<code>.html`` URL
    """
    BASE = 'http://www.tianqihoubao.com/aqi/'

    if year is not None and month is not None:
        # The :02d format spec zero-pads single-digit months (5 -> '05').
        return BASE + "{}-{}{:02d}.html".format(city_coding, year, month)

    return BASE + "{}.html".format(city_coding)


def get_from_http(city_coding, year=None, month=None):
    '''
    Fetch the air-quality rows of one page for a city.

    Builds the page URL with build_url() and passes it on to
    get_some_day_air_condition().

    :param city_coding: city code as used in the site's URLs, e.g hangzhou
    :param year: e.g 2016
    :param month: e.g 10
    :return: the result of get_some_day_air_condition(): the list of row
             tuples produced by parse() (header row first, each tuple
             starting with the city column), or None when the page could
             not be retrieved
    '''
    return get_some_day_air_condition(
        city_coding,
        build_url(city_coding, year, month),
    )


def get_some_day_air_condition(city_coding, url):
    """Download one AQI page and return its table rows.

    :param city_coding: city code that parse() puts in the first column of
                        every data row
    :param url: address of the page to download
    :return: the list built by parse(), or None when the response status is
             not 200, the page has no <table>, or any exception is raised
             while fetching or parsing
    """
    try:
        # Without a timeout requests can wait on an unresponsive server
        # forever; a timeout error is handled by the except clause below.
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            return None

        # Decode the body as GBK before reading r.text.
        r.encoding = 'GBK'
        soup = BeautifulSoup(r.text, 'html.parser')

        data_table = soup.table
        if data_table is None:
            # No table on the page: report "no data" directly rather than
            # letting parse() fail on None.
            return None

        return parse(city_coding, data_table)
    except Exception as e:
        # Best-effort fetch: report the problem and signal failure with None.
        print('connnect error')
        print(e)
        return None


def parse(city_coding, data):
    """Convert the children of a table element into a list of tuples.

    Only every second child of ``data.contents`` is used, starting with the
    child at index 1 (the others are skipped as separators). The first row
    kept is treated as the header and prefixed with 'city'; every later row
    is prefixed with ``city_coding``. The remaining fields of each tuple are
    the whitespace-separated pieces of the row's text.
    """
    table_rows = data.contents[1::2]
    return [
        tuple([('city' if position == 0 else city_coding)] + row.text.split())
        for position, row in enumerate(table_rows)
    ]


if __name__ == '__main__':
    # Self-test: checks the city lookup and URL building, then fetches real
    # pages (network access and the './city_coding' file are required).
    city_coding = get_city_coding()
    assert city_coding['杭州'] == 'hangzhou'

    hangzhou = city_coding['杭州']

    print('testing')

    assert build_url(hangzhou, 2016, 5) == "http://www.tianqihoubao.com/aqi/hangzhou-201605.html"
    assert build_url(hangzhou, 2016) == "http://www.tianqihoubao.com/aqi/hangzhou.html"
    assert build_url(hangzhou) == "http://www.tianqihoubao.com/aqi/hangzhou.html"

    # Fetch the page once and check the stored result, instead of requesting
    # the same URL twice (once of them with a misspelled city code).
    data = get_some_day_air_condition("hangzhou", "http://www.tianqihoubao.com/aqi/hangzhou-201605.html")
    assert data is not None
    #print(data)

    city_data = get_from_http('hangzhou', 2015, 10)
    print(city_data)

    print('test done')

testing
[('city', '日期', '质量等级', 'AQI指数', '当天AQI排名', 'PM2.5', 'PM10', 'So2', 'No2', 'Co', 'O3'), ('hangzhou', '2015-10-01', '良', '53', '166', '31', '61', '9', '22', '0.75', '61'), ('hangzhou', '2015-10-02', '良', '70', '199', '32', '82', '13', '34', '0.65', '81'), ('hangzhou', '2015-10-03', '良', '70', '190', '41', '87', '14', '47', '0.80', '70'), ('hangzhou', '2015-10-04', '良', '78', '231', '56', '96', '15', '47', '0.90', '49'), ('hangzhou', '2015-10-05', '优', '43', '95', '29', '40', '10', '33', '0.74', '56'), ('hangzhou', '2015-10-06', '优', '49', '120', '33', '47', '14', '41', '0.81', '30'), ('hangzhou', '2015-10-07', '优', '39', '76', '26', '38', '18', '46', '0.78', '28'), ('hangzhou', '2015-10-08', '优', '35', '51', '23', '33', '11', '36', '0.74', '36'), ('hangzhou', '2015-10-09', '良', '54', '165', '37', '56', '10', '36', '0.74', '60'), ('hangzhou', '2015-10-10', '良', '66', '244', '45', '72', '13', '38', '0.75', '55'), ('hangzhou', '2015-10-11', '良', '78', '292', '52', '96', '22', '50',

**然后使用pycharm进行项目架构介绍**