In [None]:
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

"""
気象庁から過去の気象データを CSV 形式でダウンロードする。
API が提供されていないので、ウェブページを参考にスクリプトを作成した。

http://www.data.jma.go.jp/gmd/risk/obsdl/index.php

"""

from datetime import date, timedelta
import urllib.request
import lxml.html
import pandas as pd
import codecs as cd
import csv
from io import StringIO
from tqdm import tqdm
import os.path

In [None]:
def encode_data(data):
    """Serialize form fields into a URL-encoded request body.

    Args:
        data: Mapping (or sequence of 2-tuples) of field names to values,
            as accepted by ``urllib.parse.urlencode``.

    Returns:
        bytes: The URL-encoded query string encoded as ASCII, suitable for
        the ``data`` argument of ``urllib.request.urlopen``.
    """
    # Imported explicitly: the notebook only imports ``urllib.request`` and the
    # previous code relied on that import exposing ``urllib.parse`` implicitly.
    from urllib.parse import urlencode

    return urlencode(data).encode(encoding='ascii')

def get_phpsessid():
    """Fetch the download index page and return the session id embedded in it.

    Returns:
        The ``value`` of the ``<input id="sid">`` element found on the page.

    Raises:
        IndexError: If the page contains no ``input#sid`` element.
    """
    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/index.php"
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(URL) as response:
        xml = response.read().decode("utf-8")
    tree = lxml.html.fromstring(xml)
    return tree.cssselect("input#sid")[0].value
    

In [None]:
# Returns the observation stations of a prefecture

def get_station(pd=0):
    """Return the observation stations listed for one prefecture.

    Args:
        pd: Prefecture id as a positive ``int``; it is posted to the server
            zero-padded to two digits.  The default of 0 is rejected by the
            assertion below, so a value must always be supplied.  Inside this
            function the name shadows the notebook-level ``pandas`` alias.

    Returns:
        dict: ``{station_name: {"id": station_id, "flags": flags}}`` where
        ``flags`` maps ``rain``, ``wind``, ``temp``, ``sun`` and ``snow`` to
        booleans.

    Raises:
        AssertionError: If ``pd`` is not a positive ``int``, or if a station's
            title name and its ``stname`` field disagree.
    """
    assert type(pd) is int and pd > 0

    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/top/station"
    data = encode_data({"pd": "%02d" % pd})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(URL, data=data) as response:
        xml = response.read().decode("utf-8")
    tree = lxml.html.fromstring(xml)

    def kansoku_items(bits):
        # Each of the first five characters is "1" when the flag is set.
        return dict(rain=(bits[0] == "1"),
                    wind=(bits[1] == "1"),
                    temp=(bits[2] == "1"),
                    sun =(bits[3] == "1"),
                    snow=(bits[4] == "1"))

    def parse_station(dom):
        # The title attribute holds "key:value" lines; full-width colons are
        # normalised first and lines that do not split into two parts are dropped.
        stitle = dom.get("title").replace("：", ":")
        fields = (line.split(":") for line in stitle.split("\n"))
        title = dict(pair for pair in fields if len(pair) == 2)

        name    = title["地点名"]
        stid    = dom.cssselect("input[name=stid]")[0].value
        stname  = dom.cssselect("input[name=stname]")[0].value
        kansoku = kansoku_items(dom.cssselect("input[name=kansoku]")[0].value)
        # Sanity check: the displayed name must match the form field.
        assert name == stname
        return (stname, dict(id=stid, flags=kansoku))

    stations = dict(map(parse_station, tree.cssselect("div.station")))

    return stations

In [None]:
# Prefecture dictionary
# order 0: id -> name; any other value: name -> id

def get_prefs(order, pd=0):
    """Return the prefecture dictionary scraped from the station page.

    Args:
        order: ``0`` returns ``{prefecture_id: name}``; any other value
            returns ``{name: prefecture_id}``.
        pd: Value posted as the ``pd`` form field, zero-padded to two digits
            (default 0).  Inside this function the name shadows the
            notebook-level ``pandas`` alias.

    Returns:
        dict: Mapping between integer prefecture ids and prefecture names, in
        the direction selected by ``order``.
    """
    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/top/station"
    data = encode_data({"pd": "%02d" % pd})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(URL, data=data) as response:
        xml = response.read().decode("utf-8")
    tree = lxml.html.fromstring(xml)

    def parse_prefs(dom):
        name = dom.text
        prid = int(dom.cssselect("input[name=prid]")[0].value)
        if order == 0:
            return (prid, name)
        else:
            return (name, prid)

    prefectures = dict(map(parse_prefs, tree.cssselect("div.prefecture")))

    return prefectures

In [None]:

def get_aggrgPeriods():
    """Scrape the available aggregation periods from the element page.

    Returns:
        dict: ``{label_text: (value, rng)}`` where ``value`` is the ``value``
        attribute of the period's ``<input>`` and ``rng`` is either ``None``
        or the list of integer option values of the accompanying ``<select>``.
    """
    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/top/element"
    # HTTP GET; the response is closed as soon as the body has been read.
    with urllib.request.urlopen(URL) as response:
        xml = response.read().decode("utf-8")
    tree = lxml.html.fromstring(xml)

    def parse_periods(dom):
        if dom.find("label") is not None:
            # Entry without a drop-down: <label><input/><span>text</span></label>
            val = dom.find("label/input").attrib["value"]
            key = dom.find("label/span").text
            rng = None
        else:
            # Entry with a drop-down: <input/> plus <span><label/><select/></span>
            val = dom.find("input").attrib["value"]
            key = dom.find("span/label").text
            # Iterating an element yields its children (replaces the
            # deprecated ``getchildren()``).
            rng = [int(option.get("value"))
                   for option in dom.find("span/select")]
        return (key, (val, rng))

    perdoms = list(tree.cssselect("#aggrgPeriod")[0].find("div/div"))
    periods = dict(map(parse_periods, perdoms))
    return periods

def get_elements(aggrgPeriods=9, isTypeNumber=1):
    """Scrape the selectable observation elements for an aggregation period.

    Args:
        aggrgPeriods: Value posted as the ``aggrgPeriod`` form field
            (default 9).
        isTypeNumber: Value posted as the ``isTypeNumber`` form field
            (default 1).

    Returns:
        dict: ``{checkbox_id: (value, options)}`` for every enabled element
        checkbox, where ``options`` is ``None`` or the list of integer option
        values of the ``<select>`` found inside the checkbox's next sibling.

    Side effects:
        Writes the raw response to ``tmp.html`` in the current working
        directory (kept from the original as a debugging aid).
    """
    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/top/element"
    data = encode_data({"aggrgPeriod": aggrgPeriods,
                        "isTypeNumber": isTypeNumber})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(URL, data=data) as response:
        xml = response.read().decode("utf-8")
    # An explicit encoding avoids UnicodeEncodeError on platforms whose default
    # encoding cannot represent Japanese text; ``with`` closes the handle.
    with open("tmp.html", "w", encoding="utf-8") as dump:
        dump.write(xml)
    tree = lxml.html.fromstring(xml)

    boxes = tree.cssselect("input[type=checkbox]")
    # The first four checkboxes are skipped (the original code bound them to an
    # unused ``options`` variable); the rest are the elements.
    items = boxes[4:]

    def parse_items(dom):
        if "disabled" in dom.attrib: return None
        if dom.name == "kijiFlag": return None
        name     = dom.attrib["id"]
        value    = dom.attrib["value"]
        options  = None
        # Guard against a checkbox that has no following sibling.
        sibling = dom.getnext()
        select = sibling.find("select") if sibling is not None else None
        if select is not None:
            # Iterating an element yields its children (replaces the
            # deprecated ``getchildren()``).
            options = [int(option.get("value")) for option in select]
        return (name, (value, options))

    # Drop the checkboxes that parse_items rejected (returned None).
    parsed = (parse_items(box) for box in items)
    return dict(entry for entry in parsed if entry)


def download_hourly_csv(phpsessid, station, element, begin_date, end_date):
    """Request hourly data for one station and element and return it as CSV text.

    Args:
        phpsessid: Session id sent as the ``PHPSESSID`` form field.
        station: Station id, embedded in ``stationNumList``.
        element: Element id, embedded in ``elementNumList``.
        begin_date: First day of the period; ``year``, ``month`` and ``day``
            attributes are read.
        end_date: Last day of the period; same attributes are read.

    Returns:
        str: The response body decoded as Shift-JIS.

    Raises:
        UnicodeDecodeError: If the response is not valid Shift-JIS.
    """
    params = {
        "PHPSESSID": phpsessid,
        # Common flags
        "rmkFlag": 1,        # include data that needs caution when used
        "disconnectFlag": 1, # include data regardless of observation-environment changes
        "csvFlag": 1,        # store everything as numeric values
        "ymdLiteral": 1,     # store dates as date literals
        "youbiFlag": 0,      # show the day of the week with the date
        "kijiFlag": 0,       # show the time at which the max/min values occurred
        # Hourly data selection
        "aggrgPeriod": 9,    # hourly values
        "stationNumList": '["%s"]' % station,      # list of station ids
        "elementNumList": '[["%s",""]]' % element, # list of element ids
        # Order is: begin year, end year, begin month, end month, begin day, end day.
        "ymdList": '["%d", "%d", "%d", "%d", "%d", "%d"]' % (
            begin_date.year,  end_date.year,
            begin_date.month, end_date.month,
            begin_date.day,   end_date.day),       # period to fetch
        "jikantaiFlag": 0,        # show only specific hours of the day
        "jikantaiList": '[1,24]', # default is all hours
        "interAnnualFlag": 1,     # display as one continuous period
        # Flags below: meaning unknown (copied from the web page's request)
        "optionNumList": [],
        "downloadFlag": "true",   # download as CSV?
        "huukouFlag": 0,
    }

    URL="http://www.data.jma.go.jp/gmd/risk/obsdl/show/table"
    data = encode_data(params)
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(URL, data=data) as response:
        csv_data = response.read().decode("shift-jis")
    return csv_data

In [None]:
# Returns a list of [start_date, end_date] pairs.
# Because one download holds at most about one year of hourly data, the period is divided into roughly yearly segments.

def get_dates(start_date,end_date):
    """Split a date range into consecutive segments of at most 366 days.

    Every segment except the last runs from its start day to 365 days later
    (366 calendar days inclusive); the next segment starts the day after.
    The last segment ends exactly on ``end_date``.

    Args:
        start_date: First day of the range (``datetime.date`` compatible).
        end_date: Last day of the range, inclusive.

    Returns:
        list: ``[[segment_start, segment_end], ...]`` in chronological order;
        always at least one segment.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.  Such input
            was previously accepted and produced meaningless segments.
    """
    if end_date < start_date:
        raise ValueError(
            "end_date (%s) must not be earlier than start_date (%s)"
            % (end_date, start_date))

    # Integer arithmetic instead of int(float division).
    num = (end_date - start_date).days // 366 + 1

    dates_l = []
    segment_start = start_date
    # All segments but the last are full-length.
    for _ in range(num - 1):
        segment_end = segment_start + timedelta(days = 365)
        dates_l.append([segment_start, segment_end])
        segment_start = segment_end + timedelta(days = 1)
    # The final (or only) segment takes whatever remains up to end_date.
    dates_l.append([segment_start, end_date])

    return dates_l
    


In [None]:
def get_temps(pref_no,start_date, end_date):
    """Download hourly temperatures for every station of one prefecture.

    Each station's CSV is fetched separately and outer-merged on the
    ``Date_Time`` index into one wide frame with one column per station name.
    A station whose download or parsing fails is reported and skipped.

    Args:
        pref_no: Prefecture id, passed to ``get_station``.
        start_date: First day of the period to download.
        end_date: Last day of the period to download.

    Returns:
        pandas.DataFrame: Indexed by ``Date_Time`` with one column per
        successfully downloaded station; columns that are entirely NaN are
        removed.

    Raises:
        RuntimeError: If no station could be downloaded at all (previously an
            ``UnboundLocalError`` surfaced in that case).
    """
    # Station metadata and the element id are the same for every station, so
    # they are looked up once instead of once per loop iteration.
    station_names = get_station(pref_no)
    element = get_elements(get_aggrgPeriods()["時別値"][0])["気温"][0]

    pdvf = None
    for sn in tqdm(station_names):
        try:
            station = station_names[sn]["id"]
            phpsessid = get_phpsessid()
            csv_file = download_hourly_csv(phpsessid, station, element,
                            start_date,end_date)
            # Drop the first five lines (the Japanese headers).
            csv_file = StringIO(csv_file.split("\n",5)[5])
            # x1/x2 are extra per-row columns that are discarded
            # (presumably quality flags -- TODO confirm).
            col_names = ['Date_Time',sn,'x1','x2']
            pdv = (pd.read_csv(csv_file, sep=',', header= None, names = col_names)
                     .set_index('Date_Time')
                     .drop(['x1','x2'], axis=1))

            if pdvf is None:
                pdvf = pdv
            else:
                pdvf = pdvf.merge(pdv,how = 'outer', on= 'Date_Time')
        except Exception as exc:
            # Best effort: keep going with the other stations, but say which
            # one failed.  KeyboardInterrupt/SystemExit still propagate.
            tqdm.write(f"Skipping station {sn}: {exc!r}")

    if pdvf is None:
        raise RuntimeError(
            f"No temperature data could be downloaded for prefecture {pref_no} "
            f"between {start_date} and {end_date}")

    # dropna returns a new frame; the result has to be kept to take effect.
    return pdvf.dropna(axis = 1, how = 'all')

In [None]:
# Prefecture ids whose temperature data will be downloaded.
# Call get_prefs(0) to see the id -> name mapping of the available prefectures.

co1 = [13, 14]

In [None]:
# Download, per prefecture in ``co1``, the hourly temperatures of the whole
# period in yearly segments and write them to one CSV file.

# The id -> name mapping is the same for every prefecture: fetch it once
# instead of once per loop iteration.
pref_names = get_prefs(0)

for prefs in co1:

    pref_no = prefs
    pref_name = pref_names[pref_no]
    start_date = date(2014, 1, 1)
    end_date = date(2019, 6, 1)
    my_dates = get_dates(start_date, end_date)
    no_dates = len(my_dates)

    # Collect one frame per segment and concatenate once at the end rather
    # than growing a DataFrame inside the loop.
    frames = []
    for seg_no, d in enumerate(my_dates, start=1):
        # The message printed for the first segment has no trailing space in
        # its label while later ones do; both literals are kept so the
        # printed progress lines are unchanged.
        label = 'Downloading temps for' if seg_no == 1 else 'Downloading temps for '
        print(seg_no,'/',no_dates,label,pref_name,d[0],d[1])
        frames.append(get_temps(pref_no,d[0], d[1]))

    # A single segment is used as-is (no concat, hence no column sorting).
    data_df = frames[0] if len(frames) == 1 else pd.concat(frames, sort = True)

    data_df = data_df.dropna(axis = 1, how = 'all')
    # NOTE(review): hard-coded, user-specific absolute path; consider moving it
    # to a configurable output directory.
    path = r"C:\Users\phil.richards\Documents\Temp_Data"
    # d is the last segment here, so d[1] equals end_date.
    filename = f'{pref_no}_{pref_name}_{d[1]}'
    dest = os.path.join(path,filename)
    data_df.to_csv(dest)