In [1]:
import requests
from pathlib import Path
import xmltodict
import os
import pandas as pd
import io
import datetime
import pymongo
import json
import gzip
import dns
from os import listdir
from os.path import isfile, join

urlBase='https://tisvcloud.freeway.gov.tw/history/motc20/VD/'  # day/file is appended, e.g. 20200124/VDLive_0000.xml.gz   20200124/VDLive_0014.xml.gz
# Local root directory; one sub-directory per day is created beneath it.
baseDir='../../../data/vd2.0/'
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client.traffic
# NOTE(review): trType is not referenced by any cell visible in this notebook.
trType='vd20'
# Target collection for parsed VD records.
coll=db['vd20']
# Collection that receives one document per ingestion failure.
errorLog=db['errorLog']
# Create a compound unique index so the same (VDID, DataCollectTime) document cannot be
# inserted twice, and to speed up queries. Has no effect if the same index already exists.
coll.create_index([("VDID",1),("DataCollectTime",1)],unique=True)


'VDID_1_DataCollectTime_1'

In [2]:
# Check whether a URL can actually be downloaded. A non-existent day such as 20190230
# cannot be, and only non-text (attachment-like) responses are accepted.
def isDownloadable(url, timeout=30):
    """
    Does the url contain a downloadable resource

    Sends a HEAD request (following redirects) and inspects the response.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one stalled
        request cannot hang a whole day's download loop.

    Returns
    -------
    bool
        False when the final response has an error status (>= 400), has no
        Content-Type header, or its Content-Type mentions 'text' or 'html';
        True otherwise.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged on connection failures or timeouts.
    """
    h = requests.head(url, allow_redirects=True, timeout=timeout)
    # An error status is never downloadable, whatever Content-Type the error page carries.
    if not h.ok:
        return False
    content_type = h.headers.get('content-type')
    if content_type is None:
        return False
    content_type = content_type.lower()
    return 'text' not in content_type and 'html' not in content_type

In [3]:
# Download the file behind a URL and store it in the given directory.
def downloadFileFromUrl(url, directory, timeout=60):
    """
    Save the resource at ``url`` into ``directory`` unless it is already there.

    The local file name is the part of ``url`` after its last '/'.
    An existing file with that name is left untouched and no request is made.

    Parameters
    ----------
    url : str
        Address of the file to fetch; must contain at least one '/'.
    directory : str
        Existing directory that receives the file (with or without a trailing '/').
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status. Nothing is written in that
        case, so an error page is never cached as a corrupt archive that the
        existence check would then refuse to re-download.
    requests.exceptions.RequestException
        On connection failures or timeouts.
    """
    filename = os.path.join(directory, url.rsplit('/', 1)[1])
    if os.path.exists(filename):
        return
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    r.raise_for_status()
    # Context manager closes the handle deterministically instead of relying on GC.
    with open(filename, 'wb') as out:
        out.write(r.content)

In [4]:
# Fetch all per-minute VD 2.0 files of one day and store them in that day's directory.
def downloadVD20ByDay(day, baseDir):
    """
    Download every available VDLive_HHMM.xml.gz file for ``day``.

    Candidate URLs are built from the module-level ``urlBase`` for each minute
    of the day (00:00 through 23:59). The directory ``baseDir + day`` is created
    if needed; each URL that ``isDownloadable`` accepts is handed to
    ``downloadFileFromUrl`` and then printed.

    Parameters
    ----------
    day : str
        Day folder name appended to ``urlBase`` (e.g. '20200124').
    baseDir : str
        Local root directory; ``day`` is appended to it directly.
    """
    # VD v2.0 publishes one file per minute.
    downloads = [
        urlBase + day + '/VDLive_' + format(hour, '02d') + format(minute, '02d') + '.xml.gz'
        for hour in range(24)
        for minute in range(60)
    ]
    targetDir = baseDir + day
    Path(targetDir).mkdir(parents=True, exist_ok=True)
    for url in downloads:
        if not isDownloadable(url):
            continue
        downloadFileFromUrl(url, targetDir)
        print(url)

In [18]:
def file2Mongo(i,directory,file):
    """
    Parse one gzipped VD 2.0 XML file and insert its records into MongoDB.

    The XML is expected to have the shape VDLiveList/VDLives/VDLive, where each
    VDLive carries a ``DataCollectTime`` string ("%Y-%m-%dT%H:%M:%S%z") and a
    ``LinkFlows/LinkFlow`` element holding ``LinkID`` and ``Lanes/Lane``.
    ``DataCollectTime`` is converted to a timezone-aware datetime, and
    ``LinkID`` / ``Lanes`` are lifted to top-level fields before the records are
    written to the module-level ``coll`` collection.

    Failures are recorded in the module-level ``errorLog`` collection and
    printed; insert errors are not re-raised.

    Parameters
    ----------
    i : int
        Progress counter; only printed.
    directory : str
        Directory that contains ``file`` (with or without a trailing separator).
    file : str
        Name of the ``.xml.gz`` file inside ``directory``.
    """
    print(i,directory,file)
    # The context manager closes the gzip handle even when parsing raises.
    with gzip.open(os.path.join(directory, file), 'rb') as f:
        doc = xmltodict.parse(f)

    # An empty <VDLives/> element is parsed as None; treat it like a missing VDLive.
    vdLives = doc['VDLiveList']['VDLives']
    Y = vdLives.get('VDLive') if vdLives is not None else None
    if Y is None:
        errorLog.insert_one({"process":"VD2.0_ByDayDL2Mong", "type":"None Value", "message":Y, "data":file, "source":directory, "time":datetime.datetime.now()})
        print('TypeError: ',Y)
        return

    # xmltodict yields a single mapping (not a list) when there is exactly one <VDLive>.
    if not isinstance(Y, list):
        Y = [Y]

    # JSON round-trip turns xmltodict's mapping objects into plain dict/list values.
    cc = json.loads(json.dumps(Y))
    tmpDF=pd.DataFrame(cc)
    tmpDF['DataCollectTime']=tmpDF['DataCollectTime'].apply(lambda x:datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S%z"))

    # NOTE(review): assumes each VDLive has exactly one LinkFlow; if a record carried
    # several, "LinkFlow" would be a list and these lookups would fail - confirm.
    tmpDF["LinkID"]=tmpDF["LinkFlows"].apply(lambda x:x["LinkFlow"]["LinkID"])
    tmpDF["Lanes"]=tmpDF["LinkFlows"].apply(lambda x:x["LinkFlow"]["Lanes"]["Lane"])

    records = tmpDF.to_dict('records')
    try:
        # ordered=False: a duplicate (VDID, DataCollectTime) must not stop the
        # remaining, still-new records of the batch from being inserted.
        coll.insert_many(records, ordered=False)
    except Exception as err:
        # Store the real error text (truncated: an unordered bulk error can list
        # every rejected document and would bloat the log entry).
        message = str(err)[:1000]
        errorLog.insert_one({"process":"VD2.0_ByDayDL2Mong", "type":"insert_many" ,"message": message, "data":file, "source":directory, "time":datetime.datetime.now()})
        print ("collect.insert_many ERROR:", message)
    # TODO: left open in the original notebook (no description given).

In [6]:
def insertFiles2Mongo(files, directory):
    """
    Load each named file from ``directory`` into MongoDB via ``file2Mongo``.

    Files are processed in the given order; the 1-based position of each file
    is passed along as its progress counter.

    Parameters
    ----------
    files : iterable of str
        File names located in ``directory``.
    directory : str
        Directory prefix handed unchanged to ``file2Mongo``.
    """
    for position, name in enumerate(files, start=1):
        file2Mongo(position, directory, name)

In [7]:
from urllib.parse import urlparse
def downnload1HourFiles2Mongo(day, hour, baseDir):
    """
    Download one hour of per-minute VD files and ingest each right after download.

    For every minute 00-59 of ``hour`` a URL is built from the module-level
    ``urlBase``. Each URL that ``isDownloadable`` accepts is saved into
    ``baseDir + day + '/'`` (created if missing) and then passed, as a
    one-element list, to ``insertFiles2Mongo``.

    Parameters
    ----------
    day : str
        Day folder name (e.g. '20200124').
    hour : int
        Hour of the day, formatted as two digits in the file name.
    baseDir : str
        Local root directory; ``day`` is appended to it directly.
    """
    targetDir = baseDir + day + '/'
    Path(targetDir).mkdir(parents=True, exist_ok=True)

    # VD data arrives once per minute: VDLive_<HH><MM>.xml.gz
    hourPrefix = urlBase + day + '/VDLive_' + format(hour, '02d')
    downloads = [hourPrefix + format(minute, '02d') + '.xml.gz' for minute in range(60)]

    for url in downloads:
        if not isDownloadable(url):
            continue
        downloadFileFromUrl(url, targetDir)
        fileName = os.path.basename(urlparse(url).path)
        insertFiles2Mongo([fileName], targetDir)
    

In [8]:
          
def getAllFiles2Mongo(day, baseDir):
    """
    Ingest every regular file found in the day's directory.

    Lists ``baseDir + day + '/'``, keeps the entries that are regular files
    (sub-directories are skipped), and hands them to ``insertFiles2Mongo`` in
    the order the directory listing returned them.

    Parameters
    ----------
    day : str
        Day folder name (e.g. '20200124').
    baseDir : str
        Local root directory; ``day`` is appended to it directly.
    """
    directory = baseDir + day + '/'
    files = []
    for entry in listdir(directory):
        if isfile(join(directory, entry)):
            files.append(entry)
    insertFiles2Mongo(files, directory)

In [9]:
def downloadV20Insert(day: str, baseDir: str) -> None:
    """
    Download one day of VD 2.0 files, then load that day's directory into MongoDB.

    Runs ``downloadVD20ByDay`` followed by ``getAllFiles2Mongo`` with the same
    arguments, so every file present in the day's directory is ingested, not
    only the ones fetched by this call.

    Parameters
    ----------
    day : str
        Day folder name (e.g. '20200124').
    baseDir : str
        Local root directory under which the day's directory lives.
    """
    downloadVD20ByDay(day, baseDir)
    getAllFiles2Mongo(day, baseDir)


In [10]:
# Download all available per-minute files for 2020-01-24, then ingest the whole day directory.
downloadV20Insert('20200124',baseDir)

1 ../../../data/vd2.0/20200124/ VDLive_1910.xml.gz
2 ../../../data/vd2.0/20200124/ VDLive_0032.xml.gz
3 ../../../data/vd2.0/20200124/ VDLive_0529.xml.gz
4 ../../../data/vd2.0/20200124/ VDLive_0116.xml.gz
5 ../../../data/vd2.0/20200124/ VDLive_0445.xml.gz
6 ../../../data/vd2.0/20200124/ VDLive_0321.xml.gz
7 ../../../data/vd2.0/20200124/ VDLive_0307.xml.gz
8 ../../../data/vd2.0/20200124/ VDLive_0333.xml.gz
9 ../../../data/vd2.0/20200124/ VDLive_1844.xml.gz
10 ../../../data/vd2.0/20200124/ VDLive_0256.xml.gz
11 ../../../data/vd2.0/20200124/ VDLive_1344.xml.gz
12 ../../../data/vd2.0/20200124/ VDLive_1001.xml.gz
13 ../../../data/vd2.0/20200124/ VDLive_1212.xml.gz
14 ../../../data/vd2.0/20200124/ VDLive_0554.xml.gz
15 ../../../data/vd2.0/20200124/ VDLive_1128.xml.gz
16 ../../../data/vd2.0/20200124/ VDLive_1744.xml.gz
17 ../../../data/vd2.0/20200124/ VDLive_0337.xml.gz
18 ../../../data/vd2.0/20200124/ VDLive_0107.xml.gz
19 ../../../data/vd2.0/20200124/ VDLive_2213.xml.gz
20 ../../../data/vd2.

In [78]:
# Download and ingest only the files for hour 22 of 2020-01-24.
# Each file is passed on its own, which is why the printed counter is always 1.
downnload1HourFiles2Mongo('20200124', 22, baseDir)

1 ../../../data/vd2.0/20200124/ VDLive_2200.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2201.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2202.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2203.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2204.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2205.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2206.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2207.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2208.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2209.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2210.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2211.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2212.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2213.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2214.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2215.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2216.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2217.xml.gz
1 ../../../data/vd2.0/20200124/ VDLive_2218.xml.gz
1 ../../../data/vd2.0/20200124/

In [17]:
# Re-ingest a single file by hand (it appeared as entry 1054 in an earlier run):
#1054 ../../../data/vd2.0/20200124/ VDLive_2212.xml.gz
# NOTE(review): the insert_many error shown below is presumably the unique
# (VDID, DataCollectTime) index rejecting already-stored records - confirm.
file2Mongo(0,"../../../data/vd2.0/20200124/","VDLive_2212.xml.gz")


0 ../../../data/vd2.0/20200124/ VDLive_2212.xml.gz
collect.insert_many ERROR: batch op errors occurred
