In [2]:
import requests
from pathlib import Path
import os
import tarfile
import pandas as pd
import io
import datetime
# Root URL of the TDCS history archive; per-type sub-paths are appended to it.
urlBase = 'http://tisvcloud.freeway.gov.tw/history/TDCS'
# Local data root. The functions below concatenate the traffic type directly
# onto this string, so it must keep its trailing '/'.
baseDir = '../../../data/'

In [2]:
# Check whether a url can be downloaded; e.g. 20190230 does not exist and cannot be downloaded.
def isDownloadable(url, timeout=30):
    """
    Report whether `url` appears to point at a downloadable (non-text) resource.

    A HEAD request is sent (redirects are followed) and the response status
    and Content-Type header are inspected; the body is never fetched.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float or tuple, optional
        Forwarded to ``requests.head`` so an unresponsive server cannot
        block the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    bool
        False when the server answers with an error status, when there is
        no Content-Type header, or when the Content-Type contains 'text'
        or 'html' (case-insensitive). True otherwise.

    Raises
    ------
    requests.exceptions.RequestException
        Whatever ``requests.head`` raises (connection failure, timeout, ...)
        is propagated unchanged.
    """
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    # An error response (4xx/5xx) is never a downloadable file, whatever
    # Content-Type the error page happens to carry.
    if not response.ok:
        return False
    content_type = response.headers.get('content-type')
    if content_type is None:
        return False
    content_type = content_type.lower()
    return 'text' not in content_type and 'html' not in content_type

In [3]:
# Download the file at a url and store it in the given directory.
def downloadFileFromUrl(url, directory, timeout=60):
    """
    Download `url` into `directory`, keeping the last path segment of the
    url as the file name.

    Nothing is downloaded when the target file already exists, so a file
    on disk acts as a cache.

    Parameters
    ----------
    url : str
        Address of the file; must contain at least one '/'.
    directory : str
        Existing directory that receives the file.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Defaults to 60 seconds.

    Raises
    ------
    requests.exceptions.HTTPError
        When the server answers with an error status. No file is written
        in that case, so an error page is never cached as if it were data.
    requests.exceptions.RequestException
        Other failures raised by ``requests.get`` are propagated unchanged.
    """
    filename = directory + '/' + url.rsplit('/', 1)[1]
    if os.path.exists(filename):
        return

    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Without this check a 404 page would be saved under the archive name
    # and then be mistaken for a valid cached download on every later run.
    response.raise_for_status()

    # Write to a side file and rename afterwards, so an interrupted write
    # never leaves a truncated file under the final (cached) name.
    partial = filename + '.part'
    with open(partial, 'wb') as out:
        out.write(response.content)
    os.replace(partial, filename)

In [4]:
def downloadTDCSbyDay(trType, day, baseDir):
    """
    Download one day's archive of the given traffic type.

    The archive url is built from the module-level `urlBase`, and the file
    is stored in the directory `baseDir + trType`, which is created when
    missing. The directory path is printed before downloading.

    Parameters
    ----------
    trType : str
        Traffic data type, used both as url path segment and folder name.
    day : str
        Day in the form '20190630'.
    baseDir : str
        Local data root; `trType` is appended to it without a separator.
    """
    archiveName = trType + '_' + day + '.tar.gz'
    url = '/'.join([urlBase, trType, archiveName])

    directory = baseDir + trType
    Path(directory).mkdir(parents=True, exist_ok=True)
    print(directory)
    downloadFileFromUrl(url, directory)
    
    

In [2]:
import pymongo
import dns
# Connect to the MongoDB server on localhost and select the `traffic` database.
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["traffic"]

# Create a compound unique index per collection to prevent inserting the same
# document twice and to speed up queries. Creating an index that already
# exists with the same definition has no effect.
_uniqueKeys = {
    'M03A': ("TimeInterval", "GantryId", "Direction", "VehicleType"),
    'M04A': ("TimeInterval", "GantryFrom", "GantryTo", "VehicleType"),
    'M08A': ("TimeInterval", "GantryFrom", "GantryTo", "VehicleType"),
}
for _collectionName, _fields in _uniqueKeys.items():
    db[_collectionName].create_index([(_field, 1) for _field in _fields], unique=True)

def dataframe2Mongo(select_df, trfType):
    """
    Insert every row of `select_df` as one document into collection
    `trfType` of the module-level `db`.

    Rows that collide with a unique index are skipped instead of aborting
    the whole batch, so loading the same day a second time is harmless.

    Parameters
    ----------
    select_df : pandas.DataFrame
        Rows to store; column names become document field names.
    trfType : str
        Name of the target collection.

    Raises
    ------
    pymongo.errors.BulkWriteError
        Re-raised when the bulk insert failed for any reason other than
        duplicate keys.
    """
    records = select_df.to_dict('records')  # 'records' -> one dict per row
    if not records:
        # insert_many refuses an empty document list; nothing to store.
        return

    collection = db[trfType]
    try:
        # ordered=False lets the server continue past a duplicate instead of
        # stopping at the first one and dropping the rest of the batch.
        collection.insert_many(records, ordered=False)
    except pymongo.errors.BulkWriteError as exc:
        details = exc.details or {}
        failures = details.get('writeErrors', [])
        # 11000 is the server's duplicate-key error code.
        unexpected = [err for err in failures if err.get('code') != 11000]
        if not failures or unexpected or details.get('writeConcernErrors'):
            raise
        print('{0}: skipped {1} duplicate record(s)'.format(trfType, len(failures)))


In [9]:
def insertMongo(trType, day, baseDir):
    """
    Read one downloaded day archive, combine its CSV members into a single
    DataFrame, store the rows through `dataframe2Mongo`, and return the frame.

    The archive is expected at
    ``baseDir + trType + '/' + trType + '_' + day + '.tar.gz'``; that path
    is printed before reading. Every regular member of the archive is
    parsed as a header-less, comma-separated, UTF-8 CSV file.

    Parameters
    ----------
    trType : str
        One of 'M03A', 'M04A', 'M06A', 'M08A'; selects the column names.
    day : str
        Day in the form '20190630'.
    baseDir : str
        Local data root; `trType` is appended to it without a separator.

    Returns
    -------
    pandas.DataFrame
        All rows of the archive with named columns. When a 'TimeInterval'
        column exists it is parsed with the format "%Y-%m-%d %H:%M".
        An archive without data yields an empty frame and nothing is stored.

    Raises
    ------
    KeyError
        When `trType` is not a supported type (raised before any file I/O).
    """
    colNames={
            'M03A': ['TimeInterval','GantryId', 'Direction', 'VehicleType', '交通量'],
            'M04A': ['TimeInterval','GantryFrom','GantryTo', 'VehicleType', 'TravelTime', '交通量'],
            'M06A': ['VehicleType','DetectionTime_O', 'GantryID_O', 'DetectionTime_D', 'GantryID_D', 'TripLength', 'TripEnd', 'TripInformation'],
            'M08A': ['TimeInterval','GantryFrom','GantryTo', 'VehicleType', '交通量']
        }
    # Look the type up first so an unsupported type fails before the whole
    # archive has been read.
    columns = colNames[trType]

    tarGzFile = baseDir + trType + '/' + trType + '_' + day + '.tar.gz'
    print(tarGzFile)

    # Collect the per-file frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied all rows on every iteration.
    frames = []
    with tarfile.open(tarGzFile, "r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is None:
                continue  # not a regular file (e.g. a directory entry)
            content = f.read()
            if not content.strip():
                continue  # empty CSV member; read_csv would raise on it
            frames.append(
                pd.read_csv(io.StringIO(content.decode('utf-8')), header=None, sep=',')
            )

    if frames:
        result = pd.concat(frames, ignore_index=True)
        result.columns = columns
    else:
        result = pd.DataFrame(columns=columns)

    # M06A has no TimeInterval column, so convert only when it is present.
    if 'TimeInterval' in result.columns:
        result['TimeInterval'] = pd.to_datetime(result['TimeInterval'], format="%Y-%m-%d %H:%M")

    if not result.empty:
        dataframe2Mongo(result, trType)
    return result

In [10]:
def downloadInsert(trType, day, baseDir):
    """
    Download one day's archive for `trType`, then load it into MongoDB.

    Both steps receive the same three arguments; `day` is a string such
    as '20190404'. Nothing is returned.
    """
    job = (trType, day, baseDir)
    downloadTDCSbyDay(*job)
    insertMongo(*job)

In [12]:
# Day lists (MMDD, year 2019) kept from other runs of this cell:
#   ['0404','0405','0406','0407','0606','0607','0608','0624','0625','0626','0627','0628','0202','0203','0204','0205','0206','0207','0208','0209','0210']
#   ['1010','1011','1012','1013','0913','0914','0915']
days = ['0918', '0910', '0919']
for d in days:
    # Each day is downloaded and loaded for M03A first, then for M08A.
    downloadInsert('M03A', '2019' + d, baseDir)
    downloadInsert('M08A', '2019' + d, baseDir)


../../../data/M03A
../../../data/M03A/M03A_20190918.tar.gz
../../../data/M08A
../../../data/M08A/M08A_20190918.tar.gz
../../../data/M03A
../../../data/M03A/M03A_20190910.tar.gz
../../../data/M08A
../../../data/M08A/M08A_20190910.tar.gz
../../../data/M03A
../../../data/M03A/M03A_20190919.tar.gz
../../../data/M08A
../../../data/M08A/M08A_20190919.tar.gz


In [12]:
# Day list (MMDD, year 2019) kept from another run of this cell:
#   ['0404','0405','0406','0407','0606','0607','0608','0624','0625','0626','0627','0628','0202','0203','0204','0205','0206','0207','0208','0209','0210']
days = ['1010', '1011', '1012', '1013', '0913', '0914', '0915']
for d in days:
    # Download and load the M04A archive of each listed day.
    downloadInsert('M04A', '2019' + d, baseDir)
    

../../../data/M04A
../../../data/M04A/M04A_20191010.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20191011.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20191012.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20191013.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20190913.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20190914.tar.gz
../../../data/M04A
../../../data/M04A/M04A_20190915.tar.gz
