In [1]:
import threading
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import json
import os
from fake_useragent import UserAgent
from tqdm.notebook import tqdm

In [None]:
# Lift pandas' display limits so DataFrames are rendered without truncating columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Shared generator; get_header() reads a random User-Agent string from it on every call.
user_agent = UserAgent()

In [2]:
# Google Maps directions URL; the two placeholders are filled with the origin and destination addresses.
template = 'https://www.google.com.tw/maps/dir/{}/{}'
# JSON file the address list is read from.
input_file = 'address_list.json'
# JSON file the adjacency matrix is cached in and finally saved to.
output_file = 'adjacency_matrix.json'

In [3]:
def get_header():
    """Return browser-like HTTP request headers with a freshly drawn random User-Agent."""
    # Fixed part of the header set; a new dict is built on every call so callers may mutate it.
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Upgrade-Insecure-Requests": "1",
    }
    # Varying part: pick a different User-Agent string each time.
    browser_headers["User-Agent"] = user_agent.random
    return browser_headers

In [4]:
def _number_before(doc, pos, window=20):
    """Parse the number written immediately before index ``pos`` of ``doc``.

    Only the ``window`` characters preceding ``pos`` are examined, and of those only the
    text following the last double quote. Thousands separators (",") are ignored.

    Returns:
        The parsed float, or None when that text is not a number.
    """
    # Clamp at 0: a negative start index would be interpreted as an offset from the END
    # of the string and silently select the wrong (usually empty) slice.
    start = max(0, pos - window)
    token = doc[start:pos].split('"')[-1]
    try:
        return float(token.replace(',', ''))
    except ValueError:
        return None


def get_dist(soup):
    """Extract the first distance found in a parsed directions page, in kilometres.

    The page text is searched for the unit '公里' (kilometres) first and, only if that is
    absent, for '公尺' (metres); the number directly in front of the unit is parsed.

    Args:
        soup: object exposing ``prettify()`` that returns the page as a string.

    Returns:
        The distance in kilometres as a float, or None when no unit is present or the
        text in front of the unit cannot be parsed as a number.
    """
    doc = soup.prettify()
    for unit, units_per_km in (('公里', 1), ('公尺', 1000)):
        pos = doc.find(unit)
        if pos == -1:
            continue
        value = _number_before(doc, pos)
        return None if value is None else value / units_per_km
    return None

def job(i, j, url, adjacency_matrix, timeout=10):
    """Download one directions page and store the parsed distance in the matrix.

    Args:
        i: origin entry; its 'name' selects the outer key of the matrix.
        j: destination entry; its 'name' selects the inner key of the matrix.
        url: page to download.
        adjacency_matrix: nested dict, updated in place at [i['name']][j['name']].
        timeout: seconds to wait on the server before abandoning the request. Without a
            timeout a stalled connection would keep this worker (and the caller's
            ``join``) blocked indefinitely.

    The matrix cell is left untouched when the request fails or the response status is
    not 200, so the pair is retried on the next crawl round.
    """
    try:
        html_doc = requests.get(url, headers=get_header(), timeout=timeout)
    except requests.RequestException as err:
        # Network error or timeout: report it and leave the cell empty for a later retry.
        print('request failed: {}'.format(err))
        return
    if html_doc.status_code != 200:
        print('I\'m a robot')
        return
    soup = BeautifulSoup(html_doc.text, 'html.parser')
    adjacency_matrix[i['name']][j['name']] = get_dist(soup)

def crawler(addr_list, adjacency_matrix):
    """Fetch every still-missing distance, one worker thread per address pair.

    Args:
        addr_list: list of entries with 'name' and 'address' keys.
        adjacency_matrix: nested dict keyed [origin name][destination name]; cells that
            are None are (re)requested, and the workers fill them in place.

    Blocks until every started worker has finished.
    """
    pairs = [(src, dst) for src in addr_list for dst in addr_list]

    threads = []
    for src, dst in tqdm(pairs):
        if adjacency_matrix[src['name']][dst['name']] is not None:
            continue  # distance already known
        url = template.format(src['address'], dst['address'])
        print('\r{} -> {}'.format(src['name'], dst['name']), end='')

        worker = threading.Thread(target=job, args=(src, dst, url, adjacency_matrix))
        worker.start()
        threads.append(worker)
        sleep(0.01)  # brief pause between thread launches
    print()
    print('wait a moment')
    for worker in threads:
        worker.join()

def get_valid_edge_size(adjacency_matrix):
    """Count the matrix cells that already hold a distance.

    Args:
        adjacency_matrix: nested dict keyed [src][dest]; every top-level key is used
            both as a source and as a destination.

    Returns:
        Number of (src, dest) pairs whose value is not None.
    """
    names = list(adjacency_matrix)
    return sum(
        1
        for src in names
        for dest in names
        if adjacency_matrix[src][dest] is not None
    )

In [5]:
def init_adjacency_matrix(addr_list):
    """Create an empty distance matrix for the given addresses.

    Returns a nested dict keyed [name][name] in which a name paired with itself maps to 0
    and every other pair maps to None (distance not yet known).
    """
    names = [entry['name'] for entry in addr_list]
    return {
        src: {dst: (0 if src == dst else None) for dst in names}
        for src in names
    }

def get_matrix(output_file, addr_list):
    """Load the adjacency matrix from the cache file, or create and store a fresh one.

    Args:
        output_file: path of the JSON cache file.
        addr_list: list of entries with a 'name' key.

    Returns:
        The cached matrix when the file holds a dict whose top-level keys are exactly the
        names in ``addr_list`` and whose rows each contain a cell for every name.
        Otherwise a newly initialised matrix, which is also written to ``output_file``
        (creating or overwriting it).
    """
    name_set = {entry['name'] for entry in addr_list}

    # cache file
    if os.path.exists(output_file):
        try:
            with open(file=output_file, mode='r', encoding='utf-8') as reader:
                cached = json.load(reader)
        except ValueError:
            # json.JSONDecodeError is a ValueError: an unreadable cache counts as no cache
            cached = None

        # Reuse the cache only if it covers exactly these names and no row lacks a cell;
        # an incomplete row would otherwise raise KeyError when the matrix is walked.
        if (
            isinstance(cached, dict)
            and set(cached) == name_set
            and all(
                isinstance(row, dict) and name_set.issubset(row)
                for row in cached.values()
            )
        ):
            return cached

    adjacency_matrix = init_adjacency_matrix(addr_list)
    # create/overwrite file
    with open(file=output_file, mode='w', encoding='utf-8') as writer:
        json.dump(obj=adjacency_matrix, fp=writer, ensure_ascii=False, indent=4)
    return adjacency_matrix

### main function

In [6]:
# Read the address list. The rest of the notebook expects a list of objects that each
# carry a "name" and an "address", for example:
#   [{"name": "中坡", "address": "台北市南港區中坡南路47號1樓"},
#    {"name": "中研", "address": "台北市南港區研究院路二段128號1樓(學術活動中心)"},
#    {"name": "中貿", "address": "台北市南港區經貿二路186號2樓"},
#    {"name": "玉成", "address": "台北市南港區西新里南港路三段3號1樓"}]
with open(input_file, 'r', encoding='utf-8') as addr_fh:
    addr_list = json.loads(addr_fh.read())

In [7]:
adjacency_matrix = get_matrix(output_file, addr_list)

# The matrix is keyed by name, so its own size defines how many cells exist. Using
# len(addr_list) instead would make the target unreachable if two entries share a name.
total_edges = len(adjacency_matrix) ** 2
# Give up once more than this many rounds have finished without filling a single cell.
max_stalled_rounds = 3

cnt = get_valid_edge_size(adjacency_matrix)  # number of cells that already hold a distance
print('cnt = {}'.format(cnt))

time_out = 0  # rounds so far that made no progress
while cnt != total_edges:
    if time_out > max_stalled_rounds:
        print('[ERROR] I\'m a robot')
        break

    crawler(addr_list, adjacency_matrix)
    newcnt = get_valid_edge_size(adjacency_matrix)
    if cnt == newcnt:
        # Nothing new this round: wait a little longer before the next attempt.
        sleep(2)
        time_out += 1
    else:
        # Checkpoint the progress so an interrupted run can resume from the cache file.
        with open(file=output_file, mode='w', encoding='utf-8') as writer:
            json.dump(obj=adjacency_matrix, fp=writer, ensure_ascii=False, indent=4)

    cnt = newcnt
    print('cnt = {}'.format(cnt))
    if cnt != total_edges:
        sleep(5)  # pause between rounds; pointless once the matrix is complete

cnt = 5


  0%|          | 0/25 [00:00<?, ?it/s]

玉德 -> 玉成
wait a moment
cnt = 21


  0%|          | 0/25 [00:00<?, ?it/s]

中貿 -> 玉德
wait a moment
cnt = 25


In [8]:
# Announce the destination, then write the matrix as indented JSON, keeping non-ASCII
# characters as-is instead of escaping them.
print('adjacency matrix save at {}'.format(output_file))
with open(output_file, 'w', encoding='utf-8') as out_fh:
    json.dump(adjacency_matrix, out_fh, ensure_ascii=False, indent=4)

adjacency matrix save at adjacency_matrix.json


In [9]:
# Display the matrix as a table. For a dict of dicts pandas turns the outer keys into
# columns and the inner keys into the row index, so the cell at (row r, column c) is
# adjacency_matrix[c][r].
pd.DataFrame(adjacency_matrix)

Unnamed: 0,中坡,中研,中貿,玉成,玉德
中坡,0.0,6.0,4.8,2.5,0.35
中研,5.5,0.0,2.5,4.4,5.3
中貿,4.7,3.2,0.0,2.7,4.5
玉成,2.2,4.8,3.1,0.0,2.0
玉德,0.35,5.7,4.5,2.3,0.0
