# 4 STEP TO GET DATASETS FOR INSIGHT OF EACH AREA IN TOKYO

## 1. Import Libraries and Pre-process
## 2. Get the Names and Locations of Yamanote Line Station
## 3. Get the Names and Locations of Most Popular Sightseeging Spots 
## 4. See What Kind Of and How Many Spots in Each Area in Tokyo

### 1. Import Libraries and Pre-Process

*   Import libraries
*   Upload CSV of Tokyo land price list and save as a DataFrame
*   Pickup one location for each area in Tokyo and Save it as a new DataFrame



In [1]:
# for data processing
import pandas as pd
import numpy as np
import io
from tqdm.notebook import tqdm
from datetime import datetime
import re

# for credential info
from getpass import getpass

# for sending GET requets
import requests
from bs4 import BeautifulSoup

# for uploading and downloading a csv file(if you use Google Colab)
from google.colab import files

In [2]:
# uploading csv of Tokyo land price list
csv = files.upload()

Saving tokyo_land_price_data.csv to tokyo_land_price_data (1).csv


In [3]:
# save the csv to pandas DataFrame
tokyo_land_price_list = pd.read_csv(io.BytesIO(csv['tokyo_land_price_data.csv']), index_col=0)
tokyo_land_price_list.head()

Unnamed: 0,area_name,address_detail,lat,lon,land_price_per_square_meter
0,Chiyoda City,"6-25 Sanbanchō, Chiyoda City, Tōkyō-to 102-007...",35.690086,139.744837,3160000.0
1,Chiyoda City,"3-27 Kioichō, Chiyoda City, Tōkyō-to 102-0094,...",35.682015,139.737615,2020000.0
2,Chiyoda City,"6-1 Rokubanchō, Chiyoda City, Tōkyō-to 102-008...",35.688143,139.732869,4050000.0
3,Chiyoda City,"1-chōme-8-6 Fujimi, Chiyoda City, Tōkyō-to 102...",35.697784,139.745095,1550000.0
4,Chiyoda City,"2 Chome Kudankita, Chiyoda City, Tokyo 102-007...",35.695576,139.746362,2960000.0


In [4]:
# pick up median spot for each area name(for easier processing)
each_area_tokyo_median = tokyo_land_price_list.groupby('area_name').median()
each_area_tokyo_median.head()

Unnamed: 0_level_0,lat,lon,land_price_per_square_meter
area_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adachi City,35.777899,139.80036,312500.0
Akiruno,35.729667,139.291109,97050.0
Akishima,35.707862,139.363728,186000.0
Arakawa City,35.736194,139.777149,545000.0
Bunkyo City,35.717668,139.751204,1040000.0


### 2. Get the Names and Locations of Yamanote Line Station

*   Get the names of station names from a wikipedia page
*   Get the locations of stasion by using Google Geocoding API
*   Save them as a DataFrame

<br>
*To use Google Geocoding API, You need to have your API key ready.<br>
Check the link below for more information.<br>

<a href="" target="_blank">Google Geocoding API</a>


In [5]:
# send GET request to have all the staion names of Yamanote Line
req = requests.get('https://en.wikipedia.org/wiki/Yamanote_Line')
response = BeautifulSoup(req.text, 'html.parser')
table = response.find('table', attrs={'class':'wikitable sortable mw-collapsible'})
rows = table.find_all('tr')

station_name_list = []
for row in rows:
  columns = row.find_all('td')
  if columns != [] and len(columns) >= 2:
    num = 0
    if 'Line' in columns[0].text:
      num = 1
    station_num = int(re.sub('[\r\n]', '', columns[num].text[-3:]))
    station_name = re.sub('\\[.+\\]|[\r\n]', '', columns[num+1].text)
    if [station_num, station_name] not in station_name_list:
      station_name_list.append([station_num, station_name])

In [6]:
# process the station list
station_name_list = [[station[0], '{} Station'.format(station[1])] for station in station_name_list]
station_name_list.sort(key=lambda num:num[0])
print(station_name_list)

[[1, 'Tokyo Station'], [2, 'Kanda Station'], [3, 'Akihabara Station'], [4, 'Okachimachi Station'], [5, 'Ueno Station'], [6, 'Uguisudani Station'], [7, 'Nippori Station'], [8, 'Nishi-Nippori Station'], [9, 'Tabata Station'], [10, 'Komagome Station'], [11, 'Sugamo Station'], [12, 'Ōtsuka Station'], [13, 'Ikebukuro Station'], [14, 'Mejiro Station'], [15, 'Takadanobaba Station'], [16, 'Shin-Ōkubo Station'], [17, 'Shinjuku Station'], [18, 'Yoyogi Station'], [19, 'Harajuku Station'], [20, 'Shibuya Station'], [21, 'Ebisu Station'], [22, 'Meguro Station'], [23, 'Gotanda Station'], [24, 'Ōsaki Station'], [25, 'Shinagawa Station'], [26, 'Takanawa Gateway Station'], [27, 'Tamachi Station'], [28, 'Hamamatsuchō Station'], [29, 'Shimbashi Station'], [30, 'Yūrakuchō Station']]


In [7]:
# save credential info
GOOGLE_API_KEY = getpass('INPUT GOOGLE API KEY: ')

INPUT GOOGLE API KEY: ··········


In [8]:
# Get latitude and longitude foe each station
station_list = []
for station in station_name_list:
  station_num = station[0]
  station_name = station[1]
  link = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}&language=en&region=jp'.format(station_name, GOOGLE_API_KEY)
  response = requests.get(link).json()
  lat = response['results'][0]['geometry']['location']['lat']
  lon = response['results'][0]['geometry']['location']['lng']
  station_list.append([station_num, station_name, lat, lon])

In [9]:
# create new DataFrame from the list
yamanote_station_df = pd.DataFrame(station_list, columns=['station_num', 'station_name', 'latitude', 'longitude'])
yamanote_station_df.head()

Unnamed: 0,station_num,station_name,latitude,longitude
0,1,Tokyo Station,35.681236,139.767125
1,2,Kanda Station,35.691822,139.770932
2,3,Akihabara Station,35.698383,139.773072
3,4,Okachimachi Station,35.707518,139.774856
4,5,Ueno Station,35.714167,139.777409


In [10]:
# dodnload the DataFrame as CSV file
yamanote_station_df.to_csv('yamanote_station.csv')
files.download('yamanote_station.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###3. Get the Names and Locations of Most Popular Sightseeging Spots

*   Same process as Step2

In [11]:
# Send GET requests to have the names of 10 most popular sightseeing spots in Tokyo
req = requests.get('https://matcha-jp.com/en/2589')
response = BeautifulSoup(req.text, 'html.parser')

h2_tags = response.find_all('h2')

popular_sight_name_list = []
row_count = len(h2_tags)

for i in range(row_count):
  if re.search('\d{1,2}', h2_tags[i].text) != None:
    popular_sight_name_list.append([i, re.sub('(-|at)\s', '', re.search('(-|at)\s.+', h2_tags[i].text).group())])

print(popular_sight_name_list)

[[1, 'Sensoji Temple'], [2, 'Tsukiji Fish Market'], [3, 'Ameyoko'], [4, 'Scramble Crossing'], [5, 'Tokyo Skytree'], [6, 'Kabukiza Theater'], [7, 'Oedo Onsen Monogatari'], [8, 'Ghibli Museum'], [9, 'Roppongi Hills'], [10, 'Meiji Shrine']]


In [12]:
# Get latitude and longitude for each spot
popular_sight_list = []

for spot in popular_sight_name_list:
  spot_num = spot[0]
  spot_name = spot[1]
  link = 'https://maps.googleapis.com/maps/api/geocode/json?address={}&key={}&language=en&region=jp'.format(spot_name, GOOGLE_API_KEY)
  response = requests.get(link).json()
  lat = response['results'][0]['geometry']['location']['lat']
  lon = response['results'][0]['geometry']['location']['lng']
  popular_sight_list.append([spot_num, spot_name, lat, lon])

In [13]:
# create new DataFrame from the list
popular_sight_df = pd.DataFrame(popular_sight_list, columns=['spot_num', 'spot_name', 'latitude', 'longitude'])
popular_sight_df.head(10)

Unnamed: 0,spot_num,spot_name,latitude,longitude
0,1,Sensoji Temple,35.714765,139.796655
1,2,Tsukiji Fish Market,35.664997,139.769816
2,3,Ameyoko,35.710333,139.774483
3,4,Scramble Crossing,35.659467,139.700554
4,5,Tokyo Skytree,35.710063,139.8107
5,6,Kabukiza Theater,35.669458,139.767812
6,7,Oedo Onsen Monogatari,35.615839,139.7778
7,8,Ghibli Museum,35.696238,139.570432
8,9,Roppongi Hills,35.660238,139.730077
9,10,Meiji Shrine,35.676398,139.699326


In [14]:
# download the DataFrame as a CSV file
popular_sight_df.to_csv('popular_sight_tokyo.csv')
files.download('popular_sight_tokyo.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### 4. See What Kind Of and How Many Spots in Each Area in Tokyo

*To use Foursquare API, you need to have your credential info to be set up.<br>
Click the link below for more information.
<br><br>

<a hred="https://developer.foursquare.com/docs/places-api/getting-started/" target="_blank">Foursquare API Documentation</a>



In [15]:
# save credential info
CLIENT_ID = getpass('Input CLIENT ID: ')
CLIENT_SECRET = getpass('Input CLIENT SECRET: ')
VERSION = datetime.now().strftime('%Y%m%d')

Input CLIENT ID: ··········
Input CLIENT SECRET: ··········


In [16]:
# check 50 popular spots from each area in Tokyo
row_count = each_area_tokyo_median.shape[0]

url_base = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION)
RADIUS = 5000

place_list = []
for i in tqdm(range(row_count)):
  row = each_area_tokyo_median.iloc[i]
  LL = '{},{}'.format(row[0], row[1])
  link = url_base + '&ll={}&radius={}&limit=50&sortByPopularity=1'.format(LL, RADIUS)
  response = requests.get(link).json()
  try:
    response['response']['warning']
  except KeyError:
    if response['response'] != {}:
      total_item_num = len(response['response']['groups'][0]['items'])
      for j in range(total_item_num):
        place = response['response']['groups'][0]['items'][j]
        category = place['venue']['categories'][0]['name']
        name = place['venue']['name']
        if [each_area_tokyo_median.index[i], name, category] not in place_list:
          place_list.append([each_area_tokyo_median.index[i], name, category])

HBox(children=(FloatProgress(value=0.0, max=62.0), HTML(value='')))




In [17]:
# check the base categories of each spot
url_base = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION)
response = requests.get(url_base).json()
first_categories = response['response']['categories']

category_list = []
for first_cat in first_categories:
  second_categories = first_cat['categories']
  if second_categories != []:
    for second_cat in second_categories:
      third_categories = second_cat['categories']
      if third_categories !=[]:
        for third_cat in third_categories:
          fourth_categories = third_cat['categories']
          if fourth_categories != []:
            for fourth_cat in fourth_categories:
              fifth_categories = fourth_cat['categories']
              if fifth_categories != []:
                for fifth_cat in fifth_categories:
                  sixth_categories = fifth_cat['categories']
                  if sixth_categories != []:
                    print('more than 6')
                  else:
                    category_list.append([first_cat['name'], fifth_cat['name']])                
              else:
                category_list.append([first_cat['name'], fourth_cat['name']])
          else:
            category_list.append([first_cat['name'], third_cat['name']])
      else:
        category_list.append([first_cat['name'], second_cat['name']])
  else:
    category_list.append([first_cat['name'], first_cat['name']])

In [18]:
# create a DataFrame from the list
category_df = pd.DataFrame(category_list, columns=['main_category', 'category'])
category_df.head()

Unnamed: 0,main_category,category
0,Arts & Entertainment,Amphitheater
1,Arts & Entertainment,Aquarium
2,Arts & Entertainment,Arcade
3,Arts & Entertainment,Art Gallery
4,Arts & Entertainment,Bowling Alley


In [19]:
# count the number of spot in each ctegory of each area
place_df = pd.DataFrame(place_list, columns=['area', 'place_name', 'category'])
place_category_df = pd.merge(place_df, category_df, on='category')
place_category_count_df = place_category_df.groupby(['area', 'main_category']).count()
place_category_count_df = place_category_count_df[['category']].rename(columns={'category':'count_num'})
place_category_count_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count_num
area,main_category,Unnamed: 2_level_1
Adachi City,Arts & Entertainment,2
Adachi City,Food,11
Adachi City,Outdoors & Recreation,8
Adachi City,Shop & Service,22
Akiruno,Arts & Entertainment,3


In [20]:
# download the DataFrame as a CSV file
place_category_count_df.to_csv('tokyo_category_count.csv')
files.download('tokyo_category_count.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>