In [1]:
from google.colab import drive
import os

# Mount Google Drive so the project files are reachable from the Colab VM.
drive.mount('/content/drive')
# Use the absolute path under the mount point: a relative chdir only resolves
# while the working directory is still the one the cell was first run from,
# so re-running this cell would otherwise raise FileNotFoundError.
os.chdir("/content/drive/MyDrive/project/soma")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the notebook's third-party dependencies via shell commands.
# googletrans is pinned to the 4.0.0-rc1 release candidate.
!pip install googletrans==4.0.0-rc1
# NOTE(review): transformers is installed without a version pin, so the
# version actually used depends on when this cell is run — consider pinning.
!pip install transformers

In [5]:
from collections import defaultdict
import pickle

import pandas as pd
import googletrans

from transformers import pipeline
from tqdm import tqdm

In [6]:
# Load the raw hotel reviews and build one free-text field per review.
reviews = pd.read_csv("tokyo_hotels_review.csv")

# Missing title/pros/cons become empty strings so they can be concatenated.
data = reviews[["hotel_id", "title", "pros", "cons"]].fillna("").values.tolist()

# Join title, pros and cons with spaces. Embedded newlines are turned into
# spaces rather than removed, otherwise the last word of one line and the
# first word of the next would be glued together.
text = [" ".join(row[1:]).replace("\n", " ") for row in data]
reviews["text"] = text

# Keep only the columns used downstream. .copy() detaches the result from
# `reviews`, so later edits to `hotel` cannot raise SettingWithCopyWarning.
hotel = reviews[["hotel_id", "text"]].copy()
hotel.head()

Unnamed: 0,hotel_id,text
0,245830,"Good! The staff were very friendly, and helped..."
1,245830,A great Tokyo gem 💎 q This is a fantastic hote...
2,245830,Very good value for money Location is incredib...
3,245830,"nice hotel, very close to the heart of Tokyo e..."
4,245830,Very good near from the station and a clean fa...


In [8]:
def translate_review(fname):
  """Translate the "text" column of a review CSV to English and save it.

  Reads ``{fname}.csv``, sends every non-empty string in its "text" column
  through googletrans with ``dest="en"``, drops the "Unnamed: 0" column when
  it is present, and writes the result to ``{fname}_translated.csv`` without
  the index.

  Args:
    fname: Path of the input CSV without the ".csv" extension.

  Returns:
    The translated DataFrame (the same content that was written to disk).
  """
  df = pd.read_csv(f"{fname}.csv")
  trans = googletrans.Translator()

  def _to_english(value):
    # Empty cells are read back by pandas as NaN (a float), which has no
    # .rstrip(); leave such non-string values untouched.
    if not isinstance(value, str):
      return value
    stripped = value.rstrip()
    # Nothing to translate: skip the remote call for blank text.
    if not stripped:
      return stripped
    return trans.translate(stripped, dest="en").text

  df["text"] = df["text"].apply(_to_english)
  # errors="ignore" keeps this working for CSVs that were saved without an
  # index column, where `del df["Unnamed: 0"]` would raise KeyError.
  df = df.drop(columns=["Unnamed: 0"], errors="ignore")
  df.to_csv(f"{fname}_translated.csv", index=False)
  return df

In [9]:
def _load_translated(name):
  """Return the translated reviews for `name`, translating them on first use.

  The translated CSV acts as a cache: translate_review() is only called when
  ``{name}_translated.csv`` does not exist yet, so the cell also works on a
  fresh checkout without manually un-commenting the translation calls.
  """
  translated_path = f"{name}_translated.csv"
  if not os.path.exists(translated_path):
    translate_review(name)
  # Always read back from disk so the frame is the same whether or not the
  # translation had to be run in this session.
  return pd.read_csv(translated_path)

restaurant = _load_translated("tokyo_restaurant_review")
attraction = _load_translated("tokyo_attraction_review")

restaurant.head()

Unnamed: 0,location_id,text,name
0,14745403,I didn't expect much of course dishes at a rea...,Danshi Chubo Sakaba Kuraoakari
1,14745403,"A izakaya in Kuramae, a izakaya in Kuramae, Ta...",Danshi Chubo Sakaba Kuraoakari
2,8466418,Nice latte in a pleasant space The Label Creat...,Labelcafe Tokyo
3,8466418,It is a secret cafe located just right after c...,Labelcafe Tokyo
4,8466418,Avoid the hustle and bustle ... quiet cafe@Aza...,Labelcafe Tokyo


In [10]:
# Candidate labels each review is scored against.
keywords = [
    "cheap",
    "photo",
    "natural",
    "active",
    "local",
    "relaxing",
    "learn",
]

# Zero-shot classification pipeline backed by facebook/bart-large-mnli.
# NOTE(review): device=0 presumably selects the first GPU, so this cell likely
# needs a GPU runtime — confirm.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0,
)

In [11]:
# Score every attraction review against the candidate keywords in one call.
attraction_texts = attraction["text"].to_list()
results = classifier(attraction_texts, keywords)

# Display the first result as a quick sanity check.
results[0]

{'labels': ['active',
  'learn',
  'photo',
  'relaxing',
  'cheap',
  'local',
  'natural'],
 'scores': [0.28559544682502747,
  0.260632187128067,
  0.23173579573631287,
  0.15464569628238678,
  0.025186613202095032,
  0.024271536618471146,
  0.017932679504156113],
 'sequence': 'Big Pool Big swimming pool in a nice facility, a bit far from everything and difficult to access.  This place most like will be used during the 2020 olympic games in Tokyo'}

In [12]:
# Minimum zero-shot score for a keyword to count as matching a review.
threshold = 0.4
# location_id -> set of keywords matched by at least one of its reviews.
attraction_matched = defaultdict(set)

location_ids = attraction["location_id"].to_list()
# zip() would silently drop trailing rows if the two sequences ever diverged.
assert len(location_ids) == len(results), "expected one classifier result per review"

# Wrap the zip itself (with an explicit total) so the bar advances once per
# processed pair and ends at 100%. Wrapping only `results` inside zip() leaves
# the final step uncounted, because zip() stops as soon as the first iterable
# is exhausted and never resumes the tqdm iterator.
for location_id, result in tqdm(zip(location_ids, results), total=len(location_ids)):
  selected = [
      label
      for label, score in zip(result["labels"], result["scores"])
      if score > threshold
  ]
  # Only touch the defaultdict when something matched, so locations without
  # any keyword above the threshold do not get an empty entry.
  if selected:
    attraction_matched[location_id].update(selected)

attraction_matched

100%|█████████▉| 256/257 [00:00<00:00, 263689.05it/s]


defaultdict(set,
            {1626639: {'relaxing'},
             1898820: {'active'},
             1997879: {'active', 'relaxing'},
             2194205: {'local'},
             3782407: {'active', 'relaxing'},
             4294281: {'active'},
             4739787: {'relaxing'},
             4766996: {'cheap', 'local'},
             6699646: {'local'},
             7381280: {'active', 'local'},
             7684847: {'active', 'local', 'relaxing'},
             7720586: {'active'},
             8121684: {'active', 'learn'},
             8281134: {'active'},
             8658835: {'relaxing'},
             8674359: {'active', 'relaxing'},
             8707675: {'local'},
             8746815: {'cheap'},
             9466374: {'learn', 'local'},
             9606088: {'active', 'local'},
             9764643: {'local'},
             9778120: {'local', 'relaxing'},
             9974956: {'learn', 'relaxing'},
             10019642: {'active', 'local'},
             10152321: {'learn', '

In [13]:
# Serialise the attraction keyword mapping to disk with pickle.
with open("attraction_matched.bin", "wb") as out_file:
  pickle.Pickler(out_file).dump(attraction_matched)

In [14]:
# Minimum zero-shot score for a keyword to count as matching a review.
threshold = 0.4
# Score every restaurant review against the candidate keywords.
results = classifier(restaurant["text"].to_list(), keywords)
# location_id -> set of keywords matched by at least one of its reviews.
restaurant_matched = defaultdict(set)

location_ids = restaurant["location_id"].to_list()
# zip() would silently drop trailing rows if the two sequences ever diverged.
assert len(location_ids) == len(results), "expected one classifier result per review"

# Wrap the zip itself (with an explicit total) so the bar advances once per
# processed pair and ends at 100%. Wrapping only `results` inside zip() leaves
# the final step uncounted, because zip() stops as soon as the first iterable
# is exhausted and never resumes the tqdm iterator.
for location_id, result in tqdm(zip(location_ids, results), total=len(location_ids)):
  selected = [
      label
      for label, score in zip(result["labels"], result["scores"])
      if score > threshold
  ]
  # Only touch the defaultdict when something matched, so locations without
  # any keyword above the threshold do not get an empty entry.
  if selected:
    restaurant_matched[location_id].update(selected)

restaurant_matched

 99%|█████████▉| 183/184 [00:00<00:00, 275110.26it/s]


defaultdict(set,
            {1659106: {'active'},
             1660599: {'local', 'relaxing'},
             1661631: {'local'},
             1662658: {'local'},
             1664327: {'local'},
             1671782: {'relaxing'},
             1676958: {'relaxing'},
             1677276: {'cheap'},
             1678464: {'active', 'relaxing'},
             1685918: {'active'},
             1690649: {'active', 'natural'},
             1695583: {'photo'},
             2658457: {'active', 'cheap'},
             3237695: {'relaxing'},
             3687280: {'active'},
             3808431: {'active', 'learn'},
             3827830: {'cheap'},
             3843677: {'cheap'},
             3927586: {'cheap'},
             4663895: {'local'},
             4668846: {'active', 'cheap'},
             4669033: {'relaxing'},
             4671363: {'active', 'relaxing'},
             4946051: {'active'},
             5053678: {'relaxing'},
             5929727: {'active', 'cheap'},
             606

In [15]:
# Serialise the restaurant keyword mapping to disk with pickle.
with open("restaurant_matched.bin", "wb") as out_file:
  pickle.Pickler(out_file).dump(restaurant_matched)

In [16]:
# Minimum zero-shot score for a keyword to count as matching a review.
threshold = 0.4
# Score every hotel review against the candidate keywords.
results = classifier(hotel["text"].to_list(), keywords)
# hotel_id -> set of keywords matched by at least one of its reviews.
hotel_matched = defaultdict(set)

hotel_ids = hotel["hotel_id"].to_list()
# zip() would silently drop trailing rows if the two sequences ever diverged.
assert len(hotel_ids) == len(results), "expected one classifier result per review"

# Wrap the zip itself (with an explicit total) so the bar advances once per
# processed pair and ends at 100%. Wrapping only `results` inside zip() leaves
# the final step uncounted, because zip() stops as soon as the first iterable
# is exhausted and never resumes the tqdm iterator.
for hotel_id, result in tqdm(zip(hotel_ids, results), total=len(hotel_ids)):
  selected = [
      label
      for label, score in zip(result["labels"], result["scores"])
      if score > threshold
  ]
  # Only touch the defaultdict when something matched, so hotels without any
  # keyword above the threshold do not get an empty entry.
  if selected:
    hotel_matched[hotel_id].update(selected)

hotel_matched

100%|█████████▉| 371/372 [00:00<00:00, 281950.86it/s]


defaultdict(set,
            {179801: {'active', 'cheap', 'local', 'photo', 'relaxing'},
             245830: {'active', 'cheap', 'local', 'relaxing'},
             250231: {'active', 'cheap', 'local', 'relaxing'},
             254889: {'active', 'cheap', 'relaxing'},
             914003: {'active', 'cheap', 'local', 'relaxing'},
             1084571: {'active', 'cheap', 'learn', 'local', 'relaxing'},
             2456812: {'active', 'cheap', 'local', 'natural', 'relaxing'},
             2707868: {'active', 'cheap', 'local', 'relaxing'},
             2790681: {'active', 'local', 'relaxing'},
             2890810: {'active', 'cheap', 'local', 'relaxing'},
             3379176: {'active', 'local', 'relaxing'},
             3380553: {'active', 'local'},
             3844861: {'active', 'cheap', 'local', 'relaxing'},
             3932853: {'active', 'cheap', 'local', 'relaxing'},
             4876056: {'active', 'cheap', 'local', 'relaxing'},
             5136503: {'active'},
             

In [17]:
# Serialise the hotel keyword mapping to disk with pickle.
with open("hotel_matched.bin", "wb") as out_file:
  pickle.Pickler(out_file).dump(hotel_matched)