In [1]:
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import json
import os
import numpy as np
from langdetect import detect, LangDetectException
from tqdm import tqdm
from google.cloud import translate_v2 as translate
from multiprocessing import Pool, cpu_count
import swifter
import re
import time

In [2]:
#import reviews
review_file_dir = f'{os.getcwd()}/data/reviews.csv'
reviews = pd.read_csv(review_file_dir)
#import listings
listing_file_dir = f'{os.getcwd()}/data/listings.csv'
listing = pd.read_csv(listing_file_dir)
#import neighborhood name list 
neighbourhood_name_dir = f'{os.getcwd()}/data/neighbourhoods.csv'
neighbourhood_name_list = list(pd.read_csv(neighbourhood_name_dir)['neighbourhood'])

In [3]:
reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6400,36772,2010-04-19,94463,Hyun,I had such a great stay at 'the studio.' Fran...
1,6400,227727,2011-04-16,415910,Tej,Staying at Francesca's and Alberto's place was...
2,6400,1162608,2012-04-22,415910,Tej,This is my second time staying with Francesca ...
3,6400,11670681,2014-04-11,8464658,Isabelle,"Ein wunderbares Zimmer mit privatem Bad/ WC, a..."
4,6400,11780336,2014-04-14,5427294,Hatice,"I was lucky so I have stayed with Francesca, A..."
...,...,...,...,...,...,...
830431,1178162158561278368,1180348684834926068,2024-06-16,195387166,Simona&Barbara,"Grazioso monolocale curato, situato in una c..."
830432,1178253218362535053,1183211946024086802,2024-06-20,9082474,Franck,"Très bel appartement,<br/>Idéal pour un séjour..."
830433,1178724795174333650,1180342737285847908,2024-06-16,222898684,Giorgia,Abbiamo soggiornato una notte in questo splend...
830434,1178766455827126425,1183896127694719513,2024-06-21,237810072,Alwin,A nice and cosy appartment near the lovely are...


In [4]:
#check column names of 'listing' data frame
list(listing)

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'source',
 'name',
 'description',
 'neighborhood_overview',
 'picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30

In [5]:
# select the property list id and neighborhood name from the listings
listing_id_neighborhood_only = listing[['id', 'neighbourhood_cleansed']]
listing_id_neighborhood_only = listing_id_neighborhood_only.rename(columns={'id': 'listing_id', 'neighbourhood_cleansed': 'neighbourhood'})
listing_id_neighborhood_only

Unnamed: 0,listing_id,neighbourhood
0,6400,TIBALDI
1,304050,XXII MARZO
2,23986,NAVIGLI
3,309905,TICINESE
4,40470,VIALE MONZA
...,...,...
24341,1183404772778230864,GHISOLFA
24342,1183443016816299488,BUENOS AIRES - VENEZIA
24343,1183510554857606594,PARCO SEMPIONE
24344,1183852767989216938,PORTA ROMANA


In [6]:
# make sure if our preexising neighbourhood name list matches to the data we are dealing with.
np.unique(neighbourhood_name_list) == np.unique(listing_id_neighborhood_only['neighbourhood'])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [7]:
# join the neighborhood name to reviews using listing IDs
reviews_with_neighborhood = pd.merge(reviews, listing_id_neighborhood_only, on='listing_id', how='left')
reviews_with_neighborhood

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,neighbourhood
0,6400,36772,2010-04-19,94463,Hyun,I had such a great stay at 'the studio.' Fran...,TIBALDI
1,6400,227727,2011-04-16,415910,Tej,Staying at Francesca's and Alberto's place was...,TIBALDI
2,6400,1162608,2012-04-22,415910,Tej,This is my second time staying with Francesca ...,TIBALDI
3,6400,11670681,2014-04-11,8464658,Isabelle,"Ein wunderbares Zimmer mit privatem Bad/ WC, a...",TIBALDI
4,6400,11780336,2014-04-14,5427294,Hatice,"I was lucky so I have stayed with Francesca, A...",TIBALDI
...,...,...,...,...,...,...,...
830431,1178162158561278368,1180348684834926068,2024-06-16,195387166,Simona&Barbara,"Grazioso monolocale curato, situato in una c...",LORETO
830432,1178253218362535053,1183211946024086802,2024-06-20,9082474,Franck,"Très bel appartement,<br/>Idéal pour un séjour...",ISOLA
830433,1178724795174333650,1180342737285847908,2024-06-16,222898684,Giorgia,Abbiamo soggiornato una notte in questo splend...,DE ANGELI - MONTE ROSA
830434,1178766455827126425,1183896127694719513,2024-06-21,237810072,Alwin,A nice and cosy appartment near the lovely are...,TICINESE


In [8]:
#define a function to detect languages of review comments
def safe_detect(text):
    try:
        return detect(str(text))
    except LangDetectException:
        return None

In [9]:
# map the function to the review comments 
reviews_with_neighborhood["language"] = (
    reviews_with_neighborhood["comments"]
    .fillna("")
    .swifter.apply(safe_detect)
)

Pandas Apply:   0%|          | 0/830436 [00:00<?, ?it/s]

In [10]:
reviews_with_neighborhood = reviews_with_neighborhood[~reviews_with_neighborhood["language"].isna()].reset_index(drop=True)
reviews_with_neighborhood

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,neighbourhood,language
0,6400,36772,2010-04-19,94463,Hyun,I had such a great stay at 'the studio.' Fran...,TIBALDI,en
1,6400,227727,2011-04-16,415910,Tej,Staying at Francesca's and Alberto's place was...,TIBALDI,en
2,6400,1162608,2012-04-22,415910,Tej,This is my second time staying with Francesca ...,TIBALDI,en
3,6400,11670681,2014-04-11,8464658,Isabelle,"Ein wunderbares Zimmer mit privatem Bad/ WC, a...",TIBALDI,de
4,6400,11780336,2014-04-14,5427294,Hatice,"I was lucky so I have stayed with Francesca, A...",TIBALDI,en
...,...,...,...,...,...,...,...,...
826832,1178162158561278368,1180348684834926068,2024-06-16,195387166,Simona&Barbara,"Grazioso monolocale curato, situato in una c...",LORETO,it
826833,1178253218362535053,1183211946024086802,2024-06-20,9082474,Franck,"Très bel appartement,<br/>Idéal pour un séjour...",ISOLA,fr
826834,1178724795174333650,1180342737285847908,2024-06-16,222898684,Giorgia,Abbiamo soggiornato una notte in questo splend...,DE ANGELI - MONTE ROSA,it
826835,1178766455827126425,1183896127694719513,2024-06-21,237810072,Alwin,A nice and cosy appartment near the lovely are...,TICINESE,en


In [11]:
# clean comments with unnecessary HTML tags
def clean_review_text(text):
    if not isinstance(text, str):
        return text
    # Remove line breaks and HTML tags like <br/>, <br>, etc.
    text = re.sub(r"(\\r)?<br\s*/?>", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"&nbsp;", " ", text)
    text = re.sub(r"\s+", " ", text)  # Collapse multiple spaces
    return text.strip()

In [12]:
# cleaning comments
reviews_with_neighborhood["comments_cleaned"] =reviews_with_neighborhood["comments"] =reviews_with_neighborhood["comments"].apply(clean_review_text)
reviews_with_neighborhood

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,neighbourhood,language,comments_cleaned
0,6400,36772,2010-04-19,94463,Hyun,I had such a great stay at 'the studio.' Franc...,TIBALDI,en,I had such a great stay at 'the studio.' Franc...
1,6400,227727,2011-04-16,415910,Tej,Staying at Francesca's and Alberto's place was...,TIBALDI,en,Staying at Francesca's and Alberto's place was...
2,6400,1162608,2012-04-22,415910,Tej,This is my second time staying with Francesca ...,TIBALDI,en,This is my second time staying with Francesca ...
3,6400,11670681,2014-04-11,8464658,Isabelle,"Ein wunderbares Zimmer mit privatem Bad/ WC, a...",TIBALDI,de,"Ein wunderbares Zimmer mit privatem Bad/ WC, a..."
4,6400,11780336,2014-04-14,5427294,Hatice,"I was lucky so I have stayed with Francesca, A...",TIBALDI,en,"I was lucky so I have stayed with Francesca, A..."
...,...,...,...,...,...,...,...,...,...
826832,1178162158561278368,1180348684834926068,2024-06-16,195387166,Simona&Barbara,"Grazioso monolocale curato, situato in una cor...",LORETO,it,"Grazioso monolocale curato, situato in una cor..."
826833,1178253218362535053,1183211946024086802,2024-06-20,9082474,Franck,"Très bel appartement, Idéal pour un séjour à M...",ISOLA,fr,"Très bel appartement, Idéal pour un séjour à M..."
826834,1178724795174333650,1180342737285847908,2024-06-16,222898684,Giorgia,Abbiamo soggiornato una notte in questo splend...,DE ANGELI - MONTE ROSA,it,Abbiamo soggiornato una notte in questo splend...
826835,1178766455827126425,1183896127694719513,2024-06-21,237810072,Alwin,A nice and cosy appartment near the lovely are...,TICINESE,en,A nice and cosy appartment near the lovely are...


In [13]:
# Subset comments written in foreign languages
non_english_reviews = reviews_with_neighborhood[ reviews_with_neighborhood["language"] != "en" ].reset_index(drop=True)
#non_english_reviews.to_csv("non_english_comments.csv")
non_english_reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,neighbourhood,language,comments_cleaned
0,6400,11670681,2014-04-11,8464658,Isabelle,"Ein wunderbares Zimmer mit privatem Bad/ WC, a...",TIBALDI,de,"Ein wunderbares Zimmer mit privatem Bad/ WC, a..."
1,23986,47872586,2015-09-21,17316381,Paolo,La zona è molto comoda e la via è tranquilla. ...,NAVIGLI,it,La zona è molto comoda e la via è tranquilla. ...
2,23986,90028316,2016-07-30,70469005,Morgane,L'appartement de Jérémy est idéal pour séjourn...,NAVIGLI,fr,L'appartement de Jérémy est idéal pour séjourn...
3,23986,104338483,2016-09-26,95139230,Aurélie,"Jérémy est un hôte à recommander, il est très ...",NAVIGLI,fr,"Jérémy est un hôte à recommander, il est très ..."
4,23986,124124406,2017-01-01,2827231,Miriam,"Jeremy no estuvo en nuestra recepción, estaba ...",NAVIGLI,es,"Jeremy no estuvo en nuestra recepción, estaba ..."
...,...,...,...,...,...,...,...,...,...
371291,1177396299056691638,1183220867919411167,2024-06-20,131662771,Michele,Damiano e Vanessa sono stati gli host migliori...,LORETO,it,Damiano e Vanessa sono stati gli host migliori...
371292,1177396299056691638,1183931896768540185,2024-06-21,450184523,Laura,"Vanessa et Damiano était très sympas, nous avo...",LORETO,fr,"Vanessa et Damiano était très sympas, nous avo..."
371293,1178162158561278368,1180348684834926068,2024-06-16,195387166,Simona&Barbara,"Grazioso monolocale curato, situato in una cor...",LORETO,it,"Grazioso monolocale curato, situato in una cor..."
371294,1178253218362535053,1183211946024086802,2024-06-20,9082474,Franck,"Très bel appartement, Idéal pour un séjour à M...",ISOLA,fr,"Très bel appartement, Idéal pour un séjour à M..."


In [14]:
non_english_reviews.iloc()[16]

listing_id                                                      40470
id                                                            8640735
date                                                       2013-11-10
reviewer_id                                                   8064897
reviewer_name                                                Vladimir
comments            The apartment is ideally located, close to eve...
neighbourhood                                             VIALE MONZA
language                                                           bg
comments_cleaned    The apartment is ideally located, close to eve...
Name: 16, dtype: object

In [15]:
non_english_reviews.comments[49]

"Una sensazione di armonia e pace in un'accogliente atmosfera creata da due persone stra-ordinarie. La stanza con balconcino e il bagno forniti di tutto ciò che serve fino all'ultimo dettaglio. A due passi dal centro e circondato da localini interessanti. Lo consiglio vivamente sia a chi si ferma una notte per lavoro che al turista in cerca di quel qualcosa in piú che una stanza dove pernottare. Ringrazio Elena e Philip per la grande ospitalitá"

In [16]:
non_english_reviews.comments_cleaned[49]

"Una sensazione di armonia e pace in un'accogliente atmosfera creata da due persone stra-ordinarie. La stanza con balconcino e il bagno forniti di tutto ciò che serve fino all'ultimo dettaglio. A due passi dal centro e circondato da localini interessanti. Lo consiglio vivamente sia a chi si ferma una notte per lavoro che al turista in cerca di quel qualcosa in piú che una stanza dove pernottare. Ringrazio Elena e Philip per la grande ospitalitá"

# Translating foreign language comments into English.

To run this part, you’ll need access credentials for the Google Translation API: https://cloud.google.com/translate/docs/reference/rest. 
The translation is performed using the `google.cloud` Python package.

You can find the official documentation for `google.cloud` here:
https://github.com/googleapis/google-cloud-python

Note: I’m not executing the code in this notebook, but you’re welcome to try it on your own. If you have any questions, please reach out to me. [minsu.jang@unimib.it](mailto:minsu.jang@unimib.it)

In [None]:
# Google Cloud credential setting
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "YOUR_CREDENTIAL_FILENAME_HERE.json"
translate_client = translate.Client()

In [None]:
# define a function to translate foreign languages to English
def translate_to_en(text, lang):
    if lang != "en" and lang is not None and isinstance(text, str) and text.strip():
        try:
            result = translate_client.translate(text, source_language=lang, target_language="en")
            return result["translatedText"]
        except Exception:
            return None
    return text

In [None]:
# Toss comments per chunk to avoid API being killed.
chunk_size = 5000
wait_seconds = 10
num_chunks = (len(non_english_reviews) // chunk_size) + 1

for i in range(num_chunks):
    print(f"\nTranslating chunk {i + 1}/{num_chunks}...")
    chunk = non_english_reviews.iloc[i * chunk_size:(i + 1) * chunk_size].copy()
    
    chunk["translated_comments"] = chunk.swifter.apply(
        lambda row: translate_to_en(row["comments_cleaned"], row["language"]), axis=1
    )

    chunk_file = f"translated_chunk_{i + 1}.csv"
    chunk.to_csv(chunk_file, index=False)
    print(f"Chunk {i + 1} saved to {chunk_file}.")

    if i < num_chunks - 1:
        print(f"Waiting {wait_seconds} seconds before next chunk...")
        time.sleep(wait_seconds)

print("\nAll chunks translated and saved.")