<a href="https://colab.research.google.com/github/pantso251/DSFSE_Portfolio/blob/main/trip_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TripAdvisor Matching Test with SerpAPI

This notebook takes the first 5 beaches from `Beaches_Matched_1000m.csv` and tests how well their Google Maps names match TripAdvisor places when searched through SerpAPI.


In [None]:
#!pip install serpapi google-search-results pandas openpyxl

In [None]:
import os
import time

import pandas as pd
import requests

# Configuration
# The SerpApi key is read from the environment so the secret never lives in
# the notebook or its git history. Set it before starting the kernel, e.g.
#   export SERPAPI_API_KEY="..."
# Any key that was ever committed in this cell must be treated as leaked and
# revoked in the SerpApi dashboard.
API_KEY = os.environ.get("SERPAPI_API_KEY", "")
if not API_KEY:
    print("WARNING: SERPAPI_API_KEY is not set; SerpApi calls will fail.")
INPUT_FILE = "Beaches_Matched_1000m.csv"  # Use your existing CSV
OUTPUT_FILE = "beach_reviews_test.csv"


In [None]:
# Match the first five beaches of the input CSV against TripAdvisor and
# store one summary row per beach.
df = pd.read_csv(INPUT_FILE)
subset = df.head(5).copy()

print(f"Loaded {len(subset)} beaches\n")

results = []

for index, row in subset.iterrows():
    original_name, google_name = row['Original_Beach_Name'], row['API_Beach_Name']
    vicinity = row.get('vicinity', '')

    print(f"\n=== Beach #{index + 1}: {original_name} ===")

    # The lookup returns None when nothing was matched; an empty dict stands
    # in for it so every TripAdvisor column below simply stays None.
    match = get_tripadvisor_match(google_name, vicinity, API_KEY)
    found = match or {}

    results.append({
        "Original_Beach_Name": original_name,
        "Google_Maps_Name": google_name,
        "Vicinity": vicinity,
        "Tripadvisor_Name": found.get("tripadvisor_name"),
        "Place_ID": found.get("place_id"),
        "Rating": found.get("rating"),
        "Review_Count": found.get("review_count"),
        "Tripadvisor_URL": found.get("link")
    })

    # Pause between requests, then close this beach's log section
    time.sleep(1)
    print("-" * 60)

# Persist the collected rows and display them as the cell output
results_df = pd.DataFrame(results)
results_df.to_csv("tripadvisor_matches.csv", index=False)
print(f"\nSaved {len(results)} beach matches to 'tripadvisor_matches.csv'")
results_df


Loaded 5 beaches


=== Beach #1: Aciddara ===
   Query: Spiaggia Aciddara Spiaggia Aciddara, Santa Flavia Italy
   Found 30 results
   -> Match: 'Spiaggia Kafara'
      Rating: 3.4 | Reviews: 14
------------------------------------------------------------

=== Beach #2: Bagni della Regina Giovanna - Sorrento (Na) ===
   Query: Capo di Sorrento Sorrento Italy
   Found 30 results
   -> Match: 'Punta del Capo'
      Rating: 4.6 | Reviews: 183
------------------------------------------------------------

=== Beach #3: baia delle favole ===
   Query: Baia delle Favole Sestri Levante Italy
   Found 30 results
   -> Match: 'Bay of Silence'
      Rating: 4.5 | Reviews: 2883
------------------------------------------------------------

=== Beach #4: Boschetto Steccato di Cutro  ===
   Query: Spiaggia Libera Steccato di Cutro Steccato di Cutro Italy
   Found 30 results
   -> Match: 'Spiaggia Libera Darsena'
      Rating: 3 | Reviews: 74
-----------------------------------------------------------

Unnamed: 0,Original_Beach_Name,Google_Maps_Name,Vicinity,Tripadvisor_Name,Place_ID,Rating,Review_Count,Tripadvisor_URL
0,Aciddara,Spiaggia Aciddara,"Spiaggia Aciddara, Santa Flavia",Spiaggia Kafara,12987654,3.4,14,https://www.tripadvisor.com/Attraction_Review-...
1,Bagni della Regina Giovanna - Sorrento (Na),Capo di Sorrento,Sorrento,Punta del Capo,669863,4.6,183,https://www.tripadvisor.com/Attraction_Review-...
2,baia delle favole,Baia delle Favole,Sestri Levante,Bay of Silence,2537841,4.5,2883,https://www.tripadvisor.com/Attraction_Review-...
3,Boschetto Steccato di Cutro,Spiaggia Libera Steccato di Cutro,Steccato di Cutro,Spiaggia Libera Darsena,4745571,3.0,74,https://www.tripadvisor.com/Attraction_Review-...
4,Cala Santa Maria,Torre Cala Santa Maria,"Via Torre S Maria, 7, Ustica",Torre di Santa Maria Navarrese,10463529,3.9,85,https://www.tripadvisor.com/Attraction_Review-...


In [None]:
# Reverse geocoding (lat/lon -> place name) with the SerpApi google_maps engine.
# Safe to skip: the helper below is disabled by wrapping it in a string
# literal, so running this cell defines nothing. Because that string is the
# cell's last expression, the notebook echoes its repr as the cell output.
"""
def get_google_place_name(lat, lon, api_key):
    ""
    Uses Google Maps (via SerpApi) to reverse-geocode coordinates into a place name.
    ""
    url = "https://serpapi.com/search"
    params = {
        "engine": "google_maps",
        "q": f"{lat},{lon}",
        "type": "search",
        "api_key": api_key,
        "hl": "en"  # Only ONE hl parameter
    }

    try:
        response = requests.get(url, params=params)
        data = response.json()

        if "place_results" in data:
            return data["place_results"].get("title")
        elif "local_results" in data and len(data["local_results"]) > 0:
            return data["local_results"][0].get("title")

    except Exception as e:
        print(f"Error connecting to Google Maps: {e}")

    return None

    """


'\ndef get_google_place_name(lat, lon, api_key):\n    ""\n    Uses Google Maps (via SerpApi) to reverse-geocode coordinates into a place name.\n    ""\n    url = "https://serpapi.com/search"\n    params = {\n        "engine": "google_maps",\n        "q": f"{lat},{lon}",\n        "type": "search",\n        "api_key": api_key,\n        "hl": "en"  # Only ONE hl parameter\n    }\n\n    try:\n        response = requests.get(url, params=params)\n        data = response.json()\n\n        if "place_results" in data:\n            return data["place_results"].get("title")\n        elif "local_results" in data and len(data["local_results"]) > 0:\n            return data["local_results"][0].get("title")\n\n    except Exception as e:\n        print(f"Error connecting to Google Maps: {e}")\n\n    return None\n\n    '

In [None]:
def get_tripadvisor_match(place_name, vicinity, api_key):
    """
    Find the top TripAdvisor search hit for a place and return its metadata.

    Queries the SerpApi ``tripadvisor`` engine with
    "<place_name> <vicinity> Italy" and takes the first entry of the
    response's ``places`` list as the match.
    Does NOT fetch individual reviews (requires different API plan).

    Args:
        place_name: Name to search for. Anything that is not a non-blank
            string (e.g. the float NaN pandas yields for an empty CSV cell)
            is rejected without calling the API.
        vicinity: Optional locality appended to the query; ignored unless it
            is a non-blank string.
        api_key: SerpApi key sent with the request.

    Returns:
        dict with keys ``tripadvisor_name``, ``place_id``, ``rating``,
        ``review_count``, ``location`` and ``link``; or ``None`` when the
        name is unusable, the request fails, or no places come back.
    """
    # Missing names arrive as float NaN; joining one into the query raised
    # TypeError outside the try block and aborted the caller's whole loop.
    if not isinstance(place_name, str) or not place_name.strip():
        print("   -> Skipped: missing place name")
        return None

    url = "https://serpapi.com/search"

    # Build query
    parts = [place_name.strip()]
    if isinstance(vicinity, str) and vicinity.strip():
        parts.append(vicinity.strip())
    parts.append("Italy")
    q = " ".join(parts)

    params = {
        "engine": "tripadvisor",
        "q": q,
        "api_key": api_key,
        "ssrc": "A",
        # NOTE(review): recorded runs still report 30 results, so "num" looks
        # ignored by this engine — confirm the correct page-size parameter.
        "num": "5"
    }

    print(f"   Query: {q}")

    try:
        # Without a timeout a stalled connection would hang the notebook.
        response = requests.get(url, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or a non-JSON body: report it and give up on this
        # place so the caller can carry on with the next one.
        print(f"   Error: {e}")
        return None

    if not isinstance(data, dict):
        print("   Error: unexpected response format")
        return None

    # Surface API-level problems (bad key, quota, ...) instead of letting
    # them look like an ordinary "no match".
    if data.get("error"):
        print(f"   API message: {data['error']}")

    results = data.get("places") or []

    print(f"   Found {len(results)} results")

    if not results:
        print("   -> No match found")
        return None

    # The first hit is accepted as the match; no name-similarity check is done.
    top = results[0]
    match_data = {
        "tripadvisor_name": top.get("title"),
        "place_id": top.get("place_id"),
        "rating": top.get("rating"),
        "review_count": top.get("reviews"),
        "location": top.get("location"),
        "link": top.get("link")
    }

    print(f"   -> Match: '{match_data['tripadvisor_name']}'")
    print(f"      Rating: {match_data['rating']} | Reviews: {match_data['review_count']}")

    return match_data


In [None]:
# # Load the CSV that already has Google Maps names
# df = pd.read_csv(INPUT_FILE)
# print(f"Loaded {len(df)} rows from {INPUT_FILE}")

# # Test with first 3 beaches
# subset = df.head(3).copy()
# print(f"Starting process for the first {len(subset)} beaches...\n")

# all_reviews = []

# for index, row in subset.iterrows():
#     original_name = row['Original_Beach_Name']
#     google_name = row['API_Beach_Name']  # Use the name from your CSV
#     vicinity = row.get('vicinity', '')

#     print(f"Processing #{index + 1}: {original_name}")
#     print(f"   Google Maps Name: {google_name}")

#     # Get reviews from TripAdvisor using the Google Name
#     reviews = get_tripadvisor_reviews(google_name, vicinity, API_KEY)

#     if reviews:
#         print(f"   -> Collected {len(reviews)} reviews.")

#         for r in reviews:
#             all_reviews.append({
#                 "Original Beach Name": original_name,
#                 "Google Maps Name": google_name,
#                 "Rating": r.get("rating"),
#                 "Title": r.get("title"),
#                 "Text": r.get("text"),
#                 "Date": r.get("published_date")
#             })
#     else:
#         print("   -> No reviews found on TripAdvisor.")

#     time.sleep(1)
#     print("-" * 40)

# # Save results
# if all_reviews:
#     result_df = pd.DataFrame(all_reviews)
#     result_df.to_csv(OUTPUT_FILE, index=False)
#     print(f"\nSuccess! Saved {len(all_reviews)} reviews to '{OUTPUT_FILE}'.")
# else:
#     print("\nNo reviews collected.")


Loaded 122 rows from Beaches_Matched_1000m.csv
Starting process for the first 3 beaches...

Processing #1: Aciddara
   Google Maps Name: Spiaggia Aciddara
   -> No reviews found on TripAdvisor.
----------------------------------------
Processing #2: Bagni della Regina Giovanna - Sorrento (Na)
   Google Maps Name: Capo di Sorrento
   -> No reviews found on TripAdvisor.
----------------------------------------
Processing #3: baia delle favole
   Google Maps Name: Baia delle Favole
   -> No reviews found on TripAdvisor.
----------------------------------------

No reviews collected.


In [None]:
# Smoke test: check that the SerpApi TripAdvisor engine returns data for a
# simple, well-known query before running the batch cells.
test_url = "https://serpapi.com/search"
test_params = {
    "engine": "tripadvisor",
    "q": "Positano beach Italy",
    "api_key": API_KEY
}
# A timeout keeps the cell from hanging forever if the service stalls.
resp = requests.get(test_url, params=test_params, timeout=30)
print(resp.json())


NameError: name 'API_KEY' is not defined

In [None]:
# print("TEST 1: Print works")

# # Test API call
# import requests
# url = "https://serpapi.com/search"
# params = {
#     "engine": "tripadvisor",
#     "q": "Positano Italy",
#     "api_key": API_KEY
# }

# print("TEST 2: Making API call...")
# response = requests.get(url, params=params)
# print(f"TEST 3: Status code = {response.status_code}")

# data = response.json()
# print(f"TEST 4: Response keys = {list(data.keys())}")

# # Print first 500 characters of response
# import json
# print(f"TEST 5: Response preview:")
# print(json.dumps(data, indent=2)[:500])


TEST 1: Print works
TEST 2: Making API call...
TEST 3: Status code = 200
TEST 4: Response keys = ['search_metadata', 'search_parameters', 'places', 'serpapi_pagination']
TEST 5: Response preview:
{
  "search_metadata": {
    "id": "69668f32919dfa8c122f9bf2",
    "status": "Success",
    "json_endpoint": "https://serpapi.com/searches/06b63ff494060878/69668f32919dfa8c122f9bf2.json",
    "created_at": "2026-01-13 18:30:10 UTC",
    "processed_at": "2026-01-13 18:30:10 UTC",
    "tripadvisor_url": "https://www.tripadvisor.com/Search?q=Positano+Italy&ssrc=a&geo=1&offset=0&limit=30",
    "raw_html_file": "https://serpapi.com/searches/06b63ff494060878/69668f32919dfa8c122f9bf2.html",
    "pretti


In [None]:
# Smoke test of the review lookup for one beach: "Spiaggia Aciddara".
# NOTE(review): get_tripadvisor_reviews is not defined in any cell visible in
# this notebook — confirm it is available before running this cell.

# Inputs copied by hand from the CSV
test_name = "Spiaggia Aciddara"
test_vicinity = "Spiaggia Aciddara, Santa Flavia"

print("=== SINGLE BEACH TEST ===")
print(f"Calling function with: {test_name}")

reviews = get_tripadvisor_reviews(test_name, test_vicinity, API_KEY)
print(f"Function returned: {len(reviews)} reviews")


=== SINGLE BEACH TEST ===
Calling function with: Spiaggia Aciddara
   Query: Spiaggia Aciddara Spiaggia Aciddara, Santa Flavia Italy
   Found 30 results
   -> TripAdvisor Match: 'Spiaggia Kafara' (ID: None)
Function returned: 0 reviews


In [None]:
# Test run for a well-known beach: "Spiaggia Grande" in Positano.
# NOTE(review): get_tripadvisor_reviews is not defined in any cell visible in
# this notebook — confirm it is available before running this cell.
print("=== KNOWN BEACH TEST: Positano ===")

reviews = get_tripadvisor_reviews("Spiaggia Grande", "Positano", API_KEY)

print(f"Function returned: {len(reviews)} reviews")


=== KNOWN BEACH TEST: Positano ===
   Query: Spiaggia Grande Positano Italy
   Found 30 results
   -> TripAdvisor Match: 'Spiaggia di Positano Marina Grande' (ID: None)
Function returned: 0 reviews


In [None]:
# Test run with the first 10 beaches, taken in CSV row order.
df = pd.read_csv(INPUT_FILE)
test_subset = df.head(10).copy()

print(f"Testing {len(test_subset)} beaches...\n")

results = []
success_count = 0
fail_count = 0

for index, row in test_subset.iterrows():
    original_name = row['Original_Beach_Name']
    google_name = row['API_Beach_Name']
    vicinity = row.get('vicinity', '')

    print(f"\n=== Beach #{index + 1}: {original_name} ===")

    # Rows without a Google Maps name hold NaN (a float). Passing that to the
    # matcher raised "TypeError: sequence item 0: expected str instance,
    # float found" and aborted the run, so such rows are counted as failures
    # without spending an API call on them.
    has_name = isinstance(google_name, str) and bool(google_name.strip())
    if has_name:
        match = get_tripadvisor_match(google_name, vicinity, API_KEY)
    else:
        print("   Skipped: missing Google Maps name")
        match = None

    # A match only counts as a success when it carries a TripAdvisor place_id
    found = match or {}
    if found.get("place_id"):
        success_count += 1
        print(f"✅ SUCCESS")
    else:
        fail_count += 1
        print(f"NO MATCH FOUND")

    results.append({
        "Original_Beach_Name": original_name,
        "Google_Maps_Name": google_name,
        "Tripadvisor_Name": found.get("tripadvisor_name"),
        "Place_ID": found.get("place_id"),
        "Rating": found.get("rating"),
        "Review_Count": found.get("review_count"),
    })

    # Only pause after a real request was made
    if has_name:
        time.sleep(1)
    print("-" * 60)

# Calculate success rate (guarded so an empty CSV cannot divide by zero)
total = len(test_subset)
success_rate = (success_count / total) * 100 if total else 0.0

print(f"\n{'='*60}")
print(f"SUCCESS RATE: {success_count}/{total} = {success_rate:.1f}%")
print(f"FAILED: {fail_count}/{total}")
print(f"{'='*60}")

# Save test results
results_df = pd.DataFrame(results)
results_df.to_csv("tripadvisor_test_10beaches.csv", index=False)
print(f"\nTest results saved to 'tripadvisor_test_10beaches.csv'")
results_df


Testing 10 beaches...


=== Beach #1: Aciddara ===
   Query: Spiaggia Aciddara Spiaggia Aciddara, Santa Flavia Italy
   Found 30 results
   -> Match: 'Spiaggia Kafara'
      Rating: 3.4 | Reviews: 14
✅ SUCCESS
------------------------------------------------------------

=== Beach #2: Bagni della Regina Giovanna - Sorrento (Na) ===
   Query: Capo di Sorrento Sorrento Italy
   Found 30 results
   -> Match: 'Punta del Capo'
      Rating: 4.6 | Reviews: 183
✅ SUCCESS
------------------------------------------------------------

=== Beach #3: baia delle favole ===
   Query: Baia delle Favole Sestri Levante Italy
   Found 30 results
   -> Match: 'Bay of Silence'
      Rating: 4.5 | Reviews: 2883
✅ SUCCESS
------------------------------------------------------------

=== Beach #4: Boschetto Steccato di Cutro  ===
   Query: Spiaggia Libera Steccato di Cutro Steccato di Cutro Italy
   Found 30 results
   -> Match: 'Spiaggia Libera Darsena'
      Rating: 3 | Reviews: 74
✅ SUCCESS
--------------

TypeError: sequence item 0: expected str instance, float found

In [None]:
# Data-quality report for the beaches CSV: total row count, missing values
# per key column, how many rows have both names (and can be processed), and
# a listing of the rows that lack either name.

import pandas as pd

df = pd.read_csv("Beaches_Matched_1000m.csv")

print(f"Total beaches in CSV: {len(df)}")
print(f"\nMissing data check:")
print(f"  Missing API_Beach_Name: {df['API_Beach_Name'].isna().sum()}")
print(f"  Missing Original_Beach_Name: {df['Original_Beach_Name'].isna().sum()}")
print(f"  Missing vicinity: {df['vicinity'].isna().sum()}")

# A row is usable only when both the Google Maps name and the original name exist
has_both_names = df['API_Beach_Name'].notna() & df['Original_Beach_Name'].notna()
valid_beaches = df[has_both_names]
invalid_total = len(df) - len(valid_beaches)

print(f"\n✅ Valid beaches to process: {len(valid_beaches)}")
print(f"❌ Invalid/missing beaches: {invalid_total}")

# List the unusable rows, i.e. those missing at least one of the two names
if invalid_total > 0:
    print("\nRows with missing data:")
    missing = df[~has_both_names]
    print(missing[['Original_Beach_Name', 'API_Beach_Name']].to_string())


Total beaches in CSV: 122

Missing data check:
  Missing API_Beach_Name: 31
  Missing Original_Beach_Name: 31
  Missing vicinity: 37

✅ Valid beaches to process: 91
❌ Invalid/missing beaches: 31

Rows with missing data:
    Original_Beach_Name API_Beach_Name
8                   NaN            NaN
9                   NaN            NaN
10                  NaN            NaN
14                  NaN            NaN
17                  NaN            NaN
19                  NaN            NaN
21                  NaN            NaN
23                  NaN            NaN
27                  NaN            NaN
33                  NaN            NaN
35                  NaN            NaN
38                  NaN            NaN
40                  NaN            NaN
42                  NaN            NaN
45                  NaN            NaN
52                  NaN            NaN
53                  NaN            NaN
55                  NaN            NaN
63                  NaN            NaN
