<a href="https://colab.research.google.com/github/pantso251/DSFSE_Portfolio/blob/main/google_maps_reviews_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

WHAT DOES THIS CODE DO?

This code extends the Google Maps Places workflow by adding a feature that extracts specific keywords from the first 5-10 Google Reviews of each beach. These keywords are words related to beach pollution and its scale (e.g. clean, dirty, plastic, pollution). Italian comments are translated directly to English, and we also provide some additional keywords in Italian for better accuracy.






In [None]:
import os
import re
import time
from collections import Counter

import pandas as pd
import requests

# ==================== CONFIGURATION ====================
# The API key is read from the environment so the credential is never stored
# in the notebook. Set GOOGLE_API_KEY before running this cell, e.g.
#   os.environ["GOOGLE_API_KEY"] = "<your key>"
# NOTE: any key that was ever committed with this notebook should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
INPUT_FILE = "merged_dataset_v2.csv"  # CSV listing the beaches to look up
OUTPUT_FILE = "beach_google_reviews.csv"  # CSV the results are written to
NUM_BEACHES = 210  # upper bound on the number of beaches processed per run

# ==================== KEYWORD DICTIONARY ====================
# Stems are matched as plain substrings of the lower-cased review text, and
# every matching stem adds one mention to its category. A category must
# therefore not list the same stem twice, nor a stem that is contained in
# another stem of the same category, or a single word is counted twice.
POLLUTION_KEYWORDS = {
    "clean": [
        # English stems
        "clean", "pristine", "spotless", "clear", "crystal",
        "tidy", "maintain", "sparkl",
        # Italian stems
        "pulit",  # pulito/pulita/puliti/pulite
        "cristall",  # cristallino/cristallina
        "limp",  # limpido/limpida
        "trasparent",  # trasparente
        "curat"  # curato/curata
    ],

    "dirty": [
        # English stems ("plastic" also covers Italian plastica/plastiche)
        "dirt", "trash", "garbage", "litter", "pollut",
        "waste", "plastic", "bottl", "rubbish", "debris", "filth", "mess",
        # Italian stems
        "sporc",  # sporco/sporca/sporchi ("spor" would also match "sport")
        "spazzatur",  # spazzatura
        "rifiut",  # rifiuti
        "bottigl",  # bottiglia/bottiglie
        "immondi",  # immondizia
        "inquin"  # inquinato/inquinata/inquinamento
    ],

    "crowded": [
        "crowd",  # crowd/crowded, also matches overcrowded
        "pack", "busy", "full", "too many",
        "affoll",  # affollato/affollata, also matches sovraffollato
        "pien",  # pieno/piena
        "troppa gente"
    ],

    "beautiful": [
        "beautiful", "gorgeous", "stunning", "amazing", "wonderful",
        "bell",  # bello/bella/belli/belle/bellissimo
        "meraviglios",  # meraviglioso/meravigliosa
        "stupend",  # stupendo/stupenda
        "paradis"  # paradise/paradiso
    ]
}

# Words that flip the meaning of a keyword when they appear shortly before it
NEGATIONS = [
    # English
    "not", "no", "never",
    "wasn't", "weren't", "isn't", "aren't",
    # Italian: non (not), senza (without), mai (never)
    "non", "senza", "mai",
]

# ==================== FUNCTIONS ====================

def find_place_id(beach_name, lat, lng, api_key, radius=200, timeout=10):
    """
    Find a Google Place ID near the given coordinates using Nearby Search.

    Args:
        beach_name: Text sent as the ``keyword`` filter of the search.
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        api_key: Google Places API key.
        radius: Search radius in metres around (lat, lng). Defaults to 200.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``place_id`` of the first result, or None when the API reports
        no match, the request fails, or the response is not valid JSON.
    """
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    params = {
        "location": f"{lat},{lng}",
        "radius": str(radius),
        "keyword": beach_name,
        "key": api_key
    }

    try:
        # Without a timeout a stalled connection would block the whole batch.
        response = requests.get(url, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/timeout problems; ValueError: body is not JSON.
        print(f"    Error finding place: {e}")
        return None

    if data.get("status") == "OK" and data.get("results"):
        return data["results"][0].get("place_id")

    print(f"    No place found. Status: {data.get('status')}")
    return None


def get_place_reviews(place_id, api_key, timeout=10):
    """
    Fetch name, rating, rating count and reviews for a place via Place Details.

    Args:
        place_id: Google Place ID to look up.
        api_key: Google Places API key.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys "name", "rating", "total_reviews" and "reviews"
        (a list, empty when the response carries none), or None when the API
        status is not "OK", the request fails, or the response is not valid
        JSON.
    """
    url = "https://maps.googleapis.com/maps/api/place/details/json"

    params = {
        "place_id": place_id,
        "fields": "name,rating,user_ratings_total,reviews",
        "key": api_key,
        "language": "en"  # Request English, but will get mixed languages
    }

    try:
        # Without a timeout a stalled connection would block the whole batch.
        response = requests.get(url, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/timeout problems; ValueError: body is not JSON.
        print(f"    Error getting reviews: {e}")
        return None

    if data.get("status") != "OK":
        print(f"    Error getting details. Status: {data.get('status')}")
        return None

    result = data.get("result", {})
    return {
        "name": result.get("name"),
        "rating": result.get("rating"),
        "total_reviews": result.get("user_ratings_total"),
        "reviews": result.get("reviews", [])
    }


def check_negation_context(sentence, keyword, negations=None, window=3):
    """
    Check whether a keyword appears in a negated context.

    A keyword occurrence counts as negated when one of the ``window`` words
    directly before it is a negation word. Negations are compared against
    whole words (so "no" does not fire on "know" or "snorkeling"), and the
    keyword is located in the sentence text itself, so multi-word keywords
    such as "too many" are handled as well.

    Args:
        sentence: Text to inspect; compared case-insensitively.
        keyword: Keyword or stem to look for; may be part of a longer word.
        negations: Iterable of single negation words. Defaults to the
            module-level NEGATIONS list.
        window: Number of preceding words inspected for a negation.

    Returns:
        True if any occurrence of the keyword is preceded by a negation word
        within the window, otherwise False.
    """
    source = NEGATIONS if negations is None else negations
    negation_set = {word.lower() for word in source}
    text = sentence.lower()
    needle = keyword.lower()
    if not needle or window <= 0:
        return False

    position = text.find(needle)
    while position != -1:
        # When the keyword starts mid-word (e.g. "clean" in "unclean"), drop
        # the leading fragment of that word so it is not read as a context word.
        preceding = re.sub(r"[\w'’]+$", "", text[:position])
        context = re.findall(r"[\w'’]+", preceding)[-window:]
        # Normalise typographic apostrophes so "wasn’t" matches "wasn't".
        if any(tok.replace("’", "'").strip("'") in negation_set for tok in context):
            return True
        position = text.find(needle, position + 1)

    return False


def analyze_keywords_in_reviews(reviews):
    """
    Count keyword mentions per category across reviews, skipping negated ones.

    For every review, each stem in POLLUTION_KEYWORDS that occurs in the
    lower-cased text adds one mention to its category, unless any sentence
    containing the stem uses it in a negated context.

    Returns a dict with:
        "counts": mentions per category (clean, dirty, crowded, beautiful)
        "keywords_found": every stem that was counted, in encounter order
        "languages": the "language" of each review ("unknown" if missing)
        "ratings": the "rating" of each review (0 if missing)
    """
    tally = dict.fromkeys(("clean", "dirty", "crowded", "beautiful"), 0)
    matched_stems = []
    languages = []
    ratings = []

    for entry in reviews:
        body = entry.get("text", "").lower()
        ratings.append(entry.get("rating", 0))
        languages.append(entry.get("language", "unknown"))

        # Negation is judged sentence by sentence
        fragments = re.split(r'[.!?;]', body)

        for group, stems in POLLUTION_KEYWORDS.items():
            for stem in stems:
                if stem not in body:
                    continue

                # One negated sentence is enough to discard this stem for the review
                negated = any(
                    stem in fragment and check_negation_context(fragment, stem)
                    for fragment in fragments
                )
                if negated:
                    continue

                tally[group] += 1
                matched_stems.append(stem)

    return {
        "counts": tally,
        "keywords_found": matched_stems,
        "languages": languages,
        "ratings": ratings
    }


def calculate_scores(place_data, keyword_analysis):
    """
    Derive percentage scores (0-100) from the place rating and keyword counts.

    Returns a dict with:
        "keyword_cleanliness": share of clean mentions among clean + dirty
            mentions, or None when there are no such mentions
        "rating_score": place rating scaled from a 5-point scale, or None
            when the rating is missing or zero
        "review_avg_score": mean review rating scaled from a 5-point scale,
            or None when there are no review ratings
        "composite_score": weighted mean of the scores above that are
            available (weights 0.4 / 0.4 / 0.2), or None when none is
    """
    counts = keyword_analysis["counts"]
    review_ratings = keyword_analysis["ratings"]
    scores = {}

    # 1. Keyword-based cleanliness
    mentions = counts["clean"] + counts["dirty"]
    scores["keyword_cleanliness"] = (
        counts["clean"] / mentions * 100 if mentions > 0 else None
    )

    # 2. Overall Google rating
    scores["rating_score"] = (
        (place_data["rating"] / 5.0) * 100 if place_data.get("rating") else None
    )

    # 3. Average rating of the analysed reviews
    if review_ratings:
        mean_rating = sum(review_ratings) / len(review_ratings)
        scores["review_avg_score"] = (mean_rating / 5.0) * 100
    else:
        scores["review_avg_score"] = None

    # 4. Composite: weights are renormalised over the scores that exist
    weighting = (
        ("keyword_cleanliness", 0.4),
        ("rating_score", 0.4),
        ("review_avg_score", 0.2),
    )
    present = [
        (scores[key], weight)
        for key, weight in weighting
        if scores[key] is not None
    ]

    if not present:
        scores["composite_score"] = None
        return scores

    weight_total = sum(weight for _, weight in present)
    weighted_total = sum(value * weight for value, weight in present)
    scores["composite_score"] = weighted_total / weight_total
    return scores


# ==================== MAIN EXECUTION ====================

def _analyze_beach(index, row):
    """
    Run the lookup -> reviews -> keyword analysis pipeline for one beach.

    Args:
        index: Row label from the input DataFrame; shown (plus one) in the
            progress banner.
        row: Row of the input CSV. Must provide 'API_Beach_Name',
            'Original_Beach_Name', 'lat' and 'lon'; 'rating' and
            'user_ratings_total' are used when present.

    Returns:
        A dict describing the outcome. Its "Status" is one of
        "place_not_found", "reviews_unavailable", "no_reviews" or "success";
        only "success" records carry keyword counts and scores.
    """
    beach_name = row['API_Beach_Name']
    original_name = row['Original_Beach_Name']
    lat = row['lat']
    lng = row['lon']
    google_rating = row.get('rating', None)
    google_review_count = row.get('user_ratings_total', None)

    print(f"\n{'='*70}")
    print(f"Beach #{index + 1}: {beach_name}")
    print(f"Original name: {original_name}")
    print(f"Location: ({lat}, {lng})")
    print(f"{'='*70}")

    # Step 1: Find place_id
    print("  🔍 Searching for place...")
    place_id = find_place_id(beach_name, lat, lng, GOOGLE_API_KEY)

    if not place_id:
        print("  ❌ Place not found. Skipping.")
        return {
            "Beach_Name": beach_name,
            "Original_Name": original_name,
            "Status": "place_not_found",
            "Google_Rating": google_rating,
            "Total_Review_Count": google_review_count
        }

    print(f"  ✅ Found place_id: {place_id}")

    # Step 2: Get reviews
    print("  📖 Fetching reviews...")
    place_data = get_place_reviews(place_id, GOOGLE_API_KEY)

    if not place_data:
        print("  ❌ Could not fetch reviews. Skipping.")
        return {
            "Beach_Name": beach_name,
            "Original_Name": original_name,
            "Place_ID": place_id,
            "Status": "reviews_unavailable",
            "Google_Rating": google_rating,
            "Total_Review_Count": google_review_count
        }

    reviews = place_data.get("reviews", [])
    print(f"  ✅ Retrieved {len(reviews)} reviews")

    if not reviews:
        print("  ⚠️  No reviews available")
        return {
            "Beach_Name": beach_name,
            "Original_Name": original_name,
            "Place_ID": place_id,
            "Status": "no_reviews",
            "Google_Rating": place_data.get("rating"),
            "Total_Review_Count": place_data.get("total_reviews")
        }

    # Step 3: Analyze keywords
    print("  🔬 Analyzing keywords...")
    keyword_analysis = analyze_keywords_in_reviews(reviews)
    counts = keyword_analysis['counts']

    print(f"    Clean mentions: {counts['clean']}")
    print(f"    Dirty mentions: {counts['dirty']}")
    print(f"    Crowded mentions: {counts['crowded']}")
    print(f"    Beautiful mentions: {counts['beautiful']}")

    # Step 4: Calculate scores
    scores = calculate_scores(place_data, keyword_analysis)

    print("  📊 Scores:")
    if scores["keyword_cleanliness"] is not None:
        print(f"    Keyword cleanliness: {scores['keyword_cleanliness']:.1f}%")
    # rating_score is None when the place has no rating; formatting None
    # with ':.1f' would raise and abort the whole run.
    if scores["rating_score"] is not None:
        print(f"    Google rating: {place_data.get('rating')}/5 ({scores['rating_score']:.1f}%)")
    else:
        print("    Google rating: n/a")
    if scores["composite_score"] is not None:
        print(f"    Composite score: {scores['composite_score']:.1f}%")

    # Get top keywords
    keyword_counter = Counter(keyword_analysis['keywords_found'])
    top_keywords = dict(keyword_counter.most_common(5))

    # Determine data quality from how many keyword mentions were counted
    total_keywords = sum(counts.values())
    if total_keywords > 5:
        data_quality = "high"
    elif total_keywords > 0:
        data_quality = "medium"
    else:
        data_quality = "rating_only"

    return {
        "Beach_Name": beach_name,
        "Original_Name": original_name,
        "Place_ID": place_id,
        "Status": "success",

        # Google data
        "Google_Rating": place_data.get("rating"),
        "Total_Review_Count": place_data.get("total_reviews"),
        "Reviews_Analyzed": len(reviews),

        # Keyword counts
        "Clean_Mentions": counts['clean'],
        "Dirty_Mentions": counts['dirty'],
        "Crowded_Mentions": counts['crowded'],
        "Beautiful_Mentions": counts['beautiful'],

        # Scores
        "Keyword_Cleanliness_Score": scores["keyword_cleanliness"],
        "Rating_Score": scores["rating_score"],
        "Composite_Score": scores["composite_score"],

        # Metadata
        "Data_Quality": data_quality,
        "Top_Keywords": str(top_keywords),
        "Languages": ", ".join(set(keyword_analysis['languages'])),

        # Location
        "Latitude": lat,
        "Longitude": lng
    }


def main():
    """
    Analyse Google reviews for the beaches listed in INPUT_FILE.

    Loads the CSV, keeps rows that have both beach-name columns, processes
    the first NUM_BEACHES of them, writes one record per beach to
    OUTPUT_FILE and prints a summary.

    Returns:
        pandas.DataFrame with one row per processed beach.

    Raises:
        RuntimeError: If GOOGLE_API_KEY is empty.
    """
    print("="*70)
    print("BEACH REVIEW ANALYSIS - GOOGLE PLACES API")
    print("="*70)

    # Fail fast instead of sending one doomed request per beach.
    if not GOOGLE_API_KEY:
        raise RuntimeError("GOOGLE_API_KEY is not set; cannot query the Google Places API.")

    # Load CSV
    print(f"\nLoading beaches from {INPUT_FILE}...")
    df = pd.read_csv(INPUT_FILE)

    # Print columns to debug
    print("\nDataFrame columns:")
    print(df.columns)

    # Filter valid beaches
    valid_beaches = df[df['API_Beach_Name'].notna() & df['Original_Beach_Name'].notna()]
    print(f"Found {len(valid_beaches)} valid beaches in CSV")

    # Select first N beaches
    subset = valid_beaches.head(NUM_BEACHES).copy()
    print(f"Processing first {len(subset)} beaches...\n")

    results = []

    for index, row in subset.iterrows():
        results.append(_analyze_beach(index, row))

        # Rate limiting: applied after every beach, including the ones that
        # failed, because those issued API requests as well.
        time.sleep(1)

    # Save results
    print(f"\n{'='*70}")
    print("SAVING RESULTS")
    print(f"{'='*70}")

    results_df = pd.DataFrame(results)
    results_df.to_csv(OUTPUT_FILE, index=False)

    print(f"✅ Saved {len(results_df)} results to '{OUTPUT_FILE}'")

    # Summary statistics
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")

    # With no processed beaches the frame has no columns at all, so the
    # 'Status' lookup must be skipped.
    if results_df.empty:
        successful = results_df
    else:
        successful = results_df[results_df['Status'] == 'success']
    print(f"Successful analyses: {len(successful)}/{len(results_df)}")

    if len(successful) > 0:
        print("\nData quality distribution:")
        print(successful['Data_Quality'].value_counts())

        print("\nAverage scores:")
        print(f"  Google rating: {successful['Google_Rating'].mean():.2f}/5")

        if successful['Composite_Score'].notna().any():
            print(f"  Composite score: {successful['Composite_Score'].mean():.1f}%")

        print("\nTop beaches by composite score:")
        top_beaches = successful.nlargest(5, 'Composite_Score')[
            ['Beach_Name', 'Composite_Score', 'Clean_Mentions', 'Dirty_Mentions']
        ]
        print(top_beaches.to_string(index=False))

    print(f"\n{'='*70}")
    print("ANALYSIS COMPLETE!")
    print(f"{'='*70}\n")

    return results_df


# Run the analysis only when this code is executed directly (not on import);
# the returned DataFrame is kept in `results_df` at module level.
if __name__ == "__main__":
    results_df = main()


BEACH REVIEW ANALYSIS - GOOGLE PLACES API

Loading beaches from merged_dataset_v2.csv...

DataFrame columns:
Index(['Unnamed: 0', 'Beach_Code', 'Original_Beach_Name', 'API_Beach_Name',
       'rating', 'user_ratings_total', 'lat', 'lon', 'Date of collection',
       'Period', 'length', 'Trash amount', 'Beach was cleaned',
       'The last time that beach has been cleaned',
       'The last time that was cleaned from another group',
       'further from the reference point above 100 meters ',
       'Trovato un animale abbandonato o morto',
       'Numero di animali abbandonati o morti'],
      dtype='object')
Found 189 valid beaches in CSV
Processing first 189 beaches...


Beach #1: Spiaggia Aciddara
Original name: Aciddara
Location: (38.08036623, 13.53734493)
  🔍 Searching for place...
  ✅ Found place_id: ChIJsUOqHnziGRMRDGtnm_hY-70
  📖 Fetching reviews...
  ✅ Retrieved 5 reviews
  🔬 Analyzing keywords...
    Clean mentions: 3
    Dirty mentions: 4
    Crowded mentions: 2

In [None]:
# API key froze, just checking if it still works.
# Sends one Nearby Search request around central Rome and prints the raw
# JSON answer; the "status" field of the response shows the outcome.
import requests


url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
    "location": "41.9028,12.4964",  # Rome
    "radius": "500",
    "keyword": "colosseum",
    "key": GOOGLE_API_KEY
}

# A timeout keeps the cell from hanging if the API never answers.
response = requests.get(url, params=params, timeout=10)
print(response.json())


{'html_attributions': [], 'results': [], 'status': 'ZERO_RESULTS'}
