In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Setup driver
options = Options()
options.add_argument("--start-maximized")
# options.add_argument("--headless")  # Optional for background run
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 30)

# Step 1: Open Google Maps
driver.get("https://www.google.com/maps")
time.sleep(3)

# Step 2: Search for resturants in Colombo District
search_box = wait.until(EC.presence_of_element_located((By.ID, "searchboxinput")))
search_box.send_keys("Resturants and cafes in Colombo")
driver.find_element(By.ID, "searchbox-searchbutton").click()
time.sleep(8)

# Step 3: Scrollable container
try:
    scrollable_container = wait.until(
        EC.presence_of_element_located((By.XPATH, '//div[@role="feed" and contains(@class, "m6QErb")]'))
    )
except:
    print("⚠️ Scrollable container not found. Trying fallback selector...")
    scrollable_container = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
    )

# resturant name extraction
resturant_names = set()

def extract_names():
    cards = driver.find_elements(By.CSS_SELECTOR, 'div.Nv2PK')
    for card in cards:
        try:
            name = card.find_element(By.CSS_SELECTOR, 'div.qBF1Pd').text.strip()
            if name and name not in resturant_names:
                resturant_names.add(name)
                print(f"➕ {name}")
        except:
            continue

# Step 4: Infinite scroll loop
scroll_attempts = 0
max_scrolls = 50  # Adjust this if needed
last_count = 0

while scroll_attempts < max_scrolls:
    scroll_results_script = """
        arguments[0].scrollTop = arguments[0].scrollHeight;
    """
    driver.execute_script(scroll_results_script, scrollable_container)
    time.sleep(3)

    extract_names()

    if len(resturant_names) > last_count:
        last_count = len(resturant_names)
        scroll_attempts = 0  # reset if new items loaded
    else:
        scroll_attempts += 1  # no new items loaded

    print(f"🔄 Scroll attempt {scroll_attempts} | Total: {len(resturant_names)}")

# Done scrolling
driver.quit()

# Save to CSV
df = pd.DataFrame(sorted(resturant_names), columns=["Resturant Name"])
df.to_csv("colombo_resturants_full_list.csv", index=False, encoding='utf-8-sig')

print(f"\n✅ Final count: {len(resturant_names)} resturants extracted.")


➕ The Gallery Café
➕ Grind
➕ Peppermint Cafe
➕ Café on the 5th
➕ Seed Cafe
➕ Barefoot Garden Cafe
➕ CAFE COLOMBO
➕ Cafe Kumbuk
➕ The Commons Coffee House
➕ Cafe Noir Blanc
🔄 Scroll attempt 0 | Total: 10
➕ Tea Avenue
➕ t-lounge by Dilmah – Chatham Street ️
➕ The Barnesbury
➕ Oldfort Cafe
➕ Central Cafe
🔄 Scroll attempt 0 | Total: 15
➕ Coffee Colombo
➕ Caramel Pumpkin
➕ KIKU Colombo
➕ Twins Cafesl
➕ Kaapi
🔄 Scroll attempt 0 | Total: 20
➕ The Central Perk
➕ Milk & Honey Café
➕ Graze Kitchen
➕ Blooming Breakfast Resto Bar & Cafe
➕ The Escape Café
🔄 Scroll attempt 0 | Total: 25
➕ Cafe Kinross
➕ Cafe 1959 by Raux Brothers
➕ Dolce Italia
➕ Barista Staple Street
➕ Harpo's Colombo Fort Cafe
🔄 Scroll attempt 0 | Total: 30
➕ Spa Ceylon Café
➕ Cuisine Colombo - Gregory's Road
➕ Brown Sugar
➕ Coffee & Company
➕ Mitsis Delicacies
🔄 Scroll attempt 0 | Total: 35
➕ Coco Veranda
➕ Bowl'd - Colombo
➕ La Luna Colombo
➕ Brew 1867 by Dilmah – Flower Road
➕ SunsetBlu
🔄 Scroll attempt 0 | Total: 40
➕ The Trav

In [2]:
# !pip install dateparser

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import dateparser
import pandas as pd
import os
from datetime import datetime

# Load resturant names from CSV
resturants_df = pd.read_csv("colombo_resturants_full_list.csv")
resturant_names = resturants_df["Resturant Name"].tolist()

# Setup driver
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 30)

# Create folder for output
os.makedirs("resturant_reviews_Colombo", exist_ok=True)

# Loop through each resturant
for resturant_name in resturant_names:
    print(f"\n🏨 Processing resturant: {resturant_name}")
    file_path = f"resturant_reviews_Colombo/{resturant_name.replace('/', '_')}.csv"
    
    try:
        # Open Google Maps
        driver.get("https://www.google.com/maps")
        time.sleep(2)

        # Search for the resturant
        search_box = wait.until(EC.presence_of_element_located((By.ID, "searchboxinput")))
        search_box.clear()
        search_box.send_keys(resturant_name)
        driver.find_element(By.ID, "searchbox-searchbutton").click()

        # Wait for results
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "DUwDvf")))
        time.sleep(2)

        # Click the "Reviews" button
        try:
            reviews_tab = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//button[contains(@aria-label,"Reviews")]'))
            )
            reviews_tab.click()
            print("📝 Reviews tab clicked.")
        except:
            print("❌ Reviews tab not available.")
            continue

        # Scroll to load more reviews
        scrollable_div = wait.until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "m6QErb DxyBCb kA9KIf dS8AEf")]'))
        )

        for _ in range(10):
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
            time.sleep(1.5)

        # Extract reviews
        reviews = driver.find_elements(By.XPATH, '//div[contains(@class, "jftiEf")]')

        with open(file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Review", "Relative Date", "Absolute Date", "Site", "Rating"])

            for review in reviews:
                try:
                    review_text = review.find_element(By.XPATH, './/span[@class="wiI7pd"]').text.strip()

                    # Relative date and source
                    try:
                        date_string = review.find_element(By.XPATH, './/span[contains(text(), "ago")]').text.strip()
                    except:
                        date_string = "No date found"

                    if "ago" in date_string:
                        parts = date_string.split("ago")
                        relative_date = parts[0].strip() + " ago"
                        source = parts[1].replace("on", "").strip().capitalize() if len(parts) > 1 else "Unknown"
                    else:
                        relative_date = date_string
                        source = "Unknown"

                    # Absolute date
                    if "ago" in relative_date:
                        parsed_date = dateparser.parse(relative_date)
                        absolute_date = parsed_date.strftime("%Y-%m-%d") if parsed_date else "Unable to parse"
                    else:
                        absolute_date = "N/A"

                    # Rating
                    try:
                        review_rating = review.find_element(By.XPATH, './/span[@class="Y0EAmc"]/span').get_attribute("aria-label").strip()
                    except:
                        review_rating = "No rating"

                    # Save review
                    writer.writerow([review_text, relative_date, absolute_date, source, review_rating])

                except Exception as e:
                    continue

        print(f"✅ Saved reviews to {file_path}")

    except Exception as e:
        print(f"❌ Error processing {resturant_name}: {e}")
        continue

# Close browser
driver.quit()
print("\n🎉 All reviews collected.")



🏨 Processing resturant: A Healing Café - MaRadha Colombo
📝 Reviews tab clicked.
✅ Saved reviews to resturant_reviews_Colombo/A Healing Café - MaRadha Colombo.csv

🏨 Processing resturant: Avartana
📝 Reviews tab clicked.
✅ Saved reviews to resturant_reviews_Colombo/Avartana.csv

🏨 Processing resturant: BOO Cafe
❌ Error processing BOO Cafe: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7AA16EFA5+77893]
	GetHandleVerifier [0x00007FF7AA16F000+77984]
	(No symbol) [0x00007FF7A9F391BA]
	(No symbol) [0x00007FF7A9F8F16D]
	(No symbol) [0x00007FF7A9F8F41C]
	(No symbol) [0x00007FF7A9FE2237]
	(No symbol) [0x00007FF7A9FB716F]
	(No symbol) [0x00007FF7A9FDF07F]
	(No symbol) [0x00007FF7A9FB6F03]
	(No symbol) [0x00007FF7A9F80328]
	(No symbol) [0x00007FF7A9F81093]
	GetHandleVerifier [0x00007FF7AA427B6D+2931725]
	GetHandleVerifier [0x00007FF7AA422132+2908626]
	GetHandleVerifier [0x00007FF7AA4400F3+3031443]
	GetHandleVerifier [0x00007FF7AA1891EA+184970]
	GetHandleVerifier [0x00007FF7AA19086F+215311]
	

In [4]:
import pandas as pd
import os

# Input and output directories
input_folder = "resturant_reviews_Colombo"
output_folder = "resturant_reviews_Colombo_2025"
os.makedirs(output_folder, exist_ok=True)

# Loop through all CSV files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        input_path = os.path.join(input_folder, filename)
        resturant_name = os.path.splitext(filename)[0]  # Remove ".csv"
        
        try:
            # Load the CSV
            df = pd.read_csv(input_path)
            
            # Ensure 'Absolute Date' is string and filter by year 2025
            df["Absolute Date"] = df["Absolute Date"].astype(str)
            df_2025 = df[df["Absolute Date"].str.startswith("2025-")]

            # Save if there are any 2025 reviews
            if not df_2025.empty:
                output_path = os.path.join(output_folder, f"{resturant_name}_2025.csv")
                df_2025.to_csv(output_path, index=False, encoding="utf-8-sig")
                print(f"✅ {resturant_name}: {len(df_2025)} reviews saved.")
            else:
                print(f"⚠️ {resturant_name}: No reviews from 2025.")

        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

print("\n🎯 Done filtering 2025 reviews.")


✅ A Healing Café - MaRadha Colombo: 6 reviews saved.
✅ Agra Colombo: 7 reviews saved.
✅ Avartana: 19 reviews saved.
✅ Baked Colombo: 6 reviews saved.
✅ Barefoot Garden Cafe: 40 reviews saved.
✅ Barista Staple Street: 11 reviews saved.
✅ Bellagio Colombo: 1 reviews saved.
✅ Black Cat Cafe + Stay: 14 reviews saved.
✅ Blooming Breakfast Resto Bar & Cafe: 59 reviews saved.
⚠️ Botanik Rooftop Bistro & Bar: No reviews from 2025.
✅ Bowl'd - Colombo: 18 reviews saved.
✅ Brew 1867 by Dilmah – Flower Road: 7 reviews saved.
✅ BUPATHI VILAS - Malabe: 5 reviews saved.
✅ Cafe 1959 by Raux Brothers: 21 reviews saved.
✅ Cafe 97: 12 reviews saved.
✅ Cafe Kai: 9 reviews saved.
✅ Cafe Kinross: 38 reviews saved.
✅ Cafe Kumbuk: 49 reviews saved.
⚠️ Cafe Noir Blanc: No reviews from 2025.
⚠️ Cafe Seventy Seven: No reviews from 2025.
⚠️ Café Français: No reviews from 2025.
⚠️ Café La Defense: No reviews from 2025.
⚠️ Café on the 5th: No reviews from 2025.
⚠️ CALAMARI COLOMBO: No reviews from 2025.
✅ Capital B

In [5]:
import pandas as pd
import os
import re

# Define keyword patterns related to telecom issues
keywords = [
    "poor wifi", "no wifi", "weak wifi", "wifi didn't work", "bad wifi",
    "signal issue", "no signal", "weak signal", "mobile signal", "internet issue",
    "wifi problem", "internet was slow", "wifi wasn't working", "slow connection",
    "no internet", "unstable wifi", "spotty wifi", "wifi kept dropping","terrible wifi", "horrible wifi", "wifi never worked", "wifi cut off", "wifi goes off", "kept disconnecting",
    "frequent disconnections", "disconnected often", "connection issues", "internet kept dropping", 
    "slow internet", "wifi barely worked", "internet doesn't work", "network issue", "couldn't connect to wifi",
    "difficult to connect", "had trouble connecting", "no network", "internet unavailable", "bad network", 
    "signal strength was low", "poor network coverage", "wifi outage", "no reception", "bad reception",
    "couldn't load anything", "couldn’t get online", "hard to connect", "nothing would load", "pages won't load",
    "pages wouldn't load", "wifi didn’t reach", "wifi didn’t reach room", "no wifi in room", "zero bars", 
    "signal was bad", "laggy internet", "glitchy wifi", "reception was terrible", "connection kept dropping",
    "wifi was down", "no connectivity", "intermittent connection", "limited internet access", "unusable wifi",
    "had no signal", "data didn’t work", "bad cellular coverage", "phone signal was terrible","couldn't stay connected", "couldn’t stay online", "internet kept cutting out", "wifi kept failing",
    "wifi was horrible", "connection was terrible", "wifi didn’t work properly", "wifi broke down",
    "internet sucked", "bad internet service", "no online access", "had no wifi", "wifi didn’t exist",
    "internet connection failed", "wifi went down", "dropped internet", "signal kept dropping", 
    "couldn't stream", "unable to use wifi", "can't connect online", "wifi not working", 
    "connection failed", "wifi trouble", "wifi not available", "wifi inaccessible", 
    "couldn’t browse", "couldn’t use the internet", "wifi was off", "couldn’t get reception", 
    "wifi disconnected", "lost connection", "lost internet", "wifi signal weak", "wifi didn’t load",
    "apps wouldn't open", "net was bad", "no data connection", "cell service was poor", 
    "couldn’t get any bars", "connection cut out", "slow to load", "laggy connection", 
    "wifi flaky", "wifi not reliable", "mobile data was useless", "spotty connection", 
    "zero internet", "internet crashed", "no bandwidth", "pathetic wifi", "crappy wifi", 
    "network failed", "couldn’t even send a message","wifi"
]



# Compile into one regex pattern (case-insensitive)
pattern = re.compile(r"|".join(keywords), re.IGNORECASE)

# Input folder
input_folder = "resturant_reviews_Colombo_2025"
output_path = "colombo_telecom_related_reviews_2025.csv"

# Prepare DataFrame to hold all filtered reviews
all_matching_reviews = []

# Loop through all CSV files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".csv"):
        resturant_name = filename.replace("_2025.csv", "")
        input_path = os.path.join(input_folder, filename)

        try:
            df = pd.read_csv(input_path)

            # Filter rows where review contains any keyword
            matches = df[df["Review"].astype(str).apply(lambda x: bool(pattern.search(x)))]

            if not matches.empty:
                matches["Resturant Name"] = resturant_name
                all_matching_reviews.append(matches[["Resturant Name", "Absolute Date", "Review"]])
                print(f"📌 {resturant_name}: {len(matches)} matching reviews found.")
            else:
                print(f"✅ {resturant_name}: No telecom-related reviews.")

        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

# Combine and save to a single CSV
if all_matching_reviews:
    result_df = pd.concat(all_matching_reviews, ignore_index=True)
    result_df.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"\n📁 Saved {len(result_df)} telecom-related reviews to '{output_path}'")
else:
    print("\n🟢 No telecom-related reviews found in any resturant.")


✅ A Healing Café - MaRadha Colombo: No telecom-related reviews.
✅ Agra Colombo: No telecom-related reviews.
✅ Avartana: No telecom-related reviews.
✅ Baked Colombo: No telecom-related reviews.
✅ Barefoot Garden Cafe: No telecom-related reviews.
✅ Barista Staple Street: No telecom-related reviews.
✅ Bellagio Colombo: No telecom-related reviews.
✅ Black Cat Cafe + Stay: No telecom-related reviews.
✅ Blooming Breakfast Resto Bar & Cafe: No telecom-related reviews.
✅ Bowl'd - Colombo: No telecom-related reviews.
✅ Brew 1867 by Dilmah – Flower Road: No telecom-related reviews.
✅ BUPATHI VILAS - Malabe: No telecom-related reviews.
📌 Cafe 1959 by Raux Brothers: 2 matching reviews found.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches["Resturant Name"] = resturant_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches["Resturant Name"] = resturant_name


✅ Cafe 97: No telecom-related reviews.
✅ Cafe Kai: No telecom-related reviews.
📌 Cafe Kinross: 1 matching reviews found.
✅ Cafe Kumbuk: No telecom-related reviews.
✅ Capital Bar & Grill: No telecom-related reviews.
✅ Ceylon Curry Club: No telecom-related reviews.
✅ Chaiwala Colombo - One Galle Face Mall: No telecom-related reviews.
✅ Cinnamon Grand Colombo: No telecom-related reviews.
✅ Cinnamon Lakeside Colombo: No telecom-related reviews.
✅ DDD Char Siu - Authentic Chinese Malaysian BBQ Grab & Go & Patio Casual Dining BYOB: No telecom-related reviews.
✅ Dolce Italia: No telecom-related reviews.
✅ Food Studio - Colombo City Centre: No telecom-related reviews.
✅ Gardenia Coffeeshop: No telecom-related reviews.
✅ Gentz Residency & Restaurant: No telecom-related reviews.
✅ Graze Kitchen: No telecom-related reviews.
✅ Harbour Court: No telecom-related reviews.
✅ Harpo's Colombo Fort Cafe: No telecom-related reviews.
✅ Hela Bojun Hala Battaramulla: No telecom-related reviews.
✅ Indian Ocea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches["Resturant Name"] = resturant_name


✅ Monsoon Colombo: No telecom-related reviews.
✅ Nara Thai: No telecom-related reviews.
✅ Nihonbashi By Dharshan: No telecom-related reviews.
✅ Oak Ray Flower Drum: No telecom-related reviews.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches["Resturant Name"] = resturant_name


📌 Oldfort Cafe: 1 matching reviews found.
✅ PapaDough: No telecom-related reviews.
✅ Pizza Hut - Borella: No telecom-related reviews.
✅ Poké 65: No telecom-related reviews.
✅ Ports Of Call: No telecom-related reviews.
✅ Rakna Lanka Restaurant: No telecom-related reviews.
✅ Risi Bawana Restaurant රිසි බවන: No telecom-related reviews.
✅ Royal Colombo Golf Club Restaurant: No telecom-related reviews.
✅ Royal Thai - Cinnamon Lakeside: No telecom-related reviews.
✅ SakeColombo By Tsukiji Uoichi Japanese Restaurant: No telecom-related reviews.
✅ Sappers' Leisure Bay: No telecom-related reviews.
✅ Sapphyr Lounge: No telecom-related reviews.
✅ Seconds Hostel -Colombo: No telecom-related reviews.
✅ Shang Palace: No telecom-related reviews.
✅ Shangri-La Colombo: No telecom-related reviews.
✅ Siam House Restaurant: No telecom-related reviews.
✅ The Avenue: No telecom-related reviews.
✅ The Bayleaf: No telecom-related reviews.
✅ The Brick Lane Coffeehouse: No telecom-related reviews.
✅ The Dining 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matches["Resturant Name"] = resturant_name
