## Imports

In [5]:
# NOTE(review): `!` commands run in a separate shell process, so this presumably does not
# change the environment the kernel imports from (the import cell below fails with
# ModuleNotFoundError) — confirm the kernel itself is started from proj_env.
!source proj_env/bin/activate

In [6]:
import logging
import os
import sys
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import googlemaps
from google.oauth2 import service_account
from googleapiclient.discovery import build
import numpy as np

ModuleNotFoundError: No module named 'selenium'

## Set up Logging

In [4]:
# Logging setup: every INFO-or-higher record goes to a log file and to the cell output.
log_format = '%(asctime)s - %(levelname)s - %(message)s'

# mode='w' starts the log file afresh each time the handler is created.
file_handler = logging.FileHandler('scrape_google_maps_reviews.log', mode='w')
# stdout handler so messages appear live underneath the running cell.
console_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    level=logging.INFO,
    format=log_format,
    handlers=[file_handler, console_handler],
)

## Read Addresses from Google Sheet

In [122]:
# Load credentials from the JSON key file.
# The key location can be overridden with the GSHEETS_SERVICE_ACCOUNT_FILE environment
# variable so the notebook is not tied to one machine's directory layout; the original
# absolute path stays as the default, so existing runs behave exactly as before.
SERVICE_ACCOUNT_FILE = os.environ.get(
    'GSHEETS_SERVICE_ACCOUNT_FILE',
    '/Users/madhumithakumar/Documents/bgorg_clients/google_sheets_service_account_key.json',
)
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES
)

# Google Sheets ID and Range
SPREADSHEET_ID = '11-bAMl8nkpcQK1uxoRZ2pCZ1AKm70Wuu56McK0Uk5PU'
RANGE_NAME = 'Naturals!A1:G200'  # Adjust as needed

# Load Google Sheets data into a DataFrame
def fetch_data_from_gsheet():
    """
    Read RANGE_NAME of spreadsheet SPREADSHEET_ID into a DataFrame.

    The first returned row is used as the column header; the remaining rows
    become the data. Rows that are shorter than the header are padded with
    None so that a sheet whose right-most columns are blank still loads.

    Returns:
        pd.DataFrame: The sheet contents (an empty DataFrame when the range
            holds no values).
        None: If the request or the conversion fails; the error is logged.
    """
    try:
        # Build the Google Sheets API service
        service = build('sheets', 'v4', credentials=credentials)
        sheet = service.spreadsheets()

        # Fetch data from the specified range
        result = sheet.values().get(spreadsheetId=SPREADSHEET_ID, range=RANGE_NAME).execute()
        values = result.get('values', [])

        if not values:
            return pd.DataFrame()

        header, rows = values[0], values[1:]
        width = len(header)
        # The Sheets API leaves trailing empty cells out of each row. If no row
        # reaches the header width, the DataFrame constructor raises, so pad
        # every short row explicitly instead of relying on pandas to do it.
        padded_rows = [row + [None] * (width - len(row)) for row in rows]
        return pd.DataFrame(padded_rows, columns=header)

    except Exception as e:
        # Best-effort by design: callers receive None and must check for it.
        logging.error(f"Error fetching data: {e}")
        return None

In [123]:
df = fetch_data_from_gsheet()

2024-12-19 10:32:58,294 - INFO - file_cache is only supported with oauth2client<4.0.0


In [38]:
#df.set_index("BGorgeous_Client_Name", inplace=True)

In [39]:
#business_name, address = df.loc["Naturals_Nookampalayam", ["Business Name", "Address"]]

In [40]:
#business_name

'Naturals Signature Salon'

In [41]:
#address

'1, Sristi Arcade, Sristi First Avenue, Cheran Nagar, Perumbakkam, Chennai, Tamil Nadu 600100'

## Places API

In [16]:
# Read the Google Maps Platform key from the environment rather than committing it to
# the notebook. Set GOOGLE_MAPS_API_KEY before starting the kernel; api_key is None
# when the variable is unset.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")

In [None]:
def scrape_google_maps_reviews(api_key, business_name, city_name, max_locations):
    """
    Collect location metadata and reviews from the Google Places API and save them to disk.

    Runs a Places text search for "<business_name>, <city_name>", following
    `next_page_token` pagination until `max_locations` places have been processed
    or no further page is offered. For every place, a Place Details request is
    made for its `reviews` field.

    Files written (relative to the working directory; folders are created if missing):
        data/location_data/naturals_<city_name>_data.csv    - one row per location
        data/review_data/naturals_<city_name>_reviews.csv   - one row per review
        data/location_data/maps_urls/<city_name>.txt        - one Google Maps URL per line

    If the search fails part-way, whatever was collected up to that point is
    still saved. If no location was collected, nothing is written.

    Parameters:
        api_key (str): Google API key.
        business_name (str): Business name to query.
        city_name (str): City name for the search (also used in the output file names).
        max_locations (int): Total number of locations to extract.

    Returns:
        None: Saves the data to CSV files and URLs to a text file.
    """
    # Imported here because the notebook's import cell does not provide `datetime`.
    from datetime import datetime, timezone

    logging.info("Starting the scrape_google_maps_reviews function")
    gmaps = googlemaps.Client(key=api_key)
    location_data = []
    all_reviews = []

    # The search text is identical for every page, so build it once.
    query = f"{business_name}, {city_name}"

    try:
        processed_locations = 0
        next_page_token = None

        while processed_locations < max_locations:
            # Fetch places using the API
            if next_page_token:
                response = gmaps.places(query=query, page_token=next_page_token)
            else:
                response = gmaps.places(query=query)

            # Handle API errors
            if response.get("status") != "OK":
                logging.error(f"Places API failed: {response.get('status')}")
                break

            # Process results
            for place in response.get("results", []):
                if processed_locations >= max_locations:
                    break

                place_id = place["place_id"]
                name = place.get("name", "N/A")
                address = place.get("formatted_address", "N/A")
                location_id = f"LOC_{processed_locations + 1}"

                location_data.append({
                    "Location ID": location_id,
                    "City Area": city_name,
                    "Name": name,
                    "Address": address,
                    "Rating": place.get("rating", "N/A"),
                    "Total Reviews": place.get("user_ratings_total", 0),
                    "Place ID": place_id,
                    "Google Maps URL": f"https://www.google.com/maps/place/?q=place_id:{place_id}"
                })

                # Fetch reviews for the current location; a failure here only
                # costs this location's reviews, not the whole run.
                try:
                    place_details = gmaps.place(place_id=place_id, fields=["reviews"])
                    reviews = place_details.get("result", {}).get("reviews", [])

                    for review in reviews:
                        review_timestamp = review.get("time")
                        # `is not None` so that a timestamp of 0 is still formatted.
                        if review_timestamp is not None:
                            review_date_actual = datetime.fromtimestamp(
                                review_timestamp, tz=timezone.utc
                            ).strftime('%Y-%m-%d %H:%M:%S')
                        else:
                            review_date_actual = "N/A"

                        all_reviews.append({
                            "Location ID": location_id,
                            "City Area": city_name,
                            "Business Name": name,
                            "Address": address,
                            "Reviewer Name": review.get("author_name", "N/A"),
                            "Review": review.get("text", "No text"),
                            "Rating": review.get("rating", "N/A"),
                            "Review Date": review_date_actual,
                            "Place ID": place_id
                        })

                except Exception as e:
                    logging.warning(f"Failed to fetch reviews for {name}: {e}")

                processed_locations += 1

            # Check for pagination token
            next_page_token = response.get("next_page_token")
            if not next_page_token:
                logging.info("No more pages available.")
                break

            logging.info("Waiting for next page token to activate...")
            time.sleep(3)  # Wait for the token to become active

    except Exception as e:
        # Keep going to the save step so already-collected locations are not lost.
        logging.error(f"An error occurred: {e}")

    if not location_data:
        logging.warning("No locations were collected; nothing was written.")
        return

    locations_csv = f"data/location_data/naturals_{city_name}_data.csv"
    reviews_csv = f"data/review_data/naturals_{city_name}_reviews.csv"
    urls_txt = f"data/location_data/maps_urls/{city_name}.txt"

    try:
        # Create the output folders up front so a fresh checkout does not fail on write.
        for target in (locations_csv, reviews_csv, urls_txt):
            os.makedirs(os.path.dirname(target), exist_ok=True)

        # Save location data to CSV
        pd.DataFrame(location_data).to_csv(locations_csv, index=False)
        logging.info(f"Successfully saved {len(location_data)} locations to '{locations_csv}'.")

        # Save all reviews to a single CSV
        if all_reviews:
            pd.DataFrame(all_reviews).to_csv(reviews_csv, index=False)
            logging.info(f"Successfully saved all reviews to '{reviews_csv}'.")
        else:
            logging.warning("No reviews were found for any location.")

        # Save Google Maps URLs to a text file
        with open(urls_txt, "w") as file:
            for location in location_data:
                file.write(location["Google Maps URL"] + "\n")
        logging.info(f"Successfully saved Google Maps URLs to '{urls_txt}'.")

    except Exception as e:
        logging.error(f"An error occurred: {e}")

In [35]:
scrape_google_maps_reviews(api_key, "Naturals Signature", "Chennai South", 100)

2024-12-18 21:05:47,586 - INFO - Starting the scrape_google_maps_reviews function
2024-12-18 21:05:47,587 - INFO - API queries_quota: 60
2024-12-18 21:05:52,588 - INFO - Waiting for next page token to activate...
2024-12-18 21:06:00,100 - INFO - No more pages available.
2024-12-18 21:06:00,108 - INFO - Successfully saved 40 locations to f'data/Naturals Signature_Chennai South_data.csv'.
2024-12-18 21:06:00,117 - INFO - Successfully saved all reviews to f'data/Naturals Signature_Chennai South_all_reviews.csv'.
2024-12-18 21:06:00,118 - INFO - Successfully saved Google Maps URLs to 'data/location_data/maps_urls/Chennai South.txt'.


## Go from Google Sheet
Pass address directly

In [89]:
# Read the Google Maps Platform key from the environment rather than committing it to
# the notebook. Set GOOGLE_MAPS_API_KEY before starting the kernel; api_key is None
# when the variable is unset.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")

In [206]:
def scrape_google_maps_metadata(api_key, business_name, city_name, max_locations):
    """
    Gather basic Places metadata for a "<business_name>, <city_name>" text search.

    Pages through the results with `next_page_token` and stops once
    `max_locations` entries have been gathered, the API reports a non-OK
    status, or there is no further page. Any exception is logged and the
    entries gathered so far are returned.

    NOTE: a later cell in this notebook defines a function with the same name
    (adding latitude/longitude), which replaces this one once it has run.

    Parameters:
        api_key (str): Google API key.
        business_name (str): Business name to query.
        city_name (str): City name for the search.
        max_locations (int): Total number of locations to extract.

    Returns:
        list[dict]: One dictionary per location with the keys "Location ID",
            "City Area", "Name", "Address", "Rating", "Total Reviews",
            "Place ID" and "Google Maps URL".
    """
    import googlemaps
    import logging
    import time

    logging.info("Starting the scrape_google_maps_metadata function")
    client = googlemaps.Client(key=api_key)
    collected = []

    try:
        page_token = None

        while len(collected) < max_locations:
            # Only send page_token when a previous page handed one over.
            search_args = {"query": f"{business_name}, {city_name}"}
            if page_token:
                search_args["page_token"] = page_token
            page = client.places(**search_args)

            # Anything other than OK ends the search.
            if page.get("status") != "OK":
                logging.error(f"Places API failed: {page.get('status')}")
                break

            for entry in page.get("results", []):
                if len(collected) >= max_locations:
                    break

                pid = entry["place_id"]
                collected.append({
                    "Location ID": f"LOC_{len(collected) + 1}",
                    "City Area": city_name,
                    "Name": entry.get("name", "N/A"),
                    "Address": entry.get("formatted_address", "N/A"),
                    "Rating": entry.get("rating", "N/A"),
                    "Total Reviews": entry.get("user_ratings_total", 0),
                    "Place ID": pid,
                    "Google Maps URL": f"https://www.google.com/maps/place/?q=place_id:{pid}"
                })

            # Continue only if the API offered another page.
            page_token = page.get("next_page_token")
            if not page_token:
                logging.info("No more pages available.")
                break

            logging.info("Waiting for next page token to activate...")
            time.sleep(3)  # pause before using the new token

    except Exception as e:
        logging.error(f"An error occurred: {e}")

    return collected

In [207]:
def scrape_google_maps_metadata(api_key, business_name, city_name, max_locations):
    """
    Extract metadata for up to `max_locations` using pagination in the Google Places API,
    including latitude and longitude.

    A text search for "<business_name>, <city_name>" is issued and
    `next_page_token` is followed until `max_locations` places were collected,
    the API reports a non-OK status, or no further page is offered. Any
    exception is logged and the locations collected so far are returned.

    Missing numeric fields ("Rating", "Latitude", "Longitude") are stored as
    None rather than a placeholder string, so that a DataFrame built from the
    result keeps numeric columns and `.isna()` finds the gaps. Missing "Name"
    and "Address" remain "N/A", and a missing review count remains 0.

    Parameters:
        api_key (str): Google API key.
        business_name (str): Business name to query.
        city_name (str): City name for the search.
        max_locations (int): Total number of locations to extract.

    Returns:
        list[dict]: List of metadata dictionaries for the locations, with the keys
            "Location ID", "City Area", "Name", "Address", "Latitude", "Longitude",
            "Rating", "Total Reviews", "Place ID" and "Google Maps URL".
    """
    import googlemaps
    import logging
    import time

    logging.info("Starting the scrape_google_maps_metadata function")
    gmaps = googlemaps.Client(key=api_key)
    location_data = []

    # The search text is identical for every page, so build it once.
    query = f"{business_name}, {city_name}"

    try:
        processed_locations = 0
        next_page_token = None

        while processed_locations < max_locations:
            # Fetch places using the API
            if next_page_token:
                response = gmaps.places(query=query, page_token=next_page_token)
            else:
                response = gmaps.places(query=query)

            # Handle API errors
            if response.get("status") != "OK":
                logging.error(f"Places API failed: {response.get('status')}")
                break

            # Process results
            results = response.get("results", [])
            for place in results:
                if processed_locations >= max_locations:
                    break

                place_id = place["place_id"]
                name = place.get("name", "N/A")
                address = place.get("formatted_address", "N/A")
                rating = place.get("rating")  # None when the place has no rating
                user_ratings_total = place.get("user_ratings_total", 0)

                # Extract latitude and longitude (None when geometry is absent)
                geometry = place.get("geometry", {}).get("location", {})
                lat = geometry.get("lat")
                lng = geometry.get("lng")

                # Generate Google Maps URL
                url = f"https://www.google.com/maps/place/?q=place_id:{place_id}"

                location_data.append({
                    "Location ID": f"LOC_{processed_locations + 1}",
                    "City Area": city_name,
                    "Name": name,
                    "Address": address,
                    "Latitude": lat,
                    "Longitude": lng,
                    "Rating": rating,
                    "Total Reviews": user_ratings_total,
                    "Place ID": place_id,
                    "Google Maps URL": url
                })

                processed_locations += 1

            # Check for pagination token
            next_page_token = response.get("next_page_token")
            if not next_page_token:
                logging.info("No more pages available.")
                break

            logging.info("Waiting for next page token to activate...")
            time.sleep(3)  # Wait for the token to become active

    except Exception as e:
        # Best-effort: report the problem and hand back what was collected.
        logging.error(f"An error occurred: {e}")

    return location_data

In [208]:
scrape_google_maps_metadata(api_key, "Naturals Salon", "Adambakkam, Brindavan Nagar", 5)

2024-12-19 11:53:33,260 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 11:53:33,262 - INFO - API queries_quota: 60
2024-12-19 11:53:34,277 - INFO - No more pages available.


[{'Location ID': 'LOC_1',
  'City Area': 'Adambakkam, Brindavan Nagar',
  'Name': 'Naturals Salon',
  'Address': 'N37 First Flr, 2nd Street Shawalace Col, Brindavan Nagar, Adambakkam, Chennai, Tamil Nadu 600088, India',
  'Latitude': 12.9865709,
  'Longitude': 80.2051812,
  'Rating': 4.9,
  'Total Reviews': 352,
  'Place ID': 'ChIJD93nXvxdUjoRinIO5qSoRrc',
  'Google Maps URL': 'https://www.google.com/maps/place/?q=place_id:ChIJD93nXvxdUjoRinIO5qSoRrc'},
 {'Location ID': 'LOC_2',
  'City Area': 'Adambakkam, Brindavan Nagar',
  'Name': 'Naturals Salon',
  'Address': 'No.16A/17, Secretariat Colony Main Road, City Link Road, above Easyday Club, Adambakkam, Chennai, Tamil Nadu 600088, India',
  'Latitude': 12.9973019,
  'Longitude': 80.2072451,
  'Rating': 4.6,
  'Total Reviews': 1363,
  'Place ID': 'ChIJP8-7AF1nUjoR3_YHwKTtq28',
  'Google Maps URL': 'https://www.google.com/maps/place/?q=place_id:ChIJP8-7AF1nUjoR3_YHwKTtq28'}]

In [209]:
area, address = df.loc[2, ["Area", "Address"]]

In [210]:
area

'Adambakkam, Brindavan Nagar'

In [211]:
address

'N37 First Flr, 2nd Street Shawalace Col, Brindavan Nagar, Adambakkam, Adambakkam, Chennai - 600088'

In [213]:
df.head()

Unnamed: 0,Area,Address,Business Name,Status,Franchise Owner,Subscription,BGorgeous_Client_Name
0,Alwarpet,"No 220, Mowbrays Flats, TT Krishnamachari Road...",,,,,
1,Alwarpet,"No.37, 1st Floor, CP Ramaswamy Iyer Rd, opp. t...","Naturals Salon & Spa CP Ramaswamy road , Alwar...",,,,
2,"Adambakkam, Brindavan Nagar","N37 First Flr, 2nd Street Shawalace Col, Brind...",,,,,
3,"Adambakkam,Secretariat Colony rd","No 16A/17, Secretariat Col, City Link Rd, Adam...",,,,,
4,Adyar,"No 31, Ground Floor, Ceebros Enclave, 1st Main...",,,,,


In [214]:
len(df)

148

In [226]:
def process_dataframe(api_key, df, business_name="Naturals Salon Chennai",
                      area_column="Area", max_locations=10):
    """
    Call `scrape_google_maps_metadata` once per row of `df` and combine the results.

    Each row's value in `area_column` is passed as the city/area part of the
    search. Rows whose area is missing or blank are skipped with a warning.
    Results are concatenated as returned, so the same place may appear more
    than once when several areas match it.

    Parameters:
        api_key (str): Google API key.
        df (pd.DataFrame | None): DataFrame containing the `area_column` column.
            None or an empty frame yields an empty result.
        business_name (str): Business-name part of every search query.
        area_column (str): Name of the column holding the area to search in.
        max_locations (int): Maximum number of locations requested per row.

    Returns:
        pd.DataFrame: A new DataFrame with the results (empty if nothing was retrieved).

    Raises:
        KeyError: If `area_column` is not a column of a non-empty `df`.
    """
    import pandas as pd

    # fetch_data_from_gsheet returns None on failure; treat that like "no rows".
    if df is None or df.empty:
        print("No valid data retrieved.")
        return pd.DataFrame()

    results = []

    for city_name in df[area_column]:
        if pd.isna(city_name) or not str(city_name).strip():
            logging.warning(f"Skipping row with empty '{area_column}' value.")
            continue

        # Call the scrape_google_maps_metadata function
        data = scrape_google_maps_metadata(api_key, business_name, city_name, max_locations=max_locations)
        if data:
            results.extend(data)  # Extend the list with the results

    # Convert results to a DataFrame
    if results:
        return pd.DataFrame(results)

    print("No valid data retrieved.")
    return pd.DataFrame()  # Return an empty DataFrame if no results


In [227]:
# Process the DataFrame
result_df = process_dataframe(api_key, df)

2024-12-19 21:11:08,490 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 21:11:08,497 - INFO - API queries_quota: 60
2024-12-19 21:11:10,122 - INFO - No more pages available.
2024-12-19 21:11:10,125 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 21:11:10,127 - INFO - API queries_quota: 60
2024-12-19 21:11:11,902 - INFO - No more pages available.
2024-12-19 21:11:11,905 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 21:11:11,907 - INFO - API queries_quota: 60
2024-12-19 21:11:12,824 - INFO - No more pages available.
2024-12-19 21:11:12,826 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 21:11:12,828 - INFO - API queries_quota: 60
2024-12-19 21:11:13,403 - INFO - No more pages available.
2024-12-19 21:11:13,405 - INFO - Starting the scrape_google_maps_metadata function
2024-12-19 21:11:13,406 - INFO - API queries_quota: 60
2024-12-19 21:11:14,338 - INFO - No more pages available.
2024-12-19 21:11:14,

In [228]:
len(result_df)

470

In [229]:
result_df = result_df.drop_duplicates(subset=['Place ID'])

In [230]:
len(result_df)

158

In [231]:
result_df.head()

Unnamed: 0,Location ID,City Area,Name,Address,Latitude,Longitude,Rating,Total Reviews,Place ID,Google Maps URL
0,LOC_1,Alwarpet,"Naturals Salon & Spa CP Ramaswamy road , Alwar...","No.37, 1st Floor, CP Ramaswamy Iyer Rd, opp. t...",13.032512,80.256971,4.6,1078,ChIJB0quoslnUjoRf_vm8BiGHsM,https://www.google.com/maps/place/?q=place_id:...
1,LOC_2,Alwarpet,Naturals Lounge TTK,"No.220, Mowbrays Flats, TT Krishnamachari Rd, ...",13.043691,80.259478,4.6,656,ChIJfwi_pjZmUjoRtNJzL20D2-A,https://www.google.com/maps/place/?q=place_id:...
2,LOC_3,Alwarpet,Naturals Salon,"No 220, Mowbrays Flats, TT Krishnamachari Rd, ...",13.043618,80.259485,5.0,1,ChIJ1cEdFYpnUjoRBTQevSN3-mU,https://www.google.com/maps/place/?q=place_id:...
6,LOC_1,"Adambakkam, Brindavan Nagar",Naturals Salon,"N37 First Flr, 2nd Street Shawalace Col, Brind...",12.986571,80.205181,4.9,353,ChIJD93nXvxdUjoRinIO5qSoRrc,https://www.google.com/maps/place/?q=place_id:...
7,LOC_2,"Adambakkam, Brindavan Nagar",Naturals Salon,"No.16A/17, Secretariat Colony Main Road, City ...",12.997302,80.207245,4.6,1363,ChIJP8-7AF1nUjoR3_YHwKTtq28,https://www.google.com/maps/place/?q=place_id:...


In [232]:
result_df["Total Reviews"].sum()

np.int64(178570)

In [233]:
result_df["Rating"].isna().sum()

np.int64(0)

In [234]:
(result_df['Total Reviews'] == 0).sum()

np.int64(0)

In [238]:
result_df["Name"].value_counts()

Name
Naturals Salon                                                                                          93
Naturals Signature Salon                                                                                17
Naturals                                                                                                 2
Naturals salon                                                                                           2
Naturals Salon & Spa CP Ramaswamy road , Alwarpet in Chennai                                             1
Shree Naturals                                                                                           1
Naturals Salon & Spa Neelankarai                                                                         1
Naturals Salon & Spa Siruseri                                                                            1
Naturals Unisex Hair And Style Salon                                                                     1
GREEN TRENDS PERUNGALATHUR      

In [244]:
# Names to exclude from the results
not_allowed_names = ["Shree Naturals", "GREEN TRENDS PERUNGALATHUR", "Sun Naturals Unisex Salon And Spa", "Studieo7 Family Salon & Bridal Studio - Nanganallur",
                    "Allure Unisex Salon", "Isha", "Green Trends Unisex Hair & Style Salon", "Page 3 Luxury Salon Kilpauk", "Vinodhbamaa Makeup Artistry | Beautician Course in Chennai | Makeup Training Academy | Makeup course",
                    "Naturals Corporate Office"]

# Keep only rows whose 'Name' is NOT in the exclusion list. .copy() makes the result an
# independent frame so that adding columns to it later does not operate on a slice.
result_df = result_df[~result_df['Name'].isin(not_allowed_names)].copy()


In [245]:
len(result_df)

148

In [252]:
result_df["Name"].value_counts()

Name
Naturals Salon                                                  93
Naturals Signature Salon                                        17
Naturals salon                                                   2
Naturals                                                         2
Naturals Salon & Spa CP Ramaswamy road , Alwarpet in Chennai     1
Naturals Salon & Spa Chetpet                                     1
Naturals Salon Moulivakkam                                       1
Naturals Salon & Spa Neelankarai                                 1
Naturals Salon & Spa Siruseri                                    1
Naturals Unisex Hair And Style Salon                             1
Naturals Salon & Spa Poonamallee                                 1
NATURALS Salon And Spa Vadapalani                                1
Naturals Salon & Spa , Saidapet                                  1
Naturals Salon Nanmangalam,                                      1
Naturals Unisex Salon Sholinganallur                     

In [253]:
result_df['full_location'] = result_df['City Area'] + " " + result_df['Name']

In [254]:
result_df.to_csv("data/naturals_chennai_locations_metadata.csv")
result_df['Google Maps URL'].to_csv('data/naturals_chennai_maps_urls.txt', index=False)

In [255]:
result_df.head()

Unnamed: 0,Location ID,City Area,Name,Address,Latitude,Longitude,Rating,Total Reviews,Place ID,Google Maps URL,full_location
0,LOC_1,Alwarpet,"Naturals Salon & Spa CP Ramaswamy road , Alwar...","No.37, 1st Floor, CP Ramaswamy Iyer Rd, opp. t...",13.032512,80.256971,4.6,1078,ChIJB0quoslnUjoRf_vm8BiGHsM,https://www.google.com/maps/place/?q=place_id:...,Alwarpet Naturals Salon & Spa CP Ramaswamy roa...
1,LOC_2,Alwarpet,Naturals Lounge TTK,"No.220, Mowbrays Flats, TT Krishnamachari Rd, ...",13.043691,80.259478,4.6,656,ChIJfwi_pjZmUjoRtNJzL20D2-A,https://www.google.com/maps/place/?q=place_id:...,Alwarpet Naturals Lounge TTK
2,LOC_3,Alwarpet,Naturals Salon,"No 220, Mowbrays Flats, TT Krishnamachari Rd, ...",13.043618,80.259485,5.0,1,ChIJ1cEdFYpnUjoRBTQevSN3-mU,https://www.google.com/maps/place/?q=place_id:...,Alwarpet Naturals Salon
6,LOC_1,"Adambakkam, Brindavan Nagar",Naturals Salon,"N37 First Flr, 2nd Street Shawalace Col, Brind...",12.986571,80.205181,4.9,353,ChIJD93nXvxdUjoRinIO5qSoRrc,https://www.google.com/maps/place/?q=place_id:...,"Adambakkam, Brindavan Nagar Naturals Salon"
7,LOC_2,"Adambakkam, Brindavan Nagar",Naturals Salon,"No.16A/17, Secretariat Colony Main Road, City ...",12.997302,80.207245,4.6,1363,ChIJP8-7AF1nUjoR3_YHwKTtq28,https://www.google.com/maps/place/?q=place_id:...,"Adambakkam, Brindavan Nagar Naturals Salon"


In [256]:
result_df[result_df["Name"] == "Natural Fresh"]

Unnamed: 0,Location ID,City Area,Name,Address,Latitude,Longitude,Rating,Total Reviews,Place ID,Google Maps URL,full_location


### Read Data

In [8]:
import pandas as pd

df = pd.read_csv("data/naturals_chennai_locations_metadata.csv")

In [9]:
len(df)

148