In [23]:
# Standard library imports
import os
import queue
import random
import threading
from datetime import datetime
from itertools import cycle
from pathlib import Path
from urllib.parse import quote_plus

# Dataframe and Excel Imports
import openpyxl
import pandas as pd

# Selenium Imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

# Other imports
from icecream import ic

In [24]:
# Chrome launch options. They only take effect when passed to the driver
# constructor, i.e. webdriver.Chrome(options=chrome_options).
chrome_options = Options()
# The legacy `"w3c": False` experimental option is deliberately not set:
# Selenium 4 only speaks the W3C WebDriver protocol and refuses to build
# capabilities from an Options object that asks for the old one.
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
# Run Chrome without a visible window.
chrome_options.add_argument("--headless")

In [25]:
# Proxy connection settings come from environment variables so that the
# credentials are not stored in (or shared with) the notebook.
# Host and port are not secret and fall back to the previously used values.
PROXY_SERVER = os.environ.get("PROXY_SERVER", "p.webshare.io")
PROXY_PORT = int(os.environ.get("PROXY_PORT", "80"))
# Username/password default to empty strings when the variables are unset;
# export PROXY_USERNAME / PROXY_PASSWORD before starting the kernel.
# NOTE(review): credentials that were previously committed in this cell
# should be treated as leaked and rotated.
PROXY_USERNAME = os.environ.get("PROXY_USERNAME", "")
PROXY_PASSWORD = os.environ.get("PROXY_PASSWORD", "")

In [26]:
# Proxy settings dictionary: one authenticated proxy URL per scheme, both
# pointing at the same host/port with the same credentials.
# NOTE(review): this nested {'proxy': {...}} layout looks like the format used
# by Selenium Wire — confirm which library consumes it, and whether
# 'verify_ssl' belongs inside the 'proxy' mapping or at the top level.
options = dict(
    proxy={
        'http': f'http://{PROXY_USERNAME}:{PROXY_PASSWORD}@{PROXY_SERVER}:{PROXY_PORT}',
        'https': f'https://{PROXY_USERNAME}:{PROXY_PASSWORD}@{PROXY_SERVER}:{PROXY_PORT}',
        'verify_ssl': False,
    },
)

In [27]:
# Candidate proxy endpoints, each written as "host:port".
rotating_proxy_list = [
    '3.226.168.144:80',
    '54.219.125.50:8080',
    '35.222.50.197:80',
    '148.76.97.250:80',
    '162.223.94.164:80',
    '162.223.94.163:80',
    '12.69.91.227:80',
    '72.169.67.17:87',
    '137.184.41.250:80',
    '47.90.162.160:8080',
    '64.176.5.119:80',
    '129.159.88.228:80',
    '192.241.153.116:3128',
    '147.182.132.21:80',
    '47.88.3.19:8080',
    '160.72.82.101:80',
    '34.238.235.194:80',
]
# Endless round-robin iterator over the endpoints above (itertools.cycle).
proxy_pool = cycle(rotating_proxy_list)

In [28]:
# Output workbook name, stamped with the moment this cell runs
# (year_month_day_hour_minute_second on a 12-hour clock, then AM/PM).
cities_excel_file = f'data_{datetime.now().strftime("%Y_%m_%d_%I_%M_%S_%p")}.xlsx'

# Load both input sheets from the source workbook in one pass; passing a list
# of sheet names makes read_excel return a {sheet name: DataFrame} mapping.
_input_sheets = pd.read_excel('cities_data.xlsx', sheet_name=['cities', 'Search Terms'])
cities_df = _input_sheets['cities']
search_term_df = _input_sheets['Search Terms']
display(search_term_df)

# Timeout value; presumably seconds for an explicit WebDriverWait — verify
# against the code that consumes it.
timeout = 5

Unnamed: 0,Common,City Wise
0,VOIP phone,VOIP phone in
1,Internet Phone,Internet Phone in


In [29]:
def collect_data(search_term, driver, city=None, latitude=None, longitude=None):
    """Collect the sponsored results Google shows for one search term.

    Args:
        search_term: Query text. It is URL-encoded before being placed in the
            search URL, so characters such as ``&``, ``#`` or ``+`` are safe.
        driver: Selenium WebDriver exposing ``execute_cdp_cmd`` (a
            Chromium-based driver).
        city: Optional label copied verbatim into every returned record.
        latitude: Optional latitude for the emulated browser location.
        longitude: Optional longitude for the emulated browser location.

    Returns:
        list[dict]: One record per sponsored block found on the results page.
        When none is found, a single record whose link fields are ``None``.
        Every record carries the city, coordinates, search term and a
        timestamp taken when the record was built.

    Raises:
        selenium NoSuchElementException: if a sponsored block has no ``<a>``
            or ``<span>`` descendant.
    """
    if latitude is not None and longitude is not None:
        # Set the search location. float() turns NumPy scalars coming from a
        # DataFrame into plain Python floats that the driver can serialise.
        driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
            'latitude': float(latitude),
            'longitude': float(longitude),
            'accuracy': 100
        })
    else:
        # An override persists for the whole browser session; clear it so a
        # location-less search is not run from the previously searched city.
        driver.execute_cdp_cmd('Emulation.clearGeolocationOverride', {})

    print(f"Getting data for : {search_term}")

    # Navigate to the search results page (query string properly encoded).
    driver.get(f"https://www.google.com/search?q={quote_plus(str(search_term))}")

    # implicitly_wait() does not pause: it sets how long element lookups keep
    # polling before giving up. The setting stays active on the driver.
    driver.implicitly_wait(random.randint(5, 20))

    # Sponsored results are the <div> elements whose class is exactly 'uEierd'.
    sponsored_links = driver.find_elements(By.XPATH, "//div[@class='uEierd']")

    def build_record(link_text=None, link_url=None, link_description=None, additional_data=None):
        # Single definition of the record schema shared by both outcomes.
        return {
            "city": city,
            "latitude": latitude,
            "longitude": longitude,
            "search term": search_term,
            "link_text": link_text,
            "link_url": link_url,
            "link_description": link_description,
            "additional_data": additional_data,
            "timestamp": datetime.now(),
        }

    if not sponsored_links:
        # Keep a placeholder row so the search still appears in the results.
        return [build_record()]

    data = []
    for link in sponsored_links:
        link_element = link.find_element(By.TAG_NAME, "a")
        description_element = link.find_element(By.TAG_NAME, "span")
        additional_data_elements = link.find_elements(By.CSS_SELECTOR, ".MUxGbd, .yDYNvb, .lyLwlc")

        data.append(build_record(
            link_text=link_element.text,
            link_url=link_element.get_attribute("href"),
            link_description=description_element.text,
            additional_data=[elem.text for elem in additional_data_elements],
        ))

    return data

In [30]:
# Shared accumulator: the scraping cell extends this list with one dict per result row.
results_data = list()

In [31]:
# for i, row in search_term_df.iterrows():
#     ic(row['Common'])
#     for index, city_data in cities_df.iterrows():
#         ic(f"{row['City Wise']} {city_data['city']}")

In [32]:
# results_data is shared notebook state and may already hold rows from an
# earlier run of this cell; remember where this run starts so an existing
# workbook only receives the rows collected now.
rows_before_run = len(results_data)

# NOTE(review): neither `chrome_options` nor the proxy settings in `options`
# are applied to this browser. Setting an attribute on the driver after Chrome
# has started does not reconfigure it, so no such assignment is made here;
# proxy/headless support has to be wired in when the driver is constructed.
with webdriver.Chrome() as driver:
    for _, term_row in search_term_df.iterrows():
        # Location-independent search for the bare term.
        results_data.extend(
            collect_data(search_term=f"{term_row['Common']}", driver=driver)
        )

        # The same term searched once per city, with the browser located there.
        for _, city_row in cities_df.iterrows():
            city = city_row["city"]
            results_data.extend(
                collect_data(
                    search_term=f"{term_row['City Wise']} {city}",
                    driver=driver,
                    city=city,
                    latitude=city_row["latitude"],
                    longitude=city_row["longitude"],
                )
            )

# The browser is closed at this point; everything below is file output only.
results_df = pd.DataFrame(results_data)
results_sheet_name = "results"

if Path(cities_excel_file).exists():
    # Append below the rows already in the sheet. "overlay" lets pandas write
    # into an existing sheet instead of raising because the sheet exists.
    new_rows_df = results_df.iloc[rows_before_run:]
    with pd.ExcelWriter(
        cities_excel_file, mode="a", engine="openpyxl", if_sheet_exists="overlay"
    ) as writer:
        sheet_exists = results_sheet_name in writer.sheets
        # max_row is the last used row (1-based), which equals the 0-based
        # index of the first free row.
        start_row = writer.sheets[results_sheet_name].max_row if sheet_exists else 0
        new_rows_df.to_excel(
            writer,
            sheet_name=results_sheet_name,
            index=False,
            header=not sheet_exists,  # write the header only into a brand-new sheet
            startrow=start_row,
        )
else:
    # First export for this file name: write everything collected so far.
    results_df.to_excel(cities_excel_file, sheet_name=results_sheet_name, index=False)

print("Data appended to Excel successfully.")

Getting data for : VOIP phone
Getting data for : VOIP phone in New York
Getting data for : VOIP phone in Los Angeles
Getting data for : VOIP phone in Chicago
Getting data for : VOIP phone in Houston
Getting data for : VOIP phone in Phoenix
Getting data for : VOIP phone in Philadelphia
Getting data for : VOIP phone in San Antonio
Getting data for : VOIP phone in San Diego
Getting data for : VOIP phone in Dallas
Getting data for : VOIP phone in San Jose
Getting data for : VOIP phone in Austin
Getting data for : VOIP phone in Jacksonville
Getting data for : VOIP phone in San Francisco
Getting data for : VOIP phone in Columbus
Getting data for : VOIP phone in Indianapolis
Getting data for : VOIP phone in Fort Worth
Getting data for : VOIP phone in Charlotte
Getting data for : VOIP phone in Seattle
Getting data for : VOIP phone in Denver
Getting data for : VOIP phone in Washington, D.C.
Getting data for : VOIP phone in Boston
Getting data for : VOIP phone in El Paso
Getting data for : VOIP 