<a href="https://colab.research.google.com/github/menphes/rpa_challenge-robocorp/blob/main/RPA_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install selenium
!apt update
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.1-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.1-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m28.

In [14]:
# =|=|=|=|=|=|=|=|=|= Start of Libs and Packs Import =|=|=|=|=|=|=|=|=|= #
import os, json, time, requests, urllib.request, re, os.path
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
# =|=|=|=|=|=|=|=|=|= End of Libs and Packs Import =|=|=|=|=|=|=|=|=|= #

In [3]:
# =|=|=|=|=|=|=|=|=|= Start of Function Definitions =|=|=|=|=|=|=|=|=|= #
class GoogleChrome:
    def __init__(self, urlPath, downloadPath=f"/content/sample_data"):
        """
        Initialize Browser Object.

        Args:
            urlPath (str): The Full URL to open the Browser to.
            downloadPath (str, optional): Path to Where Browser Downloads should be saved. Default is "Windows Standard Download"

        Returns:
            None
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        prefs = {"download.default_directory": downloadPath, "directory_upgrade": True, "download.prompt_for_download": False, "plugins.always_open_pdf_externally": True}
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=options)
        self.driver.get(f"{urlPath}")
        self.action = ActionChains(self.driver)

    def get_current_url(self):
        """
        Get Active Tab Current URL.

        Args:
            None

        Returns:
            current_url (str): The URL for the currently active Chrome Tab
        """
        return self.driver.current_url

    def update_url(self, newURL):
        """
        Update Current Browser URL.

        Args:
            newURL (str): The Full URL to redirect the Browser to.

        Returns:
            None
        """
        self.driver.get(f"{newURL}")

    def take_screenshot(self, downloadPath=f"/content/sample_data"):
        """
        Update Current Browser URL.

        Args:
            downloadPath (str, optional): Path to Where Browser Downloads should be saved. Default is "Windows Standard Download"

        Returns:
            None
        """
        self.driver.get_screenshot_as_file(f'{downloadPath}/screenshot_{datetime.today()}.png')

    def get_element(self, elem, timeWait, searchType, waitType):
        """
        Find Specified Element.

        Args:
            elem (str): The Element's Full XPath, Class_Name or ID
            timeWait (int): Max time in Seconds to wait for Element
            searchType (str): The type of search to be used to find the Element. Options: 'xpath', 'id' or 'class'
            waitType (str): The type of waiting to be done. Options: 'visible', 'clickable', or 'present'

        Returns:
            element (obj): The object of the Element
        """
        if searchType == 'xpath':
            if str(elem)[:1] != '(' and str(elem)[-1:] != ')':
                elem = f"({str(elem).strip()})"
            searchType = By.XPATH
        elif searchType == 'id':
            searchType = By.ID
        elif searchType == 'class':
            elem = str(elem).strip().replace(' ', '.')
            searchType = By.CLASS_NAME
        if waitType == 'visible':
            waitType = EC.visibility_of_element_located
        elif waitType == 'clickable':
            waitType = EC.element_to_be_clickable
        elif waitType == 'present':
            waitType = EC.presence_of_element_located
        return WebDriverWait(self.driver, timeWait).until(waitType((searchType, elem)))

    def get_element_data(self, elem, timeWait, searchType='xpath', waitType='visible'):
        """
        Gets an Element's Properties Information.

        Args:
            elem (str): The Element's Full XPath, Class_Name or ID
            timeWait (int): Max time in Seconds to wait for Element
            searchType (str): The type of search to be used to find the Element. Default: 'xpath'. 'Options: 'xpath', 'id' or 'class'
            waitType (str): The type of waiting to be done. Default: 'visible'. Options: 'visible', 'clickable', or 'present'

        Returns:
            dictReturn (dict): Dictionary with Multiple Properties: ['Text', 'InnerText', 'Value', 'DValue', 'HRef', 'InnerHTML', 'AltText']
        """
        element = self.get_element(elem, timeWait, searchType, waitType)
        try:
            textVal = element.text
        except:
            textVal = ''
        try:
            valueVal = element.get_attribute('value')
        except:
            valueVal = ''
        try:
            dataVal = element.get_attribute('data-value')
        except:
            dataVal = ''
        try:
            hRefVal = element.get_attribute('href')
        except:
            hRefVal = ''
        try:
            innerTXTVal = element.get_attribute('innerText')
        except:
            innerTXTVal = ''
        try:
            innerHTMLVal = element.get_attribute("innerHTML")
        except:
            innerHTMLVal = ''
        try:
            altText = element.get_attribute("alt")
        except:
            altText = ''
        try:
            src = element.get_attribute('src')
        except:
            src = ""
        dictReturn = {'Text': textVal, 'InnerText':innerTXTVal, 'Value': valueVal, 'DValue':dataVal, 'HRef': hRefVal, 'InnerHTML': innerHTMLVal, 'AltText': altText, 'Source': src}
        return dictReturn

    def go_to_element(self, elem, timeWait, click=True, searchType='xpath', waitType='visible'):
        """
        Moves to an Element, while also being able to click it.

        Args:
            elem (str): The Element's Full XPath, Class_Name or ID
            timeWait (int): Max time in Seconds to wait for Element
            click (bool): Clicks the Element. Default: True
            searchType (str): The type of search to be used to find the Element. Default: 'xpath'. 'Options: 'xpath', 'id' or 'class'
            waitType (str): The type of waiting to be done. Default: 'visible'. Options: 'visible', 'clickable', or 'present'

        Returns:
            None
        """
        element = self.get_element(elem, timeWait, searchType, waitType)
        if click == True:
            try:
                self.action.move_to_element(element).click().perform()
            except:
                self.driver.execute_script("arguments[0].click();", element)
        else:
            self.action.move_to_element(element).perform()

    def loop_list(self, elem, timeWait, searchType='xpath', waitType='present'):
        """
        Loops through a HTML List and prints their respective items.

        Args:
            elem (str): The Element's Full XPath, Class_Name or ID
            timeWait (int): Max time in Seconds to wait for Element
            searchType (str): The type of search to be used to find the Element. Default: 'xpath'. 'Options: 'xpath', 'id' or 'class'
            waitType (str): The type of waiting to be done. Default: 'present'. Options: 'visible', 'clickable', or 'present'

        Returns:
            arrList (list): returns list of all List Item Texts
        """
        try:
            x = 0
            arrList = []
            while True:
                if x == 5:
                    print('ERROR: Unable to get Element! Please contact douglas.falcao@merckgroup.com')
                    return arrList
                listView = self.get_element(elem, timeWait, searchType, waitType)
                options = listView.find_elements(By.TAG_NAME, "li")
                if len(options) == 0:
                    x += 1
                    time.sleep(1)
                    continue
                for option in options:
                    arrList.append(option.text)
                return arrList
        except:
            print('ERROR: Issue with Website! Please contact douglas.falcao@merckgroup.com')
            return arrList

    def close(self):
        """
        Fully Closes the Test Chrome Browser and all of it's Tabs.

        Args:
            None

        Returns:
            None
        """
        self.driver.quit()
# =|=|=|=|=|=|=|=|=|= End of Function Definitions =|=|=|=|=|=|=|=|=|= #

In [4]:
param_search_phrase = input('Please insert search phrase: ').lower()
param_search_phrase = str(param_search_phrase).replace(' ','%20')

Please insert search phrase: olympic GOLD MeDaL


In [5]:
param_categ_sect_topic = input('Please insert the news category/section/topic: ').lower()

Please insert the news category/section/topic: SPOrts


In [6]:
param_qty_months = input('Please insert the number of months for which you need to receive news: ')
try:
  param_qty_months = int(param_qty_months)
except:
  print('Not a number.')

Please insert the number of months for which you need to receive news: 2


In [7]:
if param_qty_months <= 1:
  max_date = datetime.today().replace(day=1, hour=0, minute=0, second=0, microsecond=0)
else:
  max_date = datetime.today().replace(day=1, hour=0, minute=0, second=0, microsecond=0) - relativedelta(months=param_qty_months-1)

In [15]:
browser = GoogleChrome(f"https://www.latimes.com/search?q={param_search_phrase}&s=1")
browser.go_to_element(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/aside/div/div[3]/div[1]/ps-toggler/ps-toggler/button", 15, waitType='clickable')
lst_topics = browser.loop_list(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/aside/div/div[3]/div[1]/ps-toggler/ps-toggler/div/ul", 15)
lst_topics = [x.replace("\n", " ") for x in lst_topics]
lst_topics = [x.lower() for x in lst_topics]
lst_topics = [re.sub(r"\s[(][\d]*[)]", "", x) for x in lst_topics]
df_news = pd.DataFrame(columns =['title', 'description', 'date', 'img_description', 'img_location', 'money_mention'])
try:
  topics_checkbox_index = lst_topics.index(param_categ_sect_topic) + 1
  topic_filter_value = browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/aside/div/div[3]/div[1]/ps-toggler/ps-toggler/div/ul/li[{topics_checkbox_index}]/div/div[1]/label/input", 15)['Value']
  browser.go_to_element(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/aside/div/div[3]/div[1]/ps-toggler/ps-toggler/div/ul/li[{topics_checkbox_index}]/div/div[1]/label/input", 15, waitType='clickable')
  time.sleep(5)
except:
  print(f"No topic matching desired search: {param_categ_sect_topic}")
total_query_pages = int(str(browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/div[2]/div[2]", 15)['Text']).split(' ')[2].replace(',', '')) + 1

for j in range(1, total_query_pages):
  for i in range(1, 11):
    news_title = browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/ul/li[{i}]/ps-promo/div/div[2]/div/h3/a", 15)['Text']
    news_description = browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/ul/li[{i}]/ps-promo/div/div[2]/p[1]", 15)['Text']
    try:
      news_date = browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/ul/li[{i}]/ps-promo/div/div[2]/p[2]", 5)['Text']
      matches = re.findall(r"[$][\d]*[.,]?|[\d]*\sdollars|[\d]*\sUSD", news_description, re.MULTILINE)
      if not matches:
        money_mention = False
      else:
        money_mention = True
    except:
      news_date = news_description
      news_description = ""
      money_mention = False
    try:
      time_value = int(re.findall(r"[\d]+", news_date)[0])
    except:
      pass
    if 'second' in news_date.lower():
      news_date = datetime.today() - timedelta(seconds=time_value)
    elif 'minute' in news_date.lower():
      news_date = datetime.today() - timedelta(minutes=time_value)
    elif 'hour' in news_date.lower():
      news_date = datetime.today() - timedelta(hours=time_value)
    else:
      news_date = news_date
      try:
        news_date = datetime.strptime(news_date, '%b. %d, %Y')
      except:
        news_date = datetime.strptime(news_date, '%B %d, %Y')
    if max_date > news_date:
      break
    news_img_description = browser.get_element_data(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/ul/li[{i}]/ps-promo/div/div[1]/a/picture/img", 15, searchType='xpath', waitType='visible')['AltText']
    try:
      news_img = browser.get_element(f"//div[@class='page-content']/ps-search-results-module/form/div[2]/ps-search-filters/div/main/ul/li[{i}]/ps-promo/div/div[1]/a/picture/img", 15, searchType='xpath', waitType='visible')
      src = news_img.get_attribute('src')
      urllib.request.urlretrieve(src, f'/content/sample_data/{param_search_phrase.replace("%20", "_")}-{j}_{i}.png')
      print(f'/content/sample_data/{param_search_phrase.replace("%20", "_")}-{j}_{i}.png')
      news_img_save_location = f'/content/sample_data/{param_search_phrase.replace("%20", "_")}-{j}_{i}.png'
      print(news_img_save_location)
    except:
      pass
    print(f'title: {news_title}')
    print(f'date: {news_date}\n')
    df_news = pd.concat([df_news, pd.DataFrame({'title': [news_title], 'description': [news_description], 'date': [news_date], 'img_description': [news_img_description], 'img_location': [news_img_save_location], 'money_mention': [money_mention]})], ignore_index=True)
  if max_date > news_date:
      break
  try:
    browser.go_to_element(f"/html/body/modality-custom-element//div/div/div/div/a", 10, waitType='clickable')
  except:
    pass
  curr_url = str(browser.get_current_url())
  if '+' in curr_url and '&p=' not in curr_url:
    base_url = curr_url
  curr_url = f"{str(base_url).replace('+', '%20')}&p={j+1}"
  browser.update_url(curr_url)
browser.close()
df_news.to_excel(f'/content/sample_data/{param_search_phrase.replace("%20", "_")}_{param_categ_sect_topic}_{param_qty_months}.xlsx', index=False)
df_news

/content/sample_data/olympic_gold_medal-1_1.png
/content/sample_data/olympic_gold_medal-1_1.png
title: The Sports Report: End the USC-Notre Dame series? Is Lincoln Riley for real?
date: 2024-08-05 00:00:00



  df_news = pd.concat([df_news, pd.DataFrame({'title': [news_title], 'description': [news_description], 'date': [news_date], 'img_description': [news_img_description], 'img_location': [news_img_save_location], 'money_mention': [money_mention]})], ignore_index=True)


/content/sample_data/olympic_gold_medal-1_2.png
/content/sample_data/olympic_gold_medal-1_2.png
title: Letters to Sports: Dodgers should stop buying stars and start growing them
date: 2024-08-03 00:00:00

/content/sample_data/olympic_gold_medal-1_3.png
/content/sample_data/olympic_gold_medal-1_3.png
title: Simone Biles’ memo to Trump: ‘I love my black job.’ No one is replacing gymnast at Olympics
date: 2024-08-02 00:00:00

/content/sample_data/olympic_gold_medal-1_4.png
/content/sample_data/olympic_gold_medal-1_4.png
title: Dodgers Dugout: Big trade deadline deals guarantee ... absolutely nothing
date: 2024-08-02 00:00:00

/content/sample_data/olympic_gold_medal-1_5.png
/content/sample_data/olympic_gold_medal-1_5.png
title: The Sports Report: Is Mike Trout still a Hall of Famer?
date: 2024-08-02 00:00:00

/content/sample_data/olympic_gold_medal-1_6.png
/content/sample_data/olympic_gold_medal-1_6.png
title: The Sports Report: Dodgers are swept by Padres
date: 2024-08-01 00:00:00

/conte

Unnamed: 0,title,description,date,img_description,img_location,money_mention
0,The Sports Report: End the USC-Notre Dame seri...,The last time Los Angeles got a good look at L...,2024-08-05,USC coach Lincoln Riley signals up field while...,/content/sample_data/olympic_gold_medal-1_1.png,False
1,Letters to Sports: Dodgers should stop buying ...,Readers of the Los Angeles Times Sports sectio...,2024-08-03,"LOS ANGELES, CA - JULY 22, 2024: Dodgers manag...",/content/sample_data/olympic_gold_medal-1_2.png,False
2,Simone Biles’ memo to Trump: ‘I love my black ...,"Simone Biles, who has won gold twice at the Pa...",2024-08-02,Simone Biles competes on the uneven bars durin...,/content/sample_data/olympic_gold_medal-1_3.png,False
3,Dodgers Dugout: Big trade deadline deals guara...,The Dodgers made some decent acquisitions at t...,2024-08-02,Detroit Tigers' Jack Flaherty (9) reacts to hi...,/content/sample_data/olympic_gold_medal-1_4.png,False
4,The Sports Report: Is Mike Trout still a Hall ...,An injury has sidelined Mike Trout for the sea...,2024-08-02,Los Angeles Angels' Mike Trout warms up prior ...,/content/sample_data/olympic_gold_medal-1_5.png,False
...,...,...,...,...,...,...
62,Mychal Thompson tried to recruit Klay to Laker...,Former Warrior Klay Thompson’s choice of Maver...,2024-07-02,"Mychal Thompson, left, and son Klay Thompson p...",/content/sample_data/olympic_gold_medal-7_3.png,False
63,Dodgers Dugout: Why doesn’t Shohei Ohtani (and...,The Dodgers annually lead the majors in attend...,2024-07-02,Los Angeles Dodgers' Shohei Ohtani walks throu...,/content/sample_data/olympic_gold_medal-7_4.png,False
64,The Sports Report: UCLA’s DeShaun Foster reope...,"For the first time in years, there’s something...",2024-07-02,"Pasadena, CA - April 27: Head coach DeShaun Fo...",/content/sample_data/olympic_gold_medal-7_5.png,False
65,Klay Thompson will join Mavericks on a three-y...,The former Warriors star will leave Golden Sta...,2024-07-01,FILE - Golden State Warriors guard Klay Thomps...,/content/sample_data/olympic_gold_medal-7_6.png,False
