In [51]:
# !pip install selenium

In [52]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException, NoSuchAttributeException
from collections import defaultdict
from time import sleep
import pandas as pd
import numpy as np

In [53]:
def return_element_values(object, chrome): #returns a list of the values of features
    element_values = {}
    all_element_values = []
    for element in chrome.find_elements(By.TAG_NAME, object):
        element_values['tag'] = element.tag_name
        element_values['size'] = element.size
        element_values['location'] = element.location
        element_values['text'] = element.text
        
        for col in ['id','name','class']:
            try:
                # WebDriverWait(chrome).until(EC.presence_of_element_located((By.ID, element.id)))
                attribute = element.get_attribute(col)
                if attribute and attribute.strip():
                    element_values[col] = attribute
                
            except (TimeoutException, StaleElementReferenceException, NoSuchElementException):
                element_values[col] = np.nan
        try:
            element_values['first_class_div'] = chrome.find_element(By.CLASS_NAME, element_values["class"]).id
        except NoSuchAttributeException:
            try:
                element_values['first_class_div'] = chrome.find_element(By.CLASS_NAME, element_values["class"]).get_attribute('name')
            except:
                element_values['first_class_div'] = np.nan
        except (TimeoutException, StaleElementReferenceException, NoSuchElementException, KeyError):
            element_values['first_class_div'] = np.nan
                
        all_element_values.append(element_values)
    return all_element_values

In [54]:
def make_dataset(url, columns = ['tag', 'name', 'class','id', 'text']): #Creates dataset using return_element_values(object, chrome)
    with webdriver.Chrome() as chrome:
        chrome.maximize_window()
        chrome.get(url) #opens a url in chrome
        WebDriverWait(chrome,5).until(EC.presence_of_all_elements_located((By.TAG_NAME,'*')))
        object_values = []
        for object in ['button','form','field','input', 'nav']:
            # try:
            object_values.append(pd.DataFrame(return_element_values(object, chrome), columns= columns).fillna(np.nan))
            # except (StaleElementReferenceException, NoSuchElementException) as e:
            #     continue
        return pd.concat(object_values)
        # return pd.concat(map(lambda object: pd.DataFrame(return_element_values(object, chrome), columns= columns), ['button','form','field']))


In [55]:
###########driver code
web_data = pd.DataFrame(columns = ['tag', 'name', 'class','id', 'text','location','size', 'first_class_div'])
for url in [r"https://www.reddit.com/r/learnprogramming/top/?t=month",r'https://www.mercadolivre.com.br/#from=homecom',r'https://www.indeed.com/',r'https://www.tripadvisor.com/',r'https://www.yellowpages.com/',r'https://www.ebay.com/',r'https://www.ebay.com/b/PC-Gaming/bn_7000259657']:
    web_data = pd.concat( [web_data, make_dataset(url, web_data.columns)])
print(web_data.info())
web_data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279 entries, 0 to 1
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tag              279 non-null    object
 1   name             31 non-null     object
 2   class            273 non-null    object
 3   id               157 non-null    object
 4   text             279 non-null    object
 5   location         279 non-null    object
 6   size             279 non-null    object
 7   first_class_div  80 non-null     object
dtypes: object(8)
memory usage: 19.6+ KB
None


Unnamed: 0,tag,name,class,id,text,location,size,first_class_div
0,button,,_1LHxa-yaHJwrPK8kuyv_Y4 _2iuoyPiKHN3kfOoeIQalD...,t3_yv51u5-overflow-menu,Join,"{'x': 1182, 'y': 1241}","{'height': 32, 'width': 106}",
1,button,,_1LHxa-yaHJwrPK8kuyv_Y4 _2iuoyPiKHN3kfOoeIQalD...,t3_yv51u5-overflow-menu,Join,"{'x': 1182, 'y': 1241}","{'height': 32, 'width': 106}",
2,button,,_1LHxa-yaHJwrPK8kuyv_Y4 _2iuoyPiKHN3kfOoeIQalD...,t3_yv51u5-overflow-menu,Join,"{'x': 1182, 'y': 1241}","{'height': 32, 'width': 106}",
3,button,,_1LHxa-yaHJwrPK8kuyv_Y4 _2iuoyPiKHN3kfOoeIQalD...,t3_yv51u5-overflow-menu,Join,"{'x': 1182, 'y': 1241}","{'height': 32, 'width': 106}",
4,button,,_1LHxa-yaHJwrPK8kuyv_Y4 _2iuoyPiKHN3kfOoeIQalD...,t3_yv51u5-overflow-menu,Join,"{'x': 1182, 'y': 1241}","{'height': 32, 'width': 106}",
...,...,...,...,...,...,...,...,...
4,input,_nkw,btn btn-prim gh-spr,crid,,"{'x': 0, 'y': 0}","{'height': 0, 'width': 0}",
5,input,_nkw,btn btn-prim gh-spr,crid,,"{'x': 0, 'y': 0}","{'height': 0, 'width': 0}",
6,input,_nkw,btn btn-prim gh-spr,crid,,"{'x': 0, 'y': 0}","{'height': 0, 'width': 0}",
0,nav,,pagination,,Results Pagination - Page 1\n1\n2\n3\n4\n5\n6\...,"{'x': 516, 'y': 4575}","{'height': 44, 'width': 536}",57417244-594d-48f5-99e7-066aad84c141
