In [251]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd

In [259]:
def details_scrape(url, wait_seconds=2, driver_factory=None):
    """
    Uses selenium to scrape additional information contained in a sub-menu.

    Clicks the first details tab (the site was observed to respond with a
    log-in page), navigates back, opens every details tab and then extracts
    the inner HTML of each avatar block and each stats block.

    A fresh browser is started for every call, so each URL is loaded from a
    clean state. The browser session is always shut down, even when scraping
    fails part-way through.

    May be more efficient to page through? That's for another time.

    PARAMETERS:
        url (str): page to scrape.
        wait_seconds (int or float): pause inserted before the first click,
            after it (for the log-in prompt) and after going back. Default 2.
        driver_factory (callable or None): zero-argument callable returning a
            WebDriver-like object. Defaults to ``webdriver.Firefox``.

    RETURNS: list of names (avatar inner HTML) and list of HTML for each
    details box.

    RAISES: IndexError if the page exposes fewer than three tab buttons.

    Test URL:
    "https://www.guru.com/d/freelancers/lc/united-states/california/los-angeles/pg/1/"
    """
    tab_button_xpath = '//button[@class="tabControls__button"]'

    if driver_factory is None:
        driver_factory = webdriver.Firefox
    driver = driver_factory()

    try:
        driver.get(url)
        # The string "xpath" is the value of By.XPATH; find_elements(by, value)
        # exists in both Selenium 3 and 4, unlike find_elements_by_xpath.
        tab_buttons = driver.find_elements("xpath", tab_button_xpath)

        time.sleep(wait_seconds) # Wait to click
        tab_buttons[2].click()

        time.sleep(wait_seconds) # Wait for log-in prompt
        driver.back() # Go back to main page
        time.sleep(wait_seconds) # Wait to scrape

        # The element references go stale after navigating, so look them up again
        tab_buttons = driver.find_elements("xpath", tab_button_xpath)

        # There are multiple buttons per user; only every 4th one starting at
        # index 2 (2, 6, 10, ...) is clicked
        for button in tab_buttons[2::4]:
            button.click()

        print("Completed opening all the tabs")

        # Pull elements in the now opened detail boxes
        user_name = driver.find_elements("xpath", '//div[@class="module_avatar freelancerAvatar"]')
        user_detail = driver.find_elements("xpath", '//ul[@class="feedback__stats clearfix"]')

        # Raw inner HTML is returned; it is parsed further with BeautifulSoup later
        names = [element.get_attribute('innerHTML') for element in user_name]
        detail_html = [element.get_attribute('innerHTML') for element in user_detail]
    finally:
        # quit() ends the whole session (close() only closes the current window)
        driver.quit()

    print("Finished")

    return names, detail_html

In [188]:
def details_to_soup(x):
    """
    Parses every raw HTML string in x with BeautifulSoup's 'html.parser'.

    Returns a list of soup objects, one per input string, in the same order.

    """
    return [BeautifulSoup(raw_html, 'html.parser') for raw_html in x]

In [244]:
def soup_values_to_list(x):
    """
    Collects the contents of the <em> tags found in each soup.

    Returns a list of lists: entry k holds the .string of every <em> tag
    in the k-th soup, in document order.

    """
    # First gather the <em> tags of every soup ...
    em_groups = [soup.find_all('em') for soup in x]

    # ... then reduce each tag to its string
    return [[tag.string for tag in group] for group in em_groups]

In [250]:
def soup_urls_to_list(x):
    """
    Reads the href of the first <a> tag in each soup.

    Returns a list of href strings, one per soup, in the same order.

    """
    return [soup.a['href'] for soup in x]

In [268]:
def combine_clean_data(names_list,details_list):
    """
    Combines the names and details into a single list of lists.
    Dealing with them separately is difficult to follow so I want to combine them ASAP

    Row k of the result is names_list[k] followed by the items of
    details_list[k]. The inputs are left untouched, so calling this again
    with the same arguments (e.g. re-running a notebook cell) gives the same
    result instead of prepending the name a second time.

    Returns single list of lists (new lists, one per entry of details_list)

    Raises IndexError if names_list has fewer entries than details_list.
    """
    return [[names_list[i]] + list(row) for i, row in enumerate(details_list)]

In [273]:
def combine_into_dataframe(x):
    """
    Builds a pandas dataframe from a list of rows.

    x is a list of rows; each row supplies one value per column, in the
    order: url, member_since, earnings_pst_yr, earnings_ever, employers,
    invoices_paid, largest_employ.

    Returns the dataframe.

    """
    column_names = [
        "url",
        "member_since",
        "earnings_pst_yr",
        "earnings_ever",
        "employers",
        "invoices_paid",
        "largest_employ",
    ]

    return pd.DataFrame(x, columns=column_names)
    

In [226]:
# Testing Scraper: run it against page 1 of the Los Angeles freelancer listing.
# `test` is the (names, detail_html) tuple returned by details_scrape.
test = details_scrape("https://www.guru.com/d/freelancers/lc/united-states/california/los-angeles/pg/1/")

Completed opening all the tabs
Finished


In [270]:
# Parse both halves of the scrape result: test[0] holds the avatar-block HTML,
# test[1] holds the stats-list HTML.
a = details_to_soup(test[0])
b = details_to_soup(test[1])

In [271]:
# a2: href of the first link in each avatar block
# b2: list of <em> strings for each stats list
a2 = soup_urls_to_list(a)
b2 = soup_values_to_list(b)

In [260]:
# Inspect the extracted profile links
print(a2)

['/freelancers/dugale', '/freelancers/mariana-franzetti', '/freelancers/alp-bakir', '/freelancers/super-writer-guy', '/freelancers/bplanningcom', '/freelancers/caes', '/freelancers/kathleen-keithley', '/freelancers/rachel-manning', '/freelancers/stuppi', '/freelancers/joy-david', '/freelancers/amykochjohnson', '/freelancers/michael-ramstead', '/freelancers/twchrist', '/freelancers/mimirose', '/freelancers/digital-industry', '/freelancers/xponential-outsourcery-inc', '/freelancers/yiran-zhang-1', '/freelancers/power-edits', '/freelancers/h9d-studio', '/freelancers/7-design-group']


In [272]:
# Put each profile link in front of its row of detail values
c = combine_clean_data(a2,b2)

['/freelancers/dugale', ' Aug, 2006', ' $35,176 ', '$173,052 ', '2 ', '89 ', ' $168,412 ']


In [274]:
# Load the combined rows into a dataframe with named columns
df_selen = combine_into_dataframe(c)

In [275]:
# Preview the first rows of the scraped data
df_selen.head()

Unnamed: 0,url,member_since,earnings_pst_yr,earnings_ever,employers,invoices_paid,largest_employ
0,/freelancers/dugale,"Aug, 2006","$35,176","$173,052",2,89,"$168,412"
1,/freelancers/mariana-franzetti,"Apr, 2019","$23,618","$23,618",1,24,"$23,618"
2,/freelancers/alp-bakir,"Dec, 2019","$23,525","$23,525",1,22,"$23,525"
3,/freelancers/super-writer-guy,"Feb, 2008","$13,740","$57,024",205,527,"$29,700"
4,/freelancers/bplanningcom,"Dec, 2008","$6,230","$59,710",40,98,"$16,400"
