In [386]:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
import time
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import numpy as np

In [405]:
def login_form_click():
    """
    Bandaid on issue where login_form resets sometimes.

    Looks up the tab-control buttons on the page currently loaded in the
    module-level ``driver`` and clicks the third one (index 2).

    This function cannot see a ``login_form`` list held by its caller, so it
    always performs a fresh lookup instead of trying to reuse one.

    RAISES: IndexError if fewer than three matching buttons are found.
    """
    # find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath,
    # which newer Selenium releases no longer provide.
    login_form = driver.find_elements(By.XPATH, '//button[@class="tabControls__button"]')
    login_form[2].click()

In [404]:
def details_scrape():
    """
    Uses selenium to scrape additional information contained in a sub-menu.

    Works on whatever page is currently loaded in the module-level ``driver``;
    it takes no arguments.

    Clicks the first box; if that click navigates away (per the original
    author, guru throws up a log-in), goes back and looks the buttons up again.
    Then opens every detail box, collects their HTML, opens every about box
    and collects that HTML too.

    May be more efficient to page through? That's for another time.

    RETURNS: None if no tab-control buttons are found (treated as "the site
    did not load"). Otherwise a tuple ``(names, detail_html, about_html)``,
    each a list of innerHTML strings.

    Test URL:
    "https://www.guru.com/d/freelancers/lc/united-states/california/los-angeles/pg/1/"
    """
    button_xpath = '//button[@class="tabControls__button"]'

    first_url = driver.current_url
    login_form = driver.find_elements(By.XPATH, button_xpath)

    # Checking if the site loaded
    if not login_form:
        return None

    time.sleep(2)  # Wait to click
    login_form_click()

    if driver.current_url != first_url:
        time.sleep(2)  # Wait for log-in prompt
        driver.back()  # Go back to main page
        time.sleep(1)  # Wait to scrape
        # The handles found before navigating are replaced with fresh ones.
        login_form = driver.find_elements(By.XPATH, button_xpath)
    else:
        login_form_click()

    # There are multiple buttons per user; click the ones at 2, 6, 10, ...
    for button in login_form[2::4]:
        button.click()

    # Pull elements in the now opened detail boxes
    user_name = driver.find_elements(By.XPATH, '//div[@class="module_avatar freelancerAvatar"]')
    user_detail = driver.find_elements(By.XPATH, '//div[@class="feedback"]')

    # Extract the inner HTML; it is parsed further with BeautifulSoup later.
    names = [element.get_attribute('innerHTML') for element in user_name]
    detail_html = [element.get_attribute('innerHTML') for element in user_detail]

    # Now let's go get the about page: click the buttons at 3, 7, 11, ...
    for button in login_form[3::4]:
        button.click()

    time.sleep(2)

    # Pull elements in the now opened about boxes
    user_about = driver.find_elements(By.XPATH, '//div[@class="profile-about"]')
    about_html = [element.get_attribute('innerHTML') for element in user_about]

    return names, detail_html, about_html

In [388]:
def raw_to_soup(x):
    """
    Parses every raw HTML string in ``x`` with BeautifulSoup ('html.parser').

    RETURNS: a new list of soup objects, one per input item, in the same order.
    """
    return [BeautifulSoup(markup, 'html.parser') for markup in x]

In [389]:
def soup_urls_to_list(x):
    """
    Extracts the link target from each soup in ``x``.

    For every soup, reads the 'href' attribute of its first ``a`` tag
    (``soup.a``).

    RETURNS: a list of href values, one per soup, in the same order.
    """
    return [soup.a['href'] for soup in x]

In [390]:
def soup_details_to_list(x):
    """
    Extracts the contents of the ``em`` tags from each soup in ``x``.

    RETURNS: a list with one inner list per soup. Each inner list holds the
    ``.string`` of every ``em`` tag found in that soup, in the order
    ``find_all`` returns them.
    """
    return [[em_tag.string for em_tag in soup.find_all('em')] for soup in x]

In [391]:
def soup_about_to_list(x):
    """
    Takes in a list of soups and extracts the relevant detail.
    This is specific to the user about section: for each soup in ``x`` it
    takes the text of the first ``pre`` tag (the bio).

    RETURNS: a list of strings, one per soup. The string is 'NA' when a soup
    has no ``pre`` tag or is not a soup-like object.
    """
    about_vals = []
    # Iterate the argument itself rather than a module-level variable, so the
    # result depends only on what the caller passed in.
    for soup in x:
        try:
            bio = soup.find_all('pre')[0].text
        except (IndexError, AttributeError):
            # IndexError: no <pre> in this box; AttributeError: not a soup.
            bio = 'NA'
        about_vals.append(bio)

    return about_vals

In [392]:
def combine_clean_data(names_list,details_list, about_list):
    """
    Merges names, details and about text into one list of per-user rows.

    Each inner list of ``details_list`` is modified in place: the matching
    entry of ``names_list`` is put at the front and the matching entry of
    ``about_list`` is put at the end. All names are added first, then all
    about entries.

    RETURNS: ``details_list`` itself (the same object, now mutated).
    """
    # Pass 1: prepend the name to every row.
    for position, row in enumerate(details_list):
        row[:0] = [names_list[position]]

    # Pass 2: append the about text to every row.
    for position, row in enumerate(details_list):
        row.append(about_list[position])

    return details_list

In [393]:
def combine_into_dataframe(x):
    """
    Turns a list of per-user rows into a pandas dataframe.

    x is a list of lists. Each row is expected to start with the profile URL
    and end with the bio, with the detail values in between (assumed from the
    column order below; verify against the caller).

    Rows shorter than the number of columns are padded with NaN **in place**.
    For rows with at least two items the padding goes just before the last
    item, so the bio stays in the "bio" column. Shorter rows are padded at
    the end.

    RETURNS: a dataframe with one row per inner list and the eight columns
    listed below.
    """
    columns = ["profile_url","member_since","earnings_pst_yr","earnings_ever",
               "employers","invoices_paid","largest_employ","bio"]
    width = len(columns)

    # First have to check the lengths of the lists in the list
    for row in x:
        missing = width - len(row)
        if missing <= 0:
            continue
        padding = [float("NaN")] * missing
        if len(row) >= 2:
            # Keep first item (URL) and last item (bio) in their own columns.
            row[-1:-1] = padding
        else:
            row.extend(padding)

    df = pd.DataFrame(data = x, columns = columns)

    return df
    

In [394]:
def pagination():
    """
    Creates a list of where the page numbers will send us.

    Reads the element with id "ctl00_guB_ulpaginate" from the page currently
    loaded in the module-level ``driver`` and parses its inner HTML.

    RETURNS: a list with the text of each top-level node of that HTML, in
    document order.
    """
    a = driver.find_element(By.XPATH, '//*[@id="ctl00_guB_ulpaginate"]')
    soup = BeautifulSoup(a.get_attribute('innerHTML'),'html.parser')

    # Deliberately iterate the top-level nodes rather than only the <a> tags,
    # so positions in the returned list are unchanged for callers that use
    # them as an index (presumably one node per <li>; verify on the live page).
    return [node.text for node in soup]

In [395]:
def add_table_to_db(dataframe,table_name):
    """
    Writes ``dataframe`` to the table ``table_name`` in freelance_db.

    Connects to PostgreSQL on localhost and calls ``DataFrame.to_sql`` with
    ``if_exists='replace'``, so an existing table of that name is replaced.

    dataframe is any pandas dataframe
    table_name is the name of the table to write (string)
    """
    # NOTE(review): credentials are hard-coded in source. Try to figure out
    # how to put these into a config file later.
    dbname = 'freelance_db'
    username = 'Metaverse'
    pswd = 'arcifice91'

    # Connect to the database
    engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
    # Show where we are connecting, but never echo the password itself.
    print('postgresql://%s:%s@localhost/%s'%(username,'***',dbname))

    ## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
    dataframe.to_sql(table_name, engine, if_exists='replace')

    print("Added data to %s"%(dbname))

In [425]:
# Putting it all together
# Starts on listing page 1 and scrapes 51 consecutive pages, moving forward
# by clicking the pagination link for the next page. Listing pages follow the
# pattern "https://www.guru.com/d/freelancers/l/united-states/pg/<n>/".

driver = webdriver.Firefox()
driver.get("https://www.guru.com/d/freelancers/l/united-states/pg/1/")

# One dataframe per scraped page; concatenated once after the loop.
page_frames = []

# Scraping
for j in range(0,51):

    raw_html = details_scrape()

    # details_scrape returns None when the page did not load; an empty about
    # list means the boxes did not open. Either way, wait and try once more.
    if raw_html is None or len(raw_html[2])==0:
        print("OH NO!")
        time.sleep(10)
        raw_html = details_scrape()

    if raw_html is None:
        raise RuntimeError("Page " + str(j+1) + " did not load after a retry")

    user_urls_soup = raw_to_soup(raw_html[0])
    user_details_soup = raw_to_soup(raw_html[1])
    user_about_soup = raw_to_soup(raw_html[2])

    user_urls_clean = soup_urls_to_list(user_urls_soup)
    user_details_clean = soup_details_to_list(user_details_soup)
    user_about_clean = soup_about_to_list(user_about_soup)

    combined_data = combine_clean_data(user_urls_clean, user_details_clean, user_about_clean)

    tmp = combine_into_dataframe(combined_data)
    tmp.fillna(value=np.nan, inplace=True)
    page_frames.append(tmp)

    print("Finished page " + str(j+1))

    # Changing the page. Difficult because no next button.
    # Scraping starts on page 1, so iteration j is on page j+1 and the page
    # to go to next is j+2.
    goal_page_num = str(j+2)

    # Then make a dataset of all page listings we can reach
    button_directory = pagination()
    go_to = button_directory.index(goal_page_num)

    # Click the pagination entry at that position (XPath positions are 1-based)
    xpath_click = '/html/body/form/main/main/section/div/div[2]/div[2]/ul/li['+str(go_to+1)+']/a'
    driver.find_element(By.XPATH, xpath_click).click()

# Write the frame built by this loop.
df_tmp = pd.concat(page_frames)
add_table_to_db(df_tmp,"user_details_table")

Finished page 41
Finished page 42
Finished page 43
Finished page 44
Finished page 45
Finished page 46
Finished page 47
Finished page 48
Finished page 49
Finished page 50
Finished page 51
