In [296]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

source =  requests.get("https://www.guru.com/d/freelancers/c/programming-development/sc/math-algorithms-analytics/ssc/data-analysis/l/united-states/").text

In [250]:
soup = BeautifulSoup(source, 'html.parser')

In [252]:
def freelancer_extraction(x):
    """
    Extracts freelancer information from a webpage.
    
    x is a parsed soup object
    
    Returns a list of freelancers. Requires further cleaning.
    See header_content_extraction for next steps.
    
    """
    
    # extracts the section that contains the freelancer data
    a = x.body.form.main.main.section.find_all('ul')[1] 

    # Create a list where each element is a freelancer.
    # The element has a number of different tags for each data point.
    b = a.find_all('div',class_ = "record__details")
    
    return b
    

In [253]:
def header_content_extraction(x):
    """
    Extract header and content tags from a given freelancer.
    
    x is a single freelancers HTML information
    
    Returns a header and content object.
        header: contains un-processed information on name, url, location
        content: contains un-processed information on skills, rate, description
    
    """
    header = x.div
    content = x.div.next_sibling.next_sibling
    
    return header, content
    

In [294]:
def header_data_extract(x):
    """
    Extract data from header.
    
    x is the header for a given freelancer.
    
    Data being extracted:
        - profile url
        - city
        - state
        - country
    """
    
    # Extracting data
    profile_url = "https://www.guru.com" + x.a['href']
    
    city = x.find('span',class_="freelancerAvatar__location--city").string
    state = x.find('span',class_="freelancerAvatar__location--state").string
    country = x.find('span',class_="freelancerAvatar__location--country").string
    
    # Clearning commas off city and state
    city = city.replace(',','')
    state = state.replace(',','')
    
    header = {"profile_url": profile_url, "city": city,
              "state": state, "country": country}
    
    return header

In [295]:
def content_data_extract(x):
    """
    Extracting data from the content
    x is the content tag for a given freelancer
    
    Data being extracted:
        - Rates
        - Shortened user description (need to go into their profile for detail)
        - Skills list
    
    """
    
    rates = x.find_all('p')[0].string
    user_description_short = x.find_all('p')[1].string
    skills_list = x.find_all('a', class_="skillsList__skill skillsList__skill--hasHover")
                                   
    # Cleaning skills_list from messy html list to list of clean strings
    string_clean = lambda skills_list: skills_list.string
    skills_list_strings = list(map(string_clean, skills_list))
                                   
    # Cleaning rates
    p = re.compile(r'\d+')
    result = p.findall(rates)
    hourly_rate = int(result[0]) # First number is always hourly rate
    
    # Cleaning user description of indents and return
    user_description_short = user_description_short.replace('\t','')
    user_description_short = user_description_short.replace('\r','')
    user_description_short = user_description_short.replace('\n','')
    
    # Creating Dictionary
    content = {"hourly_rate": hourly_rate, "skills_list": skills_list_strings,
               "user_description": user_description_short}

    return content

In [348]:
# Testing Functions
freelancers = freelancer_extraction(soup) # Clean HTML

final = pd.DataFrame(columns = ["profile_url","city","state","country","hourly_rate","skills_list","user_description"])
for i, value in enumerate(freelancers):
    
    header_content = header_content_extraction(value) # Extract two boxes of interest
    
    results = header_data_extract(header_content[0]) # Extract header data
    content = content_data_extract(header_content[1]) # Extract content data
    results.update(content) # Combine into one dictionary
    results = pd.DataFrame(results)
    final = tmp.append(results)
    
final

Unnamed: 0,profile_url,city,state,country,hourly_rate,skills_list,user_description
0,https://www.guru.com/freelancers/mv-squared,Weston,Massachusetts,United States,90,Analytics,"Data extraction, cleaning, transformation, ana..."
1,https://www.guru.com/freelancers/mv-squared,Weston,Massachusetts,United States,90,Data Management,"Data extraction, cleaning, transformation, ana..."
2,https://www.guru.com/freelancers/mv-squared,Weston,Massachusetts,United States,90,Database Design,"Data extraction, cleaning, transformation, ana..."
3,https://www.guru.com/freelancers/mv-squared,Weston,Massachusetts,United States,90,Excel,"Data extraction, cleaning, transformation, ana..."
4,https://www.guru.com/freelancers/mv-squared,Weston,Massachusetts,United States,90,Google Spreadsheet,"Data extraction, cleaning, transformation, ana..."
...,...,...,...,...,...,...,...
0,https://www.guru.com/freelancers/david-margraf,Minneapolis,Minnesota,United States,60,Analytics,Experienced Clinical Data Scientist skilled in...
1,https://www.guru.com/freelancers/david-margraf,Minneapolis,Minnesota,United States,60,Biostatistics,Experienced Clinical Data Scientist skilled in...
2,https://www.guru.com/freelancers/david-margraf,Minneapolis,Minnesota,United States,60,Data Analysis,Experienced Clinical Data Scientist skilled in...
3,https://www.guru.com/freelancers/david-margraf,Minneapolis,Minnesota,United States,60,Epidemiology,Experienced Clinical Data Scientist skilled in...


In [391]:
final[final['skills_list'] == 'Analytics']['hourly_rate'].desribe()

AttributeError: 'Series' object has no attribute 'desribe'