In [1]:
# Packages for Scraping
from bs4 import BeautifulSoup
import requests

# Packages for cleaning Data
import re
import pandas as pd
import numpy as np
import time

# Packages for PostgreSQL Import
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [27]:
# Creating list of urls
html_core = "https://www.guru.com/d/freelancers/lc/united-states/california/los-angeles/pg/"
pg_nums = list(map(str,list(range(1,945))))
tmp = [s + "/" for s in pg_nums]
htmls = [html_core + s for s in tmp]

In [28]:
def html_extract(x):
    """
    Extracts and cleans the html from a website
    
    Returns soup object from beautiful soup
    """
    source =  requests.get(x).text
    soup = BeautifulSoup(source, 'html.parser')
    
    return soup

In [20]:
def freelancer_extraction(x):
    """
    Extracts freelancer information from a webpage.
    
    x is a parsed soup object
    
    Returns a list of freelancers. Requires further cleaning.
    See header_content_extraction for next steps.
    
    """
    
    # extracts the section that contains the freelancer data
    a = x.body.form.main.main.section.find_all('ul')[1] 

    # Create a list where each element is a freelancer.
    # The element has a number of different tags for each data point.
    b = a.find_all('div',class_ = "record__details")
    
    return b
    

In [21]:
def header_content_extraction(x):
    """
    Extract header and content tags from a given freelancer.
    
    x is a single freelancers HTML information
    
    Returns a header and content object.
        header: contains un-processed information on name, url, location
        content: contains un-processed information on skills, rate, description
    
    """
    header = x.div
    content = x.div.next_sibling.next_sibling
    
    return header, content
    

In [22]:
def header_data_extract(x):
    """
    Extract data from header.
    
    x is the header for a given freelancer.
    
    Data being extracted:
        - profile url
        - city
        - state
        - country
    """
    
    # Extracting data
    profile_url = "https://www.guru.com" + x.a['href']
    
    city = x.find('span',class_="freelancerAvatar__location--city").string
    state = x.find('span',class_="freelancerAvatar__location--state").string
    country = x.find('span',class_="freelancerAvatar__location--country").string
    
    # Clearning commas off city and state
    city = city.replace(',','')
    state = state.replace(',','')
    
    header = {"profile_url": profile_url, "city": city,
              "state": state, "country": country}
    
    return header

In [23]:
def content_data_extract(x):
    """
    Extracting data from the content
    x is the content tag for a given freelancer
    
    Data being extracted:
        - Rates
        - Shortened user description (need to go into their profile for detail)
        - Skills list
    
    """
    
    rates = x.find_all('p')[0].string
    user_description_short = x.find_all('p')[1].string
    skills_list = x.find_all('a', class_="skillsList__skill skillsList__skill--hasHover")
                                   
    # Cleaning skills_list from messy html list to list of clean strings
    string_clean = lambda skills_list: skills_list.string
    skills_list_strings = list(map(string_clean, skills_list))
                                   
    # Cleaning rates
    p = re.compile(r'\d+')
    result = p.findall(rates)
    hourly_rate = int(result[0]) # First number is always hourly rate
    
    # Cleaning user description of indents and return
    user_description_short = user_description_short.replace('\t','')
    user_description_short = user_description_short.replace('\r','')
    user_description_short = user_description_short.replace('\n','')
    
    # Creating Dictionary
    content = {"hourly_rate": hourly_rate, "skills_list": skills_list_strings,
               "user_description": user_description_short}

    return content

In [36]:
# Putting it all together
df = pd.DataFrame(columns = ["profile_url","city","state","country","hourly_rate","skills_list","user_description"])

for k, page in enumerate(htmls[0:50]):
    
    if k%20 == 0:
        print(k)

    time.sleep(np.random.randint(1,5))
        
    soup = html_extract(page)
    freelancers = freelancer_extraction(soup) # Clean HTML

    for k, value in enumerate(freelancers):

        header_content = header_content_extraction(value) # Extract two boxes of interest

        results = header_data_extract(header_content[0]) # Extract header data
        content = content_data_extract(header_content[1]) # Extract content data
        results.update(content) # Combine into one dictionary
        results = pd.DataFrame(results)
        df = df.append(results)

0
20
40


In [37]:
print(df.shape)
df.tail()

(3406, 7)


Unnamed: 0,profile_url,city,state,country,hourly_rate,skills_list,user_description
0,https://www.guru.com/freelancers/melanie-collins,Los Angeles,California,United States,25,Administrative Assistant,I offer a multitude of office services aimed t...
1,https://www.guru.com/freelancers/melanie-collins,Los Angeles,California,United States,25,Data Entry,I offer a multitude of office services aimed t...
2,https://www.guru.com/freelancers/melanie-collins,Los Angeles,California,United States,25,Excel,I offer a multitude of office services aimed t...
3,https://www.guru.com/freelancers/melanie-collins,Los Angeles,California,United States,25,Executive Assistant,I offer a multitude of office services aimed t...
4,https://www.guru.com/freelancers/melanie-collins,Los Angeles,California,United States,25,Outlook,I offer a multitude of office services aimed t...


In [39]:
dbname = 'freelance_db'
username = 'Metaverse'
pswd = 'arcifice91'

In [40]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)
# Replace localhost with IP address if accessing a remote server

postgresql://Metaverse:arcifice91@localhost/freelance_db
postgresql://Metaverse:arcifice91@localhost/freelance_db


In [41]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
print(engine.url)

True
postgresql://Metaverse:arcifice91@localhost/freelance_db


In [42]:
## insert data into database from Python (proof of concept - this won't be useful for big data, of course)
## df is any pandas dataframe 
df.to_sql('freelance_db', engine, if_exists='replace')