In [1]:
# Keys read from environment variables.
# FIXME: currently broken - PROJECT_DIR cannot be accessed even from a terminal.
import os

# Project / environment settings. Each lookup yields None when the
# corresponding environment variable is not set.
project_dir = os.environ.get("PROJECT_DIR")
env = os.environ.get("CONDA_DEFAULT_ENV")

# Database connection settings, also taken from the environment.
username = os.environ.get("DATABASE_USR")
pswd = os.environ.get("DATABASE_PASS")
dbname = os.environ.get("DATABASE_NAME")

In [2]:
# Packages for Scraping
from bs4 import BeautifulSoup
import requests
import random

# Packages for cleaning Data
import re
import pandas as pd
import numpy as np
import time

# Packages for PostgreSQL Import
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [3]:
def html_extract(x, timeout=30):
    """
    Downloads a web page and parses its HTML.
    
    x is the URL of the page to fetch.
    timeout is the number of seconds to wait for the server before giving
    up; without it a stalled connection would block the scrape forever.
    
    Returns soup object from beautiful soup
    
    Raises requests.HTTPError when the server answers with an error status,
    so an error page is never parsed as if it were a listing page.
    """
    response = requests.get(x, timeout=timeout)
    response.raise_for_status()
    
    return BeautifulSoup(response.text, 'html.parser')

In [4]:
def freelancer_extraction(x):
    """
    Pulls the per-freelancer HTML records out of a parsed listing page.
    
    x is a parsed soup object
    
    Returns a list with one "record__details" div per freelancer. The
    entries are still raw HTML; pass each one to header_content_extraction
    for the next step.
    
    """
    
    # Walk down to the section of the page that holds the freelancer data
    listing_section = x.body.form.main.main.section
    
    # The freelancer records live in the second <ul> of that section
    record_list = listing_section.find_all('ul')[1]
    
    # One div per freelancer; each div carries several tags of raw data
    return record_list.find_all('div', class_="record__details")
    

In [5]:
def header_content_extraction(x):
    """
    Splits one freelancer record into its header and content tags.
    
    x is a single freelancers HTML information
    
    Returns a (header, content) tuple.
        header: the record's first div; un-processed name, url, location
        content: the node two siblings after the header; un-processed
                 skills, rate, description
    
    """
    header_tag = x.div
    
    # Step over the node directly after the header to reach the content node
    content_tag = header_tag.next_sibling.next_sibling
    
    return header_tag, content_tag
    

In [6]:
def _span_value(tag, css_class, attr, default="NA"):
    """Return attribute `attr` of the first matching <span>, or `default` when the span or its value is missing."""
    span = tag.find('span', class_=css_class)
    if span is None:
        return default
    value = getattr(span, attr)
    return default if value is None else value


def header_data_extract(x):
    """
    Extract data from header.
    
    x is the header for a given freelancer.
    
    Data being extracted:
        - profile url
        - city
        - state
        - country
        - rating
        - earnings
    
    Returns a dict with the keys profile_url, city, state, country, rating
    and earnings. Any field other than profile_url whose <span> is absent
    (or has no single string) is reported as "NA".
    """
    
    # Extracting data
    profile_url = x.a['href']
    
    city = _span_value(x, "freelancerAvatar__location--city", "string")
    state = _span_value(x, "freelancerAvatar__location--state", "string")
    country = _span_value(x, "freelancerAvatar__location--country", "string")
    
    # Missing spans are checked for explicitly rather than with a bare
    # `except:`, so unrelated errors (and Ctrl-C) are no longer swallowed.
    rating = _span_value(x, "freelancerAvatar__feedback", "text")
    earnings = _span_value(x, "freelancerAvatar__earnings", "text")
        
    # Cleaning commas off city and state
    city = city.replace(',', '')
    state = state.replace(',', '')
    
    header = {"profile_url": profile_url, "city": city,
              "state": state, "country": country, "rating": rating,
              "earnings": earnings}
    
    return header

In [7]:
def _tag_text(tag):
    """Return tag.string, falling back to tag.get_text() when .string is None."""
    text = tag.string
    # Beautiful Soup gives None for .string when a tag does not contain
    # exactly one string (e.g. a description with nested markup).
    return tag.get_text() if text is None else text


def content_data_extract(x):
    """
    Extracting data from the content
    x is the content tag for a given freelancer
    
    Data being extracted:
        - Rates
        - Shortened user description (need to go into their profile for detail)
        - Skills list
    
    Returns a dict with the keys:
        hourly_rate: int, the first number found in the rates paragraph,
                     or None when that paragraph contains no digits
        skills_list: list with the string of every skill link
        user_description: the short description with tabs, carriage
                          returns and newlines removed
    
    """
    
    # First <p> holds the rates, second <p> the short description
    paragraphs = x.find_all('p')
    rates = _tag_text(paragraphs[0])
    user_description_short = _tag_text(paragraphs[1])
    skill_tags = x.find_all('a', class_="skillsList__skill skillsList__skill--hasHover")
                                   
    # Cleaning skills_list from messy html list to list of clean strings
    skills_list_strings = [skill.string for skill in skill_tags]
                                   
    # Cleaning rates: the first number is taken as the hourly rate
    first_number = re.search(r'\d+', rates)
    hourly_rate = int(first_number.group()) if first_number else None
    
    # Cleaning user description of indents and return
    for whitespace in ('\t', '\r', '\n'):
        user_description_short = user_description_short.replace(whitespace, '')
    
    # Creating Dictionary
    content = {"hourly_rate": hourly_rate, "skills_list": skills_list_strings,
               "user_description": user_description_short}

    return content

In [8]:
def add_table_to_db(dataframe, table_name, dbname=None, username=None, pswd=None):
    """
    Writes a DataFrame to a table in a PostgreSQL database on localhost.
    
    dataframe is the pandas DataFrame to store.
    table_name is the destination table; an existing table with that name
    is replaced.
    dbname, username and pswd are optional connection settings. When left
    as None they are read from the DATABASE_NAME, DATABASE_USR and
    DATABASE_PASS environment variables; dbname then falls back to
    "freelance_db".
    
    Raises RuntimeError when no username or password is available.
    Credentials are deliberately not hard-coded in the notebook.
    
    """
    from urllib.parse import quote_plus
    
    if dbname is None:
        dbname = os.getenv("DATABASE_NAME") or "freelance_db"
    if username is None:
        username = os.getenv("DATABASE_USR")
    if pswd is None:
        pswd = os.getenv("DATABASE_PASS")
    
    missing = [var for var, value in (("DATABASE_USR", username),
                                      ("DATABASE_PASS", pswd)) if value is None]
    if missing:
        raise RuntimeError(
            "Database credentials not provided; set %s or pass them explicitly"
            % " and ".join(missing))
    
    # Escape the credentials so characters such as '@' or '/' cannot break the URL
    engine = create_engine('postgresql://%s:%s@localhost/%s'
                           % (quote_plus(username), quote_plus(pswd), dbname))
    
    # Connect to the database and save data to it
    try:
        dataframe.to_sql(table_name, engine, if_exists='replace')
    finally:
        # Release the pooled connections even if the write fails
        engine.dispose()
    
    print("Added data to %s"%(dbname))

In [9]:
# Build the URL of every listing page: page numbers 1 through 944,
# each followed by a trailing slash and appended to the common prefix.
html_core = "https://www.guru.com/d/freelancers/l/united-states/pg/"
pg_nums = [str(page_number) for page_number in range(1, 945)]
tmp = ["%s/" % page_number for page_number in pg_nums]
htmls = ["".join((html_core, suffix)) for suffix in tmp]

In [None]:
# Putting it all together: scrape the first 51 listing pages, turn every
# freelancer record into a small DataFrame and store the combined result.
columns = ["profile_url","city","state","country","rating","earnings","hourly_rate","skills_list","user_description"]

# Frames are collected in a list and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row copies all previous rows on every iteration.
frames = []

for page_idx, page in enumerate(htmls[0:51]):
    
    print(page_idx)
        
    soup = html_extract(page)
    freelancers = freelancer_extraction(soup) # Clean HTML

    # The inner loop uses its own variable instead of reusing the page counter
    for freelancer in freelancers:

        header, content = header_content_extraction(freelancer) # Extract two boxes of interest

        results = header_data_extract(header) # Extract header data
        results.update(content_data_extract(content)) # Combine into one dictionary
        
        # The scalar fields are broadcast against skills_list, so this gives
        # one row per skill (and no rows for an empty skills list).
        frames.append(pd.DataFrame(results))

df = pd.concat(frames) if frames else pd.DataFrame(columns = columns)
        
add_table_to_db(df,"freelance_table")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
