In [1]:
import requests_cache
import lxml.html as lx
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Install a global requests-cache cache named "cache".
# NOTE(review): requests-cache works by patching the `requests` library; pages
# fetched through urllib.request.urlopen presumably bypass it -- confirm that
# the scraping below actually benefits from this cache.
requests_cache.install_cache("cache")

# Cybercoders.com Links

### Search-result URLs for each job category

In [2]:
# Every category search page shares the same URL shape; only the slug differs.
# Each value ends in "?page=" so that a page number can be appended directly.
_SEARCH_URL = "https://www.cybercoders.com/jobs/{}-jobs/?page="

analyst_url = _SEARCH_URL.format("analyst")            # Analyst Jobs
data_url = _SEARCH_URL.format("data")                  # Data Jobs
BA_url = _SEARCH_URL.format("business-analyst")        # Business Analyst Jobs
big_data_url = _SEARCH_URL.format("big-data")          # Big Data Jobs
BI_url = _SEARCH_URL.format("business-intelligence")   # Business Intelligence Jobs

In [3]:
def links(url, page):
    """
    (Purpose)
    Collect the URLs of the individual job postings listed on the first
    `page` search-result pages of one Cybercoders job category.

    (Args)
    url  : category search URL ending in "?page="; the page number is
           appended to it to build each request
    page : number of result pages to scrape, starting at page 1 (the page
           count has to be looked up manually on the site); 0 or a negative
           number yields an empty list

    Job Types:
    - Analyst Jobs
    - Data Jobs
    - Business Analyst Jobs
    - Big Data Jobs
    - Business Intelligence Jobs

    (Returns)
    A list of absolute job-posting URLs in the order they appear on the
    pages. Duplicates are NOT removed.

    (Raises)
    Whatever urlopen raises when a page cannot be fetched
    (e.g. urllib.error.URLError / HTTPError).
    """
    # Local import so this cell does not depend on editing the import cell.
    from urllib.parse import urljoin

    base_url = "https://www.cybercoders.com"
    job_links = []

    for page_number in range(1, page + 1):
        # Parse inside the `with` block: BeautifulSoup reads the response
        # here, and the connection is closed as soon as parsing is done.
        with urlopen(url + str(page_number)) as response:
            parse_page = BeautifulSoup(response, "lxml")

        # Each posting's title sits in a <div class="job-title"> wrapping the link.
        for title_div in parse_page.find_all(name="div", attrs={"class": "job-title"}):
            anchor = title_div.find("a", href=True)
            if anchor is None:
                # A title block without a usable link: skip it rather than crash.
                continue
            # urljoin handles both site-relative ("/some-job-123") and
            # already-absolute hrefs.
            job_links.append(urljoin(base_url, anchor["href"]))

    return job_links

In [4]:
# Scrape each category; `page` is the number of result pages that category
# had on the site (per the links() docstring, these were looked up manually).
analyst_links = links(url=analyst_url, page=10)
data_links = links(url=data_url, page=36)
BA_links = links(url=BA_url, page=3)
big_data_links = links(url=big_data_url, page=8)
BI_links = links(url=BI_url, page=4)

In [5]:
# Total number of links scraped across all five categories -> NOT UNIQUE
# (a posting listed under several categories is counted once per category).
sum(map(len, (analyst_links, data_links, BA_links, big_data_links, BI_links)))

1166

In [6]:
# Merge the five per-category lists and drop duplicate URLs.
# dict.fromkeys keeps the first occurrence of each URL in scrape order, so the
# result is identical on every run; list(set(...)) returns an arbitrary order
# that can change between kernel restarts (string hash randomisation).
ulinks = list(dict.fromkeys(analyst_links + data_links + BA_links + big_data_links + BI_links))
len(ulinks)  # total number of links extracted -> UNIQUE

872

In [7]:
# Display the de-duplicated job-posting URLs built in the previous cell.
ulinks

['https://www.cybercoders.com/data-scientist-job-414707',
 'https://www.cybercoders.com/kronos-qa-analyst-job-418967',
 'https://www.cybercoders.com/senior-data-architect-job-382804',
 'https://www.cybercoders.com/senior-software-engineer-job-417687',
 'https://www.cybercoders.com/sr-data-systems-architect-remote-availability-job-412309',
 'https://www.cybercoders.com/full-stack-engineer-job-406634',
 'https://www.cybercoders.com/sr-qa-test-analyst-job-417296',
 'https://www.cybercoders.com/big-data-engineer-scala-hadoop-spark-job-419989',
 'https://www.cybercoders.com/senior-data-scientist-job-407455',
 'https://www.cybercoders.com/data-analyst-report-writer-job-409670',
 'https://www.cybercoders.com/financial-analyst-job-420955',
 'https://www.cybercoders.com/sr-data-engineer-direct-hire-job-419216',
 'https://www.cybercoders.com/data-warehouse-developer-ssis-etl-job-402809',
 'https://www.cybercoders.com/audit-senior-job-385650',
 'https://www.cybercoders.com/senior-marketing-analys