In [54]:
#Importing endless packages
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import requests
import random
import csv

In [55]:
#---------------Our Parameters-------------------#
# Craigslist category codes to scrape. Each code is the path segment that
# follows "/search/" in a Craigslist search URL; add more codes as needed.
categories = [
    "edu",
    "fbh",
]
# Postal codes used as the centre of each radius search. Kept as strings so
# that leading zeros (e.g. "02802") are preserved when building the URL.
locations = [
    "30033",
    "38139",
    "33101",
    "02802",
    "53558",
    "24501",
    "47906",
]

In [56]:
# Browser-like request header profiles. getHTML picks one at random for each
# request so that successive requests do not all present the same fingerprint.
# Key order inside each profile is kept as written.
# NOTE(review): the profiles that advertise "br" in Accept-Encoding rely on a
# Brotli decoder being available to requests/urllib3 -- confirm it is installed,
# otherwise a Brotli-compressed response may not decode into readable HTML.
headers_list = [
    # Profile 1: Firefox 77 on macOS
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
    # Profile 2: Chrome 92 on Windows 10
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
    # Profile 3: Chrome 91 on Windows 10
    {
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    },
    # Profile 4: Firefox 90 on Windows 10
    {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    },
]

In [57]:
def getLinks(url):
    """Collect the posting URLs from every page of a Craigslist search.

    Opens ``url`` in a Selenium-driven Chrome browser, harvests the ``href`` of
    every result-title anchor on the page, then clicks the "next" link and
    repeats until no "next" link can be found.

    The caller is expected to pass a radius-search URL rather than scraping
    each city's site separately: per the original author's note, result pages
    use different HTML tags depending on location, which makes link extraction
    much harder.

    Args:
        url: Search-results URL to start from.

    Returns:
        list: The ``href`` strings of the result links, in the order they were
        encountered across all pages. Anchors without an ``href`` are skipped.
    """
    chrome_options = webdriver.chrome.options.Options()
    # Disable Chrome's sandbox (commonly required when running as root or in a container).
    chrome_options.add_argument('--no-sandbox')
    # Do not use /dev/shm for shared memory; avoids crashes where that partition is small.
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    links = []
    try:
        driver.get(url)
        while True:
            # Let the page load for 3 seconds before reading its source.
            time.sleep(3)
            soup = bs(driver.page_source, 'html.parser')
            for anchor in soup.find_all("a", class_="result-title hdrlnk"):
                href = anchor.get("href")
                if href:
                    links.append(href)
            # Try to find and click a "next" link; raises NoSuchElementException on the last page.
            driver.find_element(By.PARTIAL_LINK_TEXT, "next").click()
    except NoSuchElementException:
        # No "next" link left: every results page has been visited.
        pass
    finally:
        # Always shut the browser down, even if an unexpected error interrupts
        # the crawl; otherwise each failed run leaves a Chrome process behind.
        driver.quit()
    return links

In [58]:
#function to pull HTML text for each individual posting
def getHTML(url, timeout=30):
    """Download a page and return its body as text.

    A header profile is chosen at random from ``headers_list`` for every call
    so that requests look like they come from an ordinary browser.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would hang the whole scrape.

    Returns:
        str: The decoded response body.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection error, timeout, invalid URL, ...).
    """
    #grab a random header to convince craiglist we aren't a bot
    headers = random.choice(headers_list)
    # The context manager closes the session's connection pool once the body
    # has been read; previously one unclosed session leaked per posting.
    with requests.Session() as session:
        # Replace (not merge) the session headers so exactly the chosen profile
        # is sent; copy so the shared profile dict is never aliased.
        session.headers = dict(headers)
        #pull html from website and return it
        return session.get(url, timeout=timeout).text

In [59]:
#Function to pull out all of the titles and posting text blocks from Craiglist job postings
def scrapeLinks(links, max_attempts=3, max_failures=5):
    """Download each posting and extract its title and body text.

    The two returned lists are kept in lock-step with ``links``: entry ``i`` of
    each list always describes ``links[i]``, so callers can safely index all
    three by position. If scraping is abandoned early (too many blocked
    requests) the lists are simply shorter than ``links``.

    Sentinel values:
        * ``"Failed"`` in both lists - the page stayed blocked after
          ``max_attempts`` downloads, or the expected title/body elements were
          not found in the HTML.
        * ``"Duplicate"`` in ``postings`` - the body text is identical to one
          already collected during this call (the title is still recorded).

    Args:
        links: Iterable of posting URLs.
        max_attempts: Total number of downloads to try per posting while
            Craigslist keeps answering with its "blocked" page.
        max_failures: Once more than this many postings have been given up on
            as blocked, stop scraping the remaining links.

    Returns:
        tuple: ``(titles, postings)`` - two lists of equal length.
    """
    blocked_marker = "Your request has been blocked"
    titles = []
    postings = []
    # Body texts already collected, for O(1) duplicate detection.
    seen_texts = set()
    numfailures = 0
    for x in links:
        # Too many blocked postings in this batch: assume we are rate-limited and stop.
        if numfailures > max_failures:
            break
        time.sleep(.5)
        html = getHTML(x)
        attempts = 1
        #Getting around Craigslist putting us in jail
        #If they block our request, wait 1 second and try again, up to max_attempts downloads in total
        while blocked_marker in html and attempts < max_attempts:
            attempts += 1
            print("Craiglist blocked ", x, "; making attempt #",attempts, sep="")
            time.sleep(1)
            html = getHTML(x)
        #if we couldn't pull this single post because Craigslist is mean, note our failure and proceed to the next posting
        if blocked_marker in html:
            print("Unable to pull ",x)
            postings.append("Failed")
            titles.append("Failed")
            numfailures += 1
            continue
        #Use beautiful soup to extract the post text block and title of the job
        soup = bs(html, 'html.parser')
        postsection = soup.find("section", id="postingbody")
        posttitle = soup.find("span", id="titletextonly")
        if postsection is None or posttitle is None:
            # Page downloaded but does not look like a posting (deleted, expired,
            # changed markup, ...). Record placeholders so positions stay aligned.
            print("Unable to parse ",x)
            postings.append("Failed")
            titles.append("Failed")
            continue
        title = posttitle.string
        text = postsection.text
        titles.append(title)
        #If we don't recognize this posting as a dup, add it to the list, otherwise, mark it as a dup
        if text in seen_texts:
            print("Duplicate!!!:",title)
            postings.append("Duplicate")
        else:
            print(title)
            seen_texts.add(text)
            postings.append(text)
    #return our list of titles and post text
    return titles, postings

In [None]:
#Open up the csv that will store our results and write headers.
#The with-block guarantees the file is flushed and closed even if a search or
#scrape raises part-way through, so rows already written are not lost.
with open("JobPostingsEdu2.csv", 'w',newline='',encoding="utf-8") as Output_File:
    writer = csv.writer(Output_File)
    writer.writerow(["Link","Posting Title", "Location", "Category", "Posting Text"])

    #loop through all requested categories and search centres
    for cat in categories:
        # Named zipcode (not zip) so the builtin zip() is not shadowed for the
        # rest of the kernel session.
        for zipcode in locations:
            searchurl = 'https://tippecanoe.craigslist.org/search/' + cat + "?search_distance=250&postal=" + zipcode
            print(searchurl)
            individuallinks = getLinks(searchurl)
            #scrape the titles and posting details for all of the links we just stored
            jobtitles, jobpostings = scrapeLinks(individuallinks)
            #Barring some error/duplication of job posting, add it as a row to our csv
            for idx, posting in enumerate(jobpostings):
                if posting in ("Duplicate","Failed"):
                    continue
                link = individuallinks[idx]
                # Location = the site subdomain: the text between "https://"
                # (8 characters) and the first "." of the posting URL.
                site = link[8:link.find(".")]
                # Flatten the posting body onto one line for the CSV.
                writer.writerow([link, jobtitles[idx], site, cat, posting.strip().replace("\n"," ")])