# Web Scraping - Indeed.com
General steps for Web Scraping
1. Check whether the website allows web scraping
2. Obtain the source code (HTML File) by using the website URL
3. Download the website content
4. Parse the content, using the HTML tags and attributes that identify the elements of interest
5. Extract relevant data/features
6. Organize raw data in structured format (e.g., CSV)

### Import Dependencies 

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By

### Create the webdriver (Firefox or Chrome)

In [None]:
# Start the Firefox browser session that Selenium drives for the search-result pages.
# NOTE(review): presumably Firefox and its driver must be installed locally — confirm for your setup.
driver = webdriver.Firefox()

### Define position and location 

In [None]:
## Job title / keywords to search for
position = "python analyst"
## Where to search: "City, State", a ZIP code, or "remote"
locations = "remote"

def get_url(position, location):
    """Build the Indeed job-search URL for a position and a location.

    Both values are URL-encoded, so search terms containing spaces, commas
    or other reserved characters (e.g. "python analyst", "New York, NY")
    produce a valid query string.

    Args:
        position: Job title or keywords to search for.
        location: City/state, ZIP code, or "remote".

    Returns:
        The search URL as a string.
    """
    # Local import so this cell does not depend on another cell's imports.
    from urllib.parse import urlencode

    return "https://www.indeed.com/jobs?" + urlencode({"q": position, "l": location})

# Search URL for the position and location configured above; the scraping
# loop appends a "&start=<offset>" parameter to it for each results page.
url = get_url(position, locations)

### Scrape job postings

In [None]:
## Number of postings to request (keep it low while testing; can be raised towards 1000)
postings = 800

columns = ["Title", "Company", "Location", "Rating", "Date", "Salary", "Description", "Links"]
# Substrings that mark a posting as published today
words_posted_today = ("Today", "Just", "ongoing")


def _text_or_default(node, default='NaN'):
    """Return the stripped text of a BeautifulSoup node, or *default* when the node is missing."""
    return node.get_text().strip() if node is not None else default


def _normalise_date(raw):
    """Normalise the posting-date text of a job card.

    Text containing "ago" is reduced to its last three words, text that
    signals a posting from today becomes "0 days ago", and anything else
    (including an "ago" text shorter than three words) becomes 'NaN'.
    """
    if "ago" in raw:
        parts = raw.split()
        return " ".join(parts[-3:]) if len(parts) >= 3 else 'NaN'
    if any(word in raw for word in words_posted_today):
        return "0 days ago"
    return 'NaN'


rows = []  # one dict per posting; the DataFrame is built once at the end
jn = 0     # running count of postings added

# The implicit wait is a session-wide driver setting, so set it once up front.
driver.implicitly_wait(3)

try:
    for i in range(0, postings, 10):
        # Step the "start" offset by 10 for each results page requested
        driver.get(url + "&start=" + str(i))

        jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')

        for job in jobs:
            soup = BeautifulSoup(job.get_attribute('innerHTML'), 'html.parser')

            # The first anchor of the card is used as the posting link. A card
            # without any anchor is skipped, because the later description
            # step needs a URL to visit.
            anchors = job.find_elements(By.TAG_NAME, "a")
            if not anchors:
                print("Job card without a link found. Skipping...")
                continue
            links = anchors[0].get_attribute("href")

            title = _text_or_default(soup.select_one('.jobTitle'))
            company = _text_or_default(soup.find(attrs={'data-testid': 'company-name'}))
            location = _text_or_default(soup.find(attrs={'data-testid': 'text-location'}))
            salary = _text_or_default(soup.select_one('.salary-snippet-container'))
            description = _text_or_default(soup.select_one('.job-snippet'), default='')

            # Rating sits in a span nested inside the company-info div; either
            # level may be absent. The text is kept unstripped.
            info_node = soup.find("div", {"class": "companyInfo"})
            rating_node = None
            if info_node is not None:
                rating_node = info_node.find("span", {"class": "ratingsDisplay"})
            rating = rating_node.text if rating_node is not None else 'NaN'

            date_node = soup.find('span', attrs={'data-testid': 'myJobsStateDate'})
            if date_node is not None:
                date = _normalise_date(date_node.get_text().strip())
            else:
                date = 'NaN'

            rows.append({'Title': title,
                         "Company": company,
                         'Location': location,
                         'Rating': rating,
                         'Date': date,
                         "Salary": salary,
                         "Description": description,
                         "Links": links})
            jn += 1
            print("Job number {0:4d} added - {1:s}".format(jn,title))
finally:
    # Build the table even if the run is interrupted or a page fails, so the
    # postings collected so far are not lost.
    dataframe = pd.DataFrame(rows, columns=columns)

In [None]:
# End the browser session used for the search-result pages.
driver.quit()

In [None]:
# Display the scraped postings (one row per job card).
dataframe

### Scrape full job descriptions

In [None]:
import pandas as pd

def split_dataframe(dataframe, num_parts=8):
    """Split *dataframe* row-wise into at most *num_parts* contiguous slices.

    Rows are distributed as evenly as possible: the first
    ``len(dataframe) % num_parts`` slices get one extra row. Slices are
    never empty, so fewer than *num_parts* slices are returned when the
    frame has fewer rows than parts, and an empty frame yields ``[]``.

    Args:
        dataframe: The DataFrame to split. It is only sliced, not modified.
        num_parts: Maximum number of slices to produce (must be >= 1).

    Returns:
        A list of DataFrame slices that, concatenated in order, reproduce
        the rows of *dataframe*.

    Raises:
        ValueError: If *num_parts* is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    base_size, remainder = divmod(len(dataframe), num_parts)
    dataframe_slices = []
    start_index = 0

    for part in range(num_parts):
        # The first `remainder` parts absorb the rows left over by the division
        part_size = base_size + (1 if part < remainder else 0)
        if part_size == 0:
            break  # every remaining part would be empty as well
        end_index = start_index + part_size
        dataframe_slices.append(dataframe.iloc[start_index:end_index])
        start_index = end_index

    return dataframe_slices

In [None]:
# Split the scraped postings into slices (default num_parts=8) so each slice
# can be handed to its own description-scraping task.
dataframe_slices = split_dataframe(dataframe)

In [None]:
def scrape_descriptions(dataframe):
    """Fetch the full job description for every posting in *dataframe*.

    Opens its own Firefox session, visits each URL in the 'Links' column and
    reads the text of the element with id ``jobDescriptionText``. Rows whose
    page has no such element are dropped from the result.

    Rows are tracked by position rather than by index label, so the
    descriptions stay aligned with their rows even when *dataframe* is a
    slice whose index does not start at 0.

    Args:
        dataframe: DataFrame with a 'Links' column of posting URLs.
            It is not modified.

    Returns:
        A copy of the rows for which a description was found, with the
        'Description' column replaced by the full description text.
    """
    links_list = dataframe['Links'].tolist()
    descriptions = []
    kept_positions = []  # row positions (not index labels) that got a description

    driver = webdriver.Firefox()
    try:
        for position, link in enumerate(links_list):
            driver.get(link)
            # This sets the timeout for the element lookup below; it is not a sleep.
            driver.implicitly_wait(random.randint(3, 8))

            try:
                jd = driver.find_element(By.XPATH, '//div[@id="jobDescriptionText"]').text
            except NoSuchElementException:
                print(f"No job description found for link at index {position}: {link}. Skipping...")
                continue

            kept_positions.append(position)
            descriptions.append(jd)

            # Random pause between page loads
            time.sleep(random.randint(5,10))
    finally:
        # Always close the browser, even if a page load raised
        driver.quit()

    # Select the kept rows positionally; descriptions has one entry per kept row
    dataframe = dataframe.iloc[kept_positions].copy()
    dataframe['Description'] = descriptions

    return dataframe

In [None]:
from concurrent.futures import ThreadPoolExecutor
import random
import time
from selenium.common.exceptions import NoSuchElementException

In [None]:
# Define the number of threads (workers)
num_threads = 4  # Adjust as needed

# Run scrape_descriptions on every dataframe slice, at most num_threads at a time
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # One task per slice; the futures keep the order of dataframe_slices
    futures = [executor.submit(scrape_descriptions, df_slice) for df_slice in dataframe_slices]

    # Wait for all tasks to complete; .result() re-raises any exception
    # that occurred inside a worker, so failures surface here
    for future in futures:
        future.result()

In [None]:
# Collect the processed slice from every submitted task, in submission order.
# Iterating over all futures works for any number of slices instead of
# assuming there are exactly eight.
result_dataframes = [future.result() for future in futures]

In [None]:
# Stitch the processed slices back into one table with a fresh 0..n-1 index.
concatenated_df = pd.concat(result_dataframes , ignore_index=True)

In [None]:
# Display the combined postings with their scraped descriptions.
concatenated_df

### Save results

In [None]:
# Write the combined results to "<timestamp>_<position>_<location>.csv".
# The timestamp goes down to the minute; use '%Y-%m-%d' for a date-only stamp.
date = datetime.today().strftime('%Y-%m-%d_%H-%M')
concatenated_df.to_csv(f"{date}_{position}_{locations}.csv", index=False)