# Indeed Job Extraction :

### Import Libraries :

In [42]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import concurrent.futures
from concurrent.futures import as_completed, ThreadPoolExecutor

### Create Driver :

In [43]:
def create_driver(proxy):
    """Start a Chrome WebDriver session that sends its traffic through ``proxy``.

    Parameters
    ----------
    proxy : str or None
        Proxy address, forwarded to Chrome as ``--proxy-server=<proxy>``.
        A falsy value (``None`` or ``''``) starts Chrome without a proxy
        instead of passing the literal text ``None`` to the browser.

    Returns
    -------
    selenium.webdriver.Chrome
        The running browser session. The caller must call ``quit()`` on it.
    """
    # Imported here so this cell works without touching the imports cell.
    from selenium.webdriver.chrome.service import Service

    chrome_options = Options()
    if proxy:
        chrome_options.add_argument(f'--proxy-server={proxy}')

    # 'Path' is the chromedriver location placeholder kept from the original cell.
    # Selenium 4 deprecates passing executable_path directly to Chrome(); the
    # driver location is supplied through a Service object instead.
    service = Service(executable_path='Path')
    return webdriver.Chrome(service=service, options=chrome_options)

### Extract Job Details :

In [41]:
def _find_text(div, by, selector):
    """Return the stripped text of the first element matching ``selector`` under ``div``, or None if there is none."""
    try:
        return div.find_element(by, selector).text.strip()
    except NoSuchElementException:
        return None


def get_job_details(div):
    """Build the job record for one result card.

    Every field is optional: when the corresponding element is not found in
    the card, the field is set to ``None`` instead of raising.

    Parameters
    ----------
    div : WebElement
        The card element to read from. Only ``find_element`` is called on it.

    Returns
    -------
    dict
        A record with the keys ``reference``, ``name``, ``sections``,
        ``skills`` and ``tags``. ``skills`` is always an empty list and the
        ``profile_background`` section description is always ``None``.
    """
    # The job key is read from the id of the title link, minus the "job_" marker.
    try:
        job_key = div.find_element(By.CSS_SELECTOR, 'h2.jobTitle a').get_attribute('id')
    except NoSuchElementException:
        job_key = None
    # get_attribute may return None when the link has no id; leave it as None
    # instead of failing on .replace().
    if job_key is not None:
        job_key = job_key.replace('job_', '')

    job_title = _find_text(div, By.CSS_SELECTOR, 'h2.jobTitle')
    company_name = _find_text(div, By.CSS_SELECTOR, 'span.companyName')
    location = _find_text(div, By.CLASS_NAME, 'companyLocation')
    salary = _find_text(div, By.CLASS_NAME, 'metadata.salary-snippet-container')
    job_description = _find_text(div, By.CSS_SELECTOR, 'div.job-snippet ul')

    job_data = {
        "reference": job_key,
        "name": job_title,
        "sections": [
            {
                "name": "description",
                "title": "Description du poste",
                "description": job_description,
            },
            {
                "name": "profile_background",
                "title": "Profil cherché",
                "description": None,
            },
        ],
        "skills": [],
        "tags": [
            {"name": "salary_expectation", "value": salary},
            {"name": "company", "value": company_name},
            {"name": "location", "value": location},
        ],
    }

    return job_data

### Extract Data :

In [31]:
def extract_data(page_range, proxy, proxy_pages_limit):
    """Scrape the given result pages sequentially and return the jobs found.

    Parameters
    ----------
    page_range : iterable of int
        Page indices to visit. Page ``p`` is requested with ``start=p * 10``.
    proxy : str
        Proxy address handed to ``create_driver``.
    proxy_pages_limit : int
        Number of pages fetched with one browser session before it is closed
        and a fresh one is created.

    Returns
    -------
    pandas.DataFrame
        One row per job record returned by ``get_job_details``. The frame is
        empty when ``page_range`` is empty or no cards were found.

    Notes
    -----
    The browser session is always closed, including when a page load or the
    parsing raises, so a failure does not leave a Chrome process behind.
    """
    jobs = []
    driver = None

    try:
        for i, page in enumerate(page_range):
            # Rotate to a fresh browser session every `proxy_pages_limit` pages.
            if i % proxy_pages_limit == 0:
                if driver is not None:
                    # Clear the handle first so the finally block never quits
                    # the same session a second time if quit()/create fails.
                    stale, driver = driver, None
                    stale.quit()
                driver = create_driver(proxy)

            url = f'https://uk.indeed.com/jobs?q=all&start={page * 10}'
            driver.get(url)

            divs = driver.find_elements(By.CSS_SELECTOR, 'div.job_seen_beacon')
            jobs.extend(get_job_details(div) for div in divs)

            # Fixed pause between page loads.
            time.sleep(5)
    finally:
        if driver is not None:
            driver.quit()

    return pd.DataFrame(jobs)



### Extract Data Using Proxies and Multiple Threads :

In [32]:
def new_extract_all_data_pro(job_number=100, proxies=None, num_threads=10, proxy_pages_limit=2, jobs_per_page=15):
    """Scrape enough result pages to cover ``job_number`` jobs, split across threads.

    Parameters
    ----------
    job_number : int
        Number of jobs wanted; converted to a page count with ``jobs_per_page``.
    proxies : list of str, optional
        Proxy addresses, assigned to threads round-robin. ``None`` or an empty
        list falls back to the single built-in default proxy.
    num_threads : int
        Number of worker threads, and of page ranges handed to them.
    proxy_pages_limit : int
        Forwarded to ``extract_data``.
    jobs_per_page : int
        Assumed number of jobs on one result page.

    Returns
    -------
    list
        One ``extract_data`` result per thread, in page order (thread 0 first).
        When there are fewer pages than threads, the trailing threads receive
        an empty page range.
    """
    # An empty list would otherwise fail on `i % len(proxies)`.
    if not proxies:
        proxies = ['135.181.89.201:8080']

    # -(-a // b) is integer ceiling division.
    page_number = -(-job_number // jobs_per_page)
    pages_per_thread = -(-page_number // num_threads)

    page_ranges = [
        range(i * pages_per_thread, min((i + 1) * pages_per_thread, page_number))
        for i in range(num_threads)
    ]

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(extract_data, page_range, proxies[i % len(proxies)], proxy_pages_limit)
            for i, page_range in enumerate(page_ranges)
        ]
        # Collect in submission order so the result does not depend on which
        # thread happens to finish first.
        dfs = [future.result() for future in futures]

    return dfs


### Test :

In [33]:
# Smoke test: request 89 jobs using the default proxy, thread count and page size.
data = new_extract_all_data_pro(89)

  after removing the cwd from sys.path.


In [40]:
# Display the scraped result.
# NOTE(review): new_extract_all_data_pro returns a list of DataFrames, but the
# saved output looks like a single frame indexed 0-87 — presumably it was
# combined in a cell that is no longer present; confirm before relying on it.
data

Unnamed: 0,reference,name,sections,skills,tags
0,2078924de4c3ad24,"Egg Collector / Farm Worker at Worlingworth, S...","[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£13...."
1,f82753a46aa10924,Administrator,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£25,..."
2,83ae5e98c629ee42,upon-Tyne -Waiting Staff,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': 'Up t..."
3,c4064f46fbda715f,Security Support Officer,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£31,..."
4,0182b518262e87e5,Warehouse Operative Mornings,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£12...."
...,...,...,...,...,...
84,52a7ed4455649da8,Sales Coordinator Special Markets,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£46,..."
85,fd3244632c3893c0,JUNIOR CREATIVE,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': None}..."
86,95ec7cf83d7553cc,Operations Assistant,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': '£26,..."
87,b7358556a3b613cc,Investigators,"[{'name': 'description', 'title': 'Description...",[],"[{'name': 'salary_expectation', 'value': 'From..."
