In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from playwright.sync_api import sync_playwright
import time


In [None]:
# Crawl the paginated job-search results with Playwright and save the raw
# listing HTML of every visited page into one file for offline parsing.

# NOTE(review): the `_s.crb` query parameter looks like a session-bound token;
# confirm the URL is still valid before a fresh run.
START_URL = "https://career2.successfactors.eu/portalcareer?company=ump&career%5fns=job%5flisting%5fsummary&navBarLevel=JOB%5fSEARCH&_s.crb=TDR5NSydm2PaVGPZuIYVUH1Bo7amz3qFPL45aIxj%2fmA%3d"
MAX_PAGES = 75         # upper bound on the number of result pages to visit
PAGE_SETTLE_MS = 3000  # fixed pause that gives each results page time to render
NEXT_PAGE_SELECTOR = 'li.sfPaginatorArrowContainer.paginationArrowContainer.next a'

# Store HTML job listings in a list (one entry per results page)
html_jobs = []

# Launch Playwright and create a new browser instance
with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        page = browser.new_page()

        # Navigate to the target URL
        page.goto(START_URL)

        for i in range(MAX_PAGES):
            # Pause so the page is fully loaded. wait_for_timeout is used
            # instead of time.sleep because the sync API relies on its own
            # event loop being serviced while waiting.
            page.wait_for_timeout(PAGE_SETTLE_MS)

            # Append the inner HTML of the job listings to the list
            html_jobs.append(page.locator("td.jobSearchResults").inner_html())

            # Capture a screenshot (optional); taken before paginating so
            # img{i}.png shows the same page whose HTML was just stored.
            page.screenshot(path=f"img{i}.png")

            # Do not paginate past the last page we intend to capture:
            # a failing click here would abort the run before anything
            # is written to disk.
            if i == MAX_PAGES - 1:
                break

            # Stop early when the site has no further page to offer.
            next_link = page.locator(NEXT_PAGE_SELECTOR)
            if next_link.count() == 0:
                break

            # Click on the next page button
            next_link.nth(0).click()
    finally:
        # Close the browser even if navigation or scraping raised
        browser.close()

# Write the full HTML job listings to a file
with open("full_html_job.html", "w+", encoding="utf8") as f:
    full_html_job = "".join(html_jobs)
    f.write(full_html_job)

In [3]:
def getData(page):
    """Parse job-listing HTML into one tuple per job.

    Args:
        page: HTML markup (str) containing the job search result rows.

    Returns:
        A list of ``(title, id, date, country, department, link)`` tuples.
        Titles and links are read from the same ``a.jobTitle`` anchors so
        they always stay aligned. The remaining fields come from the
        ``span.jobContentEM`` elements, consumed in consecutive groups of
        four. ``zip`` truncates to the shortest of the collected lists.

    Raises:
        IndexError: if the number of ``span.jobContentEM`` elements is not
            a multiple of four.
        KeyError: if a job-title anchor has no ``href`` attribute.
    """
    # Create a BeautifulSoup object to parse the HTML page
    soup = BeautifulSoup(page, "html.parser")

    # Use ONE anchor list for both the title text and the link. Querying
    # them separately (row-scoped vs. document-wide) can return different
    # sets of anchors and silently pair a title with the wrong URL.
    job_anchors = soup.select('tr a.jobTitle')
    Job_titles = [anchor.text for anchor in job_anchors]
    job_title_links = ['https://career2.successfactors.eu' + anchor['href'] for anchor in job_anchors]

    # Each job contributes four consecutive detail spans: id, date,
    # country, department.
    details = soup.select('tr span.jobContentEM')
    group_starts = range(0, len(details), 4)
    Id = [details[i].text for i in group_starts]
    # The first 10 characters of the date span are dropped, presumably a
    # fixed label prefix; confirm against the page markup.
    Date = [details[i + 1].text[10:] for i in group_starts]
    Country = [details[i + 2].text for i in group_starts]
    Departement = [details[i + 3].text for i in group_starts]

    # Zip all the extracted information into a list of tuples and return it
    return list(zip(Job_titles, Id, Date, Country, Departement, job_title_links))


In [7]:
# Load the listing HTML that the crawl step saved to disk
with open("full_html_job.html", mode="r", encoding="utf8") as html_file:
    pages = html_file.read()

# Turn the raw markup into one tuple per job posting
Jobs_info = getData(pages)

# Tabulate the tuples; column order mirrors the tuple layout from getData
column_names = ["Title", "Id", "Date", "Country", "Departement", "link"]
Jobs_df = pd.DataFrame(data=Jobs_info, columns=column_names)


In [9]:
# Sanity check: (rows, columns) of the parsed job table
Jobs_df.shape


(744, 6)

In [10]:
# Preview the first rows of the parsed job table
Jobs_df.head()

Unnamed: 0,Title,Id,Date,Country,Departement,link
0,Postdoctoral Fellow on Environmental Microbiology,8071,04/28/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...
1,Business Operations Manager - Phosphate Valley...,9097,04/28/2023,Morocco,Sales Training,https://career2.successfactors.eu/career?caree...
2,Business Operations Manager - Phosphate Valley...,9099,04/28/2023,Morocco,Sales Training,https://career2.successfactors.eu/career?caree...
3,Professeurs affiliés,9057,04/20/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...
4,Technicien Agricole,9012,04/20/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...


In [12]:
# Download every posting page listed in the 'link' column and extract its
# description text. `job_descriptions` ends up with exactly one entry per
# row of Jobs_df, in row order, so it can be attached as a column later.
REQUEST_TIMEOUT_S = 30  # without a timeout a stalled server blocks the loop forever

job_descriptions = []
# A single Session reuses the underlying connection across requests.
with requests.Session() as session:
    for link in Jobs_df["link"]:
        response = session.get(link, timeout=REQUEST_TIMEOUT_S)  # send a request to the link
        soup = BeautifulSoup(response.text, 'html.parser')  # parse the HTML content
        description_div = soup.find('div', class_='joqReqDescription')
        # A posting without the description container is recorded as None
        # instead of raising AttributeError and discarding all progress.
        job_description = description_div.text if description_div is not None else None
        job_descriptions.append(job_description)


In [19]:
# Build the final table as a NEW DataFrame. A plain `Jobs_df_F = Jobs_df`
# only creates an alias, so adding the column would also mutate Jobs_df
# and make this cell non-idempotent; `assign` returns a fresh frame.
Jobs_df_F = Jobs_df.assign(job_description=job_descriptions)

In [21]:
# Preview the first rows of the final table, including job_description
Jobs_df_F.head()

Unnamed: 0,Title,Id,Date,Country,Departement,link,job_description
0,Postdoctoral Fellow on Environmental Microbiology,8071,04/28/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...,\n\nMohammed VI Polytechnic University is an i...
1,Business Operations Manager - Phosphate Valley...,9097,04/28/2023,Morocco,Sales Training,https://career2.successfactors.eu/career?caree...,\n\nMohammed VI Polytechnic University is an i...
2,Business Operations Manager - Phosphate Valley...,9099,04/28/2023,Morocco,Sales Training,https://career2.successfactors.eu/career?caree...,\n\nMohammed VI Polytechnic University is an i...
3,Professeurs affiliés,9057,04/20/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...,\n\nAbout UM6P:\n \nLocated at the heart of th...
4,Technicien Agricole,9012,04/20/2023,Morocco,Engineering,https://career2.successfactors.eu/career?caree...,\n\nMohammed VI Polytechnic University is an i...


In [27]:
# Persist the final table as a comma-separated UTF-8 file, without the row index
Jobs_df_F.to_csv(
    "Um6p_Jobs.csv",
    index=False,
    sep=',',
    encoding='utf-8',
)