In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import re
from selenium.webdriver.common.by import By
import time
import numpy as np


In [None]:
# Column layouts of the two result tables.
JOB_COLUMNS = [
    'job_title', 'experience_needed', 'career_level', 'education_level',
    'min_salary', 'max_salary', 'gender', 'job_categories',
    'job_description', 'job_requirements', 'vacancies', 'company_name',
    'location', 'job_types', 'job_skills',
]
COMPANY_COLUMNS = [
    'company_url', 'company_name', 'location', 'foundation_date',
    'min_size', 'max_size', 'specialities', 'industry', 'about',
]

# Empty accumulators: extract_job_info / extract_company_info append one row
# per scraped page to these frames.
job_df = pd.DataFrame(columns=JOB_COLUMNS)
company_df = pd.DataFrame(columns=COMPANY_COLUMNS)

# company_http: distinct company-profile links found on job pages.
# http_links: every job-posting link that was handed to extract_job_info.
company_http, http_links = [], []

In [None]:
# Scraping all the information

In [None]:
_NOT_SPECIFIED = "not specified"


def _element_text(element):
    """Return the whitespace-stripped text of a parsed element, or None if it is missing."""
    return element.get_text().strip() if element is not None else None


def _specified_text(element):
    """Return the element's stripped text, or None when it is missing, empty or "Not Specified"."""
    text = _element_text(element)
    if not text or text.lower() == _NOT_SPECIFIED:
        return None
    return text


def _digit_groups(text):
    """Return every number found in text as a digit string; "12,000" counts as one number."""
    if not text:
        return []
    return [group.replace(',', '') for group in re.findall(r'\d[\d,]*', text)]


def _text_list(elements):
    """Return the stripped text of each element, or None when the selection is empty."""
    return [element.get_text().strip() for element in elements] if elements else None


def extract_job_info(url, wait_seconds=5):
    """Scrape one job-posting page and append it as a row to the global ``job_df``.

    The page is opened in a fresh Edge WebDriver session, its rendered HTML is
    parsed with BeautifulSoup, and the individual fields are located through
    CSS selectors. The company-profile link found on the page, if any, is added
    to the global ``company_http`` list unless it is already there.

    Args:
        url: Address of the job posting to open.
        wait_seconds: Seconds to sleep after navigation before reading the page
            source (presumably to let the page finish rendering). Defaults to 5.

    Returns:
        None. Results are delivered through the ``job_df`` and ``company_http``
        globals.

    Notes:
        Any exception raised while loading or parsing the page is printed and
        swallowed, so one bad posting does not stop a crawl. The browser is
        always closed. Fields shown as "Not Specified" are stored as None.
        ``experience_needed``, ``min_salary`` and ``max_salary`` are stored as
        digit strings; ``vacancies`` as an int.
    """
    global job_df

    header = '#app > div > main > section.css-dy1y6u'
    details = '#app > div > main > section.css-3kx5e2'

    driver = webdriver.Edge()
    try:
        # Navigation is inside the try so that a failed load still reaches
        # driver.quit() instead of leaking a browser process.
        driver.get(url)
        time.sleep(wait_seconds)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        def detail(label):
            """Locate the value cell of the labelled row in the details section."""
            return soup.select_one(
                f'{details} > div:-soup-contains("{label}") > span.css-47jx3m > span'
            )

        # The company anchor carries both the profile link and the company name.
        company_anchor = soup.select_one(f'{header} > div > strong.css-9geu3q > div > a')
        if company_anchor is not None:
            link = company_anchor.get('href')
            # An anchor without href yields None, which cannot be visited later.
            if link and link not in company_http:
                company_http.append(link)
        company_name = _element_text(company_anchor) if company_anchor is not None else "Confidential Company"

        job_title = _element_text(soup.select_one(f'{header} > div > h1'))

        experience_numbers = _digit_groups(_specified_text(detail("Experience Needed")))
        experience_needed = experience_numbers[0] if experience_numbers else None

        career_level = _specified_text(detail("Career Level"))
        education_level = _specified_text(detail("Education Level"))
        gender = _specified_text(detail("Gender"))

        min_salary = None
        max_salary = None
        salary_text = _specified_text(detail("Salary"))
        if salary_text and "Confidential" not in salary_text:
            salary_numbers = _digit_groups(salary_text)
            if salary_numbers:
                min_salary = salary_numbers[0]
                if len(salary_numbers) > 1:
                    max_salary = salary_numbers[1]

        job_categories = _text_list(soup.select(f'{details} > div.css-13sf2ik > ul > li'))

        job_description = _element_text(
            soup.select_one('#app > div > main > section:-soup-contains("Job Description") > div')
        )
        job_requirements = _element_text(
            soup.select_one('#app > div > main > section:-soup-contains("Job Requirements") > div')
        )

        vacancy_numbers = _digit_groups(_specified_text(soup.select_one(
            f'{header} > div > div.css-104dl8g > div > span > span:-soup-contains("open")'
        )))
        vacancies = int(vacancy_numbers[0]) if vacancy_numbers else None

        # The location is the last child node of the <strong> element.
        location = None
        location_element = soup.select_one(f'{header} > div > strong.css-9geu3q')
        if location_element is not None and location_element.contents:
            tail = location_element.contents[-1]
            # Store a plain str so the row does not keep the parse tree alive.
            raw_location = str(tail) if isinstance(tail, str) else tail.get_text()
            location = raw_location.strip() or None

        job_types = _text_list(soup.select(f'{header} > div > div.css-11rcwxl > a'))
        job_skills = _text_list(soup.select(f'{details} > div.css-s2o0yh a'))

        data = {
            'job_title': [job_title],
            'experience_needed': [experience_needed],
            'career_level': [career_level],
            'education_level': [education_level],
            'min_salary': [min_salary],
            'max_salary': [max_salary],
            'gender': [gender],
            'job_categories': [job_categories],
            'job_description': [job_description],
            'job_requirements': [job_requirements],
            'vacancies': [vacancies],
            'company_name': [company_name],
            'location': [location],
            'job_types': [job_types],
            'job_skills': [job_skills]
        }

        job_df = pd.concat([job_df, pd.DataFrame(data)], ignore_index=True)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

In [None]:
def _profile_text(soup, selector):
    """Return the stripped text of the first element matching selector, or None."""
    element = soup.select_one(selector)
    return element.get_text().strip() if element is not None else None


def _profile_link_texts(soup, selector):
    """Return the stripped text of every <a> inside the matched element, or None if unmatched."""
    element = soup.select_one(selector)
    if element is None:
        return None
    return [anchor.get_text().strip() for anchor in element.find_all('a')]


def _profile_about(driver, soup):
    """Return the company's "about" text, expanded in the live page when possible.

    First tries to click the span inside the about paragraph and read the
    paragraph from the live page, with the text of its spans removed. If any
    step of that fails, falls back to the first child node of the paragraph in
    the already-parsed HTML. Returns None when the paragraph is absent or empty.
    """
    try:
        driver.find_element(By.CSS_SELECTOR, "#profile-section > p > span").click()

        paragraph = driver.find_element(By.CSS_SELECTOR, "#profile-section > p")
        about = paragraph.text
        for span_element in paragraph.find_elements(By.TAG_NAME, "span"):
            span_text = span_element.text
            if span_text:
                about = about.replace(span_text, '')
        return about.strip()
    except Exception:
        # Best effort: no toggle (or it could not be clicked), so use the
        # static HTML captured before the click.
        about_element = soup.select_one('#profile-section > p')
        if about_element is None or not about_element.contents:
            return None
        return about_element.contents[0].get_text().strip()


def extract_company_info(url, wait_seconds=5):
    """Scrape one company-profile page and append it as a row to the global ``company_df``.

    The page is opened in a fresh Edge WebDriver session and its rendered HTML
    is parsed with BeautifulSoup; the "about" text is additionally read from
    the live page so that it can be expanded first.

    Args:
        url: Address of the company profile to open.
        wait_seconds: Seconds to sleep after navigation before reading the page
            source (presumably to let the page finish rendering). Defaults to 5.

    Returns:
        None. The scraped row is appended to the ``company_df`` global.

    Notes:
        Any exception raised while loading or parsing the page is printed and
        swallowed; the browser is always closed. ``min_size`` and ``max_size``
        are stored as digit strings taken from the "Company Size" text.
    """
    global company_df

    driver = webdriver.Edge()
    try:
        # Navigation is inside the try so that a failed load still reaches
        # driver.quit() instead of leaking a browser process.
        driver.get(url)
        time.sleep(wait_seconds)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        company_url_element = soup.select_one('#app > div > div:nth-child(2) > div > div > div.css-12e2e2p > div.css-aqnjlk > div > a')
        company_url = company_url_element.get('href') if company_url_element is not None else None

        company_name = _profile_text(soup, '#app > div > div:nth-child(2) > div > div > div.css-12e2e2p > div.css-1eoy87d > h1')
        location = _profile_text(soup, '#profile-section > div > span:-soup-contains("Location") > span.css-16heon9')
        foundation_date = _profile_text(soup, '#profile-section > div > span:-soup-contains("Founded") > span.css-6whuzn')

        min_size = None
        max_size = None
        size_text = _profile_text(soup, '#profile-section > div > span:-soup-contains("Company Size") > span.css-16heon9')
        if size_text:
            # "1,001" is read as one number rather than as "1" and "001".
            numbers = [group.replace(',', '') for group in re.findall(r'\d[\d,]*', size_text)]
            if numbers:
                min_size = numbers[0]
                if len(numbers) > 1:
                    max_size = numbers[1]

        industry = _profile_link_texts(soup, '#profile-section > div > span:-soup-contains("Industry") > span.css-16heon9')
        specialities = _profile_link_texts(soup, '#profile-section > div > span:-soup-contains("Specialities") > span.css-16heon9')

        about = _profile_about(driver, soup)

        data = {
            'company_url': [company_url],
            'company_name': [company_name],
            'location': [location],
            'foundation_date': [foundation_date],
            'min_size': [min_size],
            'max_size': [max_size],
            'specialities': [specialities],
            'industry': [industry],
            'about': [about]
        }

        company_df = pd.concat([company_df, pd.DataFrame(data)], ignore_index=True)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

In [None]:
# Walk the search-result pages (the `start` query parameter goes 0..131) and
# scrape every job posting linked from them.
# NOTE(review): 132 is hard-coded; presumably the number of result pages at
# scrape time — confirm before re-running.
for i in range(0, 132):
    loop_url = "https://wuzzuf.net/search/jobs/?a=hpb&filters%5Broles%5D%5B0%5D=IT%2FSoftware%20Development&page=&start=" + str(i)
    try:
        # Without a timeout a stalled connection would hang the crawl forever.
        response = requests.get(loop_url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', class_="css-o171kl")

    for link in links:
        href = link.get('href')
        # Anchors without an href yield None; only absolute links are followed.
        if href and href.startswith(('http:', 'https:')):
            extract_job_info(href)
            http_links.append(href)

In [None]:
# Visit every company profile link collected while scraping the job pages.
for company_link in company_http:
    extract_company_info(company_link)

In [None]:
# Cleaning the data

In [None]:
# Number of missing values per column of the scraped job table.
job_df.isnull().sum()

In [None]:
# Discard rows without a title, then keep only the first posting for each
# (company_name, job_title) pair.
job_df = (
    job_df
    .dropna(subset=['job_title'])
    .drop_duplicates(subset=['company_name', 'job_title'], keep='first')
)

In [None]:
# Split "A, B, ..." once and keep the first two comma-separated pieces.
location_pieces = job_df['location'].str.split(', ')
job_df['Location_Part1'] = location_pieces.str[0]
job_df['Location_Part2'] = location_pieces.str[1]

In [None]:

# Build a long-format table with one row per (job posting, category).
result = []

for job_title, company_name, job_categories in zip(
        job_df['job_title'], job_df['company_name'], job_df['job_categories']):
    # A string value is treated as the text form of a list, e.g. "['a', 'b']"
    # (presumably for data reloaded from a file — verify against the caller).
    if isinstance(job_categories, str):
        job_categories = job_categories.replace('[', '').replace(']', '').replace("'", "").split(', ')

    # Postings without categories hold None/NaN here; iterating that would raise.
    if not isinstance(job_categories, (list, tuple)):
        continue

    for category in job_categories:
        result.append({'job_title': job_title, 'company_name': company_name, 'job_category': category})

# Explicit columns keep the frame usable even when no rows were produced.
job_categories_df = pd.DataFrame(result, columns=['job_title', 'company_name', 'job_category'])

In [None]:
# Integrity check: drop category rows whose company_name does not occur in job_df.
missing = set(job_categories_df['company_name']).difference(job_df['company_name'])

print("Company names in job_categories_df but not in job_df:")
print(missing)

job_categories_df = job_categories_df.loc[~job_categories_df['company_name'].isin(missing)]

In [None]:
# Integrity check: drop category rows whose job_title does not occur in job_df.
missing = set(job_categories_df['job_title']).difference(job_df['job_title'])

print("Job titles in job_categories_df but not in job_df:")
print(missing)

job_categories_df = job_categories_df.loc[~job_categories_df['job_title'].isin(missing)]

In [None]:
# Persist the job/category table as a '~'-separated file without the index column.
job_categories_df.to_csv('job_categories.csv', sep='~', index=False)

In [None]:
# Build a long-format table with one row per (job posting, job type).
result = []

for job_title, company_name, job_types in zip(
        job_df['job_title'], job_df['company_name'], job_df['job_types']):
    # A string value is treated as the text form of a list, e.g. "['a', 'b']"
    # (presumably for data reloaded from a file — verify against the caller).
    if isinstance(job_types, str):
        job_types = job_types.replace('[', '').replace(']', '').replace("'", "").split(', ')

    # Postings without job types hold None/NaN here; iterating that would raise.
    if not isinstance(job_types, (list, tuple)):
        continue

    # Named job_type rather than `type` so the builtin is not shadowed for
    # the rest of the notebook.
    for job_type in job_types:
        result.append({'job_title': job_title, 'company_name': company_name, 'job_type': job_type})

# Explicit columns keep the frame usable even when no rows were produced.
job_types_df = pd.DataFrame(result, columns=['job_title', 'company_name', 'job_type'])

In [None]:
# Integrity check: drop job-type rows whose job_title does not occur in job_df.
missing = set(job_types_df['job_title']).difference(job_df['job_title'])

print("Job titles in job_types_df but not in job_df:")
print(missing)

job_types_df = job_types_df.loc[~job_types_df['job_title'].isin(missing)]

In [None]:
# Integrity check: drop job-type rows whose company_name does not occur in job_df.
missing = set(job_types_df['company_name']).difference(job_df['company_name'])

print("Company names in job_types_df but not in job_df:")
print(missing)

job_types_df = job_types_df.loc[~job_types_df['company_name'].isin(missing)]

In [None]:
# Persist the job/type table as a '~'-separated file without the index column.
job_types_df.to_csv('job_types.csv', sep='~', index=False)

In [None]:
# Build a long-format table with one row per (job posting, skill).
result = []

for job_title, company_name, job_skills in zip(
        job_df['job_title'], job_df['company_name'], job_df['job_skills']):
    # A string value is treated as the text form of a list, e.g. "['a', 'b']"
    # (presumably for data reloaded from a file — verify against the caller).
    if isinstance(job_skills, str):
        job_skills = job_skills.replace('[', '').replace(']', '').replace("'", "").split(', ')

    # Postings without skills hold None/NaN here; iterating that would raise.
    if not isinstance(job_skills, (list, tuple)):
        continue

    for skill in job_skills:
        result.append({'job_title': job_title, 'company_name': company_name, 'job_skill': skill})

# Explicit columns keep the frame usable even when no rows were produced.
job_skills_df = pd.DataFrame(result, columns=['job_title', 'company_name', 'job_skill'])

In [None]:
# Integrity check: drop skill rows whose company_name does not occur in job_df.
missing = set(job_skills_df['company_name']).difference(job_df['company_name'])

print("Company names in job_skills_df but not in job_df:")
print(missing)

job_skills_df = job_skills_df.loc[~job_skills_df['company_name'].isin(missing)]

In [None]:
# Integrity check: drop skill rows whose job_title does not occur in job_df.
skills = set(job_skills_df['job_title'])

job = set(job_df['job_title'])

# Titles present in job_skills_df that have no counterpart in job_df.
missing = skills - job

# The message now names the difference that is actually computed above.
print("Job titles in job_skills_df but not in job_df:")
print(missing)

mask = job_skills_df['job_title'].isin(missing)

job_skills_df = job_skills_df[~mask]

In [None]:
# Lower-case the skills so that differently-cased spellings collapse, then keep
# one row per (job_title, company_name, job_skill). assign() builds a new frame
# instead of writing into the boolean-filtered one from the previous cells,
# which can trigger pandas' SettingWithCopyWarning.
job_skills_df = (
    job_skills_df
    .assign(job_skill=job_skills_df['job_skill'].str.lower())
    .drop_duplicates(subset=['job_title', 'company_name', 'job_skill'], keep='first')
)
job_skills_df.to_csv('job_skills.csv', index=False, sep='~')

In [None]:
# Drop the raw location and the list-valued columns from the main job table.
job_df = job_df.drop(columns=['location', 'job_categories', 'job_types', 'job_skills'])

In [None]:
# Numeric fields: missing values become the sentinel -1, then everything is cast to int.
for numeric_column in ('experience_needed', 'min_salary', 'max_salary'):
    job_df[numeric_column] = job_df[numeric_column].fillna(-1).astype(int)

# Free-text fields: collapse each run of line breaks into a single space.
for text_column in ('job_description', 'job_requirements'):
    job_df[text_column] = job_df[text_column].str.replace(r'[\n\r]+', ' ', regex=True)

In [None]:
# Integrity check: drop jobs whose company has no row in company_df.
job_company_names = set(job_df['company_name'])

company_company_names = set(company_df['company_name'])

# 'Confidential Company' is the placeholder name given to postings without a
# company link. It has no scraped profile, and its placeholder row is only
# appended to company_df in a later cell, so it is excluded here; otherwise
# every confidential posting would be removed from job_df.
missing_company_names = job_company_names - company_company_names - {'Confidential Company'}

print("Company names in job_df but not in company_df:")
print(missing_company_names)

mask = job_df['company_name'].isin(missing_company_names)

job_df = job_df[~mask]

In [None]:
# Persist the cleaned job table as a '~'-separated file without the index column.
job_df.to_csv('job_postings_data_final.csv', sep='~', index=False)

In [None]:
# Build a long-format table with one row per (company, industry) and save it.
result = []

for company_name, industries in zip(company_df['company_name'], company_df['industry']):
    # A string value is treated as the text form of a list, e.g. "['a', 'b']"
    # (presumably for data reloaded from a file — verify against the caller).
    if isinstance(industries, str):
        industries = industries.replace('[', '').replace(']', '').replace("'", "").split(', ')

    # extract_company_info stores the industries as a list, so lists must be
    # accepted here; None/NaN means the profile listed no industry.
    if not isinstance(industries, (list, tuple)):
        continue

    for industry in industries:
        if industry:  # skip empty names, e.g. from an empty string value
            result.append({'company_name': company_name, 'industry': industry})

# Explicit columns keep the header in the output even when no rows were produced.
company_industries_df = pd.DataFrame(result, columns=['company_name', 'industry'])

company_industries_df.to_csv('company_industries.csv', index=False, sep='~')

In [None]:
# Derive 'city' and 'country' from the free-text company location.
# "City, Country" is split at the LAST comma, so a longer value such as
# "District, City, Country" keeps the country in 'country'. A value without a
# comma is taken to be the country alone; missing values pass through unchanged.
cities = []
countries = []

for location_value in company_df['location']:
    if isinstance(location_value, str) and ',' in location_value:
        # rsplit on the bare comma always yields two parts, with or without a
        # following space; strip removes the surrounding whitespace.
        city, country = (part.strip() for part in location_value.rsplit(',', 1))
    else:
        city, country = None, location_value
    cities.append(city)
    countries.append(country)

# Assigning whole columns creates them even when company_df has no rows.
company_df['city'] = cities
company_df['country'] = countries

In [None]:
# Remove the columns that are no longer needed in the main company table.
company_df = company_df.drop(columns=['specialities', 'location', 'industry'])

# Append a placeholder row for postings whose company is 'Confidential Company'.
confidential_row_df = pd.DataFrame({
    'company_url': [None],
    'company_name': ['Confidential Company'],
    'foundation_date': [None],
    'min_size': [None],
    'max_size': [None],
    'about': [None],
    'city': [None],
    'country': [None]
})
company_df = pd.concat([company_df, confidential_row_df], ignore_index=True)

# Numeric fields: missing values become the sentinel -1, then everything is cast to int.
for numeric_column in ('foundation_date', 'min_size', 'max_size'):
    company_df[numeric_column] = company_df[numeric_column].fillna(-1).astype(int)

# Collapse each run of line breaks in the description into a single space.
company_df['about'] = company_df['about'].str.replace(r'[\n\r]+', ' ', regex=True)

company_df.to_csv('company_data_final.csv', sep='~', index=False)