## INSTALLING LIBRARY FOR BEAUTIFULSOUP

In [0]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/187.3 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.13.4 bs4-0.0.2 soupsieve-2.7
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


# DATABRICKS WIDGET SETUP
## # 

In [0]:
dbutils.widgets.text("role", "data engineer", "Job Role")
dbutils.widgets.text("keywords", "python,spark,airflow", "Keywords (comma-separated)")

# Experience Level - user-friendly labels
dbutils.widgets.dropdown(
    "experience",
    "Associate",
    ["Internship", "Entry Level", "Associate", "Mid-Senior Level", "Director", "Executive"],
    "Experience Level"
)

# Work Type - user-friendly labels
dbutils.widgets.dropdown(
    "work_type",
    "Remote",
    ["Remote", "On-site", "Hybrid"],
    "Work Type"
)

dbutils.widgets.text("posted_hours", "24", "Posted within last N hours")

dbutils.widgets.dropdown(
    "location",
    "pakistan",
    ["pakistan", "karachi", "uae", "US", "lahore", "austrailia"],
    "Job Location"
)





## MAPPING INPUTS

In [0]:
# Retrieve widget values
role = dbutils.widgets.get("role").strip()
keywords_input = dbutils.widgets.get("keywords").strip()
experience_label = dbutils.widgets.get("experience")
work_type_label = dbutils.widgets.get("work_type")
posted_hours = dbutils.widgets.get("posted_hours").strip()

# Convert posted hours to seconds
posted_seconds = int(posted_hours) * 3600

# Keyword list
keywords = [kw.strip().lower() for kw in keywords_input.split(",") if kw.strip()]

# Map experience level
experience_map = {
    "Internship": "1",
    "Entry Level": "2",
    "Associate": "3",
    "Mid-Senior Level": "4",
    "Director": "5",
    "Executive": "6"
}
experience_level = experience_map.get(experience_label, "3")  # default to 3 (Associate)

# Map work type
work_type_map = {
    "Remote": "2",
    "On-site": "1",
    "Hybrid": "3"
}
work_type = work_type_map.get(work_type_label, "1")  # default to 1 (Remote)

# Get location input and map to LinkedIn geoId
location_label = dbutils.widgets.get("location").strip().lower()
geo_id_map = {
    "pakistan": "101022442",
    "karachi": "105451800",
    "uae": "104305776",
    "us": "103644278",
    "lahore": "104112529",
    "austrailia": "101452733"
}
geo_id = geo_id_map.get(location_label, "101022442")  # default to Pakistan if unmatched


## SCRAPER FUNCTIONS

In [0]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote
import time

def extract_all_links(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')
        return list({urljoin("https://www.linkedin.com", link.get('href')) for link in links if link.get('href') and "/jobs/view/" in link.get('href')})
    except Exception as e:
        print(f"❌ Error fetching links: {e}")
        return []

def scrape_description(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.text, "html.parser")
        for class_name in ["description__text", "show-more-less-html__markup", "jobs-description-content__text", "description", "mt4"]:
            tag = soup.find("div", class_=class_name)
            if tag and tag.get_text(strip=True):
                return tag.get_text(separator=" ", strip=True)
        return ""
    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        return ""

def is_relevant(description: str, keywords: list, role: str) -> bool:
    description_lower = description.lower()
    return (role.lower() in description_lower) and any(kw in description_lower for kw in keywords)
    
def get_matched_keywords(description: str, keywords: list) -> list:
    description_lower = description.lower()
    return [kw for kw in keywords if kw in description_lower]

def is_fully_remote(description: str) -> bool:
    desc = description.lower()

    location_limited_phrases = [
        "remote in", "must be located in", "only for candidates in", 
        "must be based in", "remote within", "usa only", "location: US", "based candidates,", "based candidates"
    ]

    if any(phrase in desc for phrase in location_limited_phrases):
        return False
    else:
        return True


## MAIN LOGIC

In [0]:
import json
encoded_role = quote(role)
search_url = (
    f"https://www.linkedin.com/jobs/search/"
    f"?keywords={encoded_role}"
    f"&f_E={experience_level}"
    f"&f_TPR=r{posted_seconds}"
    f"&f_WT={work_type}"
    f"&geoId={geo_id}"
)

print(f"\n🔍 Searching LinkedIn for: {role}")
print(f"🔗 URL: {search_url}")

job_links = extract_all_links(search_url)
print(f"\n🔗 Found {len(job_links)} job links.\n")

output_jobs = []

for idx, link in enumerate(job_links, 1):
    print(f"[{idx}/{len(job_links)}] Checking: {link}")
    description = scrape_description(link)

    if not description:
        print("⚠️ Description not found.")
        continue

    # 👇 Only apply remote filter if user selected remote
    if work_type == "2":
        remote_check = is_fully_remote(description)
        if remote_check is False:
            print("🥺 Skipped: remote job limited to specific region.")
            continue

    # ✅ Must contain job role and at least one keyword
    if is_relevant(description, keywords, role):
        matched_keywords = get_matched_keywords(description, keywords)
        print(f"✅ Relevant job for '{role}'")
        output_jobs.append({
            "Job Role": role,
            "Job link": link,
            "Keywords Matched": matched_keywords
        })
    else:
        print("❌ Not relevant: missing role or keyword match.")

    time.sleep(2)

output_jobs.sort(key=lambda x: len(x["Keywords Matched"]), reverse=True)


🔍 Searching LinkedIn for: data engineer
🔗 URL: https://www.linkedin.com/jobs/search/?keywords=data%20engineer&f_E=4&f_TPR=r2520000&f_WT=2&geoId=103644278

🔗 Found 60 job links.

[1/60] Checking: https://www.linkedin.com/jobs/view/data-engineer-%E2%80%93-databricks-at-zenius-corporation-4253619939?position=21&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=fctfZJ7TNJlvXKNBKx0Ufw%3D%3D
✅ Relevant job for 'data engineer'
[2/60] Checking: https://www.linkedin.com/jobs/view/data-engineer-senior-data-engineer-lead-data-engineer-at-clearnote-health-4257708833?position=24&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=%2B8K8zbEf3kZ7EY2iwY%2FLCQ%3D%3D
✅ Relevant job for 'data engineer'
[3/60] Checking: https://www.linkedin.com/jobs/view/cloud-data-platform-engineer-at-bedrock-robotics-4267508330?position=32&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=611kHX4mBCQvKzCiy1g6cQ%3D%3D
✅ Relevant job for 'data engineer'
[4/60] Checking: https://www.linkedin.com/jobs/view/

## OUTPUT FROM NOTEBOOK TO WEBHOOK

In [0]:
dbutils.notebook.exit(str(output_jobs))

[{'Job Role': 'data engineer', 'Job link': 'https://www.linkedin.com/jobs/view/data-engineer-%E2%80%93-databricks-at-zenius-corporation-4253619939?position=21&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=fctfZJ7TNJlvXKNBKx0Ufw%3D%3D', 'Keywords Matched': ['data engineer', 'azure', 'pyspark', 'data pipelines']}, {'Job Role': 'data engineer', 'Job link': 'https://www.linkedin.com/jobs/view/data-engineer-senior-data-engineer-lead-data-engineer-at-clearnote-health-4257708833?position=24&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=%2B8K8zbEf3kZ7EY2iwY%2FLCQ%3D%3D', 'Keywords Matched': ['data engineer', 'sql']}, {'Job Role': 'data engineer', 'Job link': 'https://www.linkedin.com/jobs/view/cloud-data-platform-engineer-at-bedrock-robotics-4267508330?position=32&pageNum=0&refId=cEoDACRgreRNFvAd4G89mQ%3D%3D&trackingId=611kHX4mBCQvKzCiy1g6cQ%3D%3D', 'Keywords Matched': ['data engineer', 'data pipelines']}, {'Job Role': 'data engineer', 'Job link': 'https://www.linkedin.com/

### CREATING DATAFRAME AND CONNECTING NOTEBOOK TO ADLS

In [0]:
df = spark.createDataFrame(output_jobs)

spark.conf.set("fs.azure.account.auth.type", "OAuth")
spark.conf.set("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set("fs.azure.account.oauth2.client.id", "*****************")
spark.conf.set("fs.azure.account.oauth2.client.secret", "********************")
spark.conf.set("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/********************/oauth2/token")

### UPLOADING FILE AS PARQUET TO ADLS

In [0]:
from pyspark.sql.functions import concat_ws

df_flat = df.withColumn("Keywords Matched", concat_ws(", ", df["Keywords Matched"]))

df_flat.coalesce(1).write.mode("overwrite").parquet("abfss://jobsearch-container@adlsjobsearch.dfs.core.windows.net/raw-data/")

