# DATA SCIENCE JOB MARKET ANALYSIS

## 1. Web Scraping using Selenium

In [4]:
pip install selenium

Collecting seleniumNote: you may need to restart the kernel to use updated packages.

  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3~=2.4.0 (from urllib3[socks]~=2.4.0->selenium)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.4.26 (from selenium)
  Downloading certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.30.0->selenium)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.31.64 requires urllib3<2.1,>=1.25.4; python_version >= "3.10", but you have urllib3 2.4.0 which is incompatible.


In [5]:
import time
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv


## Step-by-Step: How to Know What to Use in Selenium
### 1. Open the Website in Your Browser (Google Chrome)
For example:

https://www.naukri.com/data-scientist-jobs-1

### 2. Right-Click on the Element You Want to Scrape
Let’s say you want to grab the job title or company name.

Right-click the job title → Click “Inspect”

Your browser's Developer Tools (DevTools) will open, and the corresponding HTML element will be highlighted.

### 3. Identify the HTML Tag and Class/ID
For example:

```bash
<div class="cust-job-tuple">
  <a class="title" href="...">Data Scientist</a>
  <a class="comp-name">ABC Corp</a>
  <span class="exp">
    <span class="expwdth">2-5 Yrs</span>
  </span>
```

From this structure, you figure out the correct CSS Selectors:

- Job card wrapper -->	div.cust-job-tuple

- Role/title	-->  a.title

- Company	-->  a.comp-name

- Experience	-->  span.exp span.expwdth

- Location	-->  span.loc span.locWdth

In [11]:
driver = webdriver.Chrome()
driver.maximize_window()

jobs = {
    "roles": [],
    "companies": [],
    "locations": [],
    "experience": [],
    "skills": []
}

for i in range(5):
    driver.get(f"https://www.naukri.com/data-scientist-jobs-{i}")
    
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.cust-job-tuple"))
    )
    
    job_cards = driver.find_elements(By.CSS_SELECTOR, "div.cust-job-tuple")

    for job in job_cards:
        try:
            role = job.find_element(By.CSS_SELECTOR, "a.title").text.strip()
        except:
            role = ""
        try:
            company = job.find_element(By.CSS_SELECTOR, "a.comp-name").text.strip()
        except:
            company = ""
        try:
            exp = job.find_element(By.CSS_SELECTOR, "span.exp span.expwdth").text.strip()
        except:
            exp = ""
        try:
            location = job.find_element(By.CSS_SELECTOR, "span.loc span.locWdth").text.strip()
        except:
            location = ""
        try:
            # finds all <li> elements inside a <ul> with the class tags-gt, within a specific job HTML element.
            # The result is a list of WebElement objects (each representing a skill tag).
            skills_list = job.find_elements(By.CSS_SELECTOR, "ul.tags-gt li")

            # Loops over to extract each elements, then strips whitespaces,
            # then joins all skills into a single string, separated by commas.
            skills = ', '.join([s.text.strip() for s in skills_list])
        except:
            skills = ""

        jobs["roles"].append(role)
        jobs["companies"].append(company)
        jobs["locations"].append(location)
        jobs["experience"].append(exp)
        jobs["skills"].append(skills)

In [None]:
import pandas as pd
DS_jobs_df=pd.DataFrame(jobs)
DS_jobs_df.to_csv("DataScience_jobs.csv")

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_dice_jobs(keyword, pages=5): 
    jobs = []

    for page in range(1, pages + 1):
        url = f"https://www.naukri.com/{keyword}-{page}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to retrieve page {page}: Status code {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        job_elements = soup.find_all('div', class_='job-card')

        for job_elem in job_elements:
            try:
                title = job_elem.find('a', class_='job-title').text.strip()
            except:
                title = ''
            try:
                company = job_elem.find('div', class_='company').text.strip()
            except:
                company = ''
            try:
                location = job_elem.find('div', class_='location').text.strip()
            except:
                location = ''
            try:
                link = job_elem.find('a', class_='job-title')['href']
            except:
                link = ''

            jobs.append({
                'title': title,
                'company': company,
                'location': location,
                'link': link
            })

    return pd.DataFrame(jobs)

# --- Run scraper ---
if __name__ == "__main__":
    keyword = "data-scientist-jobs"
    pages_to_scrape = 5

    df = scrape_dice_jobs(keyword, pages=pages_to_scrape)
    print(df.head())

    # Save to CSV (optional)
    df.to_csv("naukari_data_science_jobs.csv", index=False)
    print(f"\nScraped {len(df)} jobs and saved to 'naukari_data_science_jobs.csv'")


Empty DataFrame
Columns: []
Index: []

Scraped 0 jobs and saved to 'naukari_data_science_jobs.csv'


### The scraped data will be saved in the current project directory as "DataScience_jobs.csv".