In [1]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Setup WebDriver
options = Options()
options.binary_location = "C:/Program Files/Google/Chrome/Application/chrome.exe"  # Update if needed
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

job_roles = ["Data Scientist Intern","Data Scientist Associate","Junior Data Scientist",'Data Scientist',
             "Machine Learning Engineer",'Artificial Intelligence Engineer','Senior Data Scientist',"Lead Data Scientist",
             "Principal Data Scientist","Data Science Manager","Data Science Analyst","Data Science Staff",
             "Data Analyst","Lead Data Analyst","Senior Data Analyst","Data Engineer",'Lead Data Engineer',
            "Senior Data Engineer"]
df = pd.DataFrame({'role': [], 'link': [], 'name': [], 'com_name': [], 'experience': [], 'salary': [], 'location': []})

max_pages = 30  # Limit pages to avoid excessive scraping

for job in job_roles:
    driver.get('https://www.naukri.com')  # Reloads the homepage for each role

    try:
        # Wait for the search bar to appear
        input_search = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Enter skills / designations / companies"]'))
        )
        input_search.clear()  # Clears the search box
        input_search.send_keys(job)
        
        # Wait for search button & click
        search_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.qsbSubmit'))
        )
        search_button.click()
    except Exception as e:
        print(f"Error during job search for {job}: {e}")
        continue  

    page_count = 0
    
    while page_count <= max_pages:
        time.sleep(3)  
        soup = BeautifulSoup(driver.page_source, 'lxml')

        posting = soup.find_all('div', class_='srp-jobtuple-wrapper')

        for post in posting:
            try:
                link = post.find('a', class_="title").get('href') if post.find('a', class_="title") else "Not Provided"
                name = post.find('a', class_="title").text.strip() if post.find('a', class_="title") else "Not Provided"
                com_name = post.find('a', class_="comp-name").text.strip() if post.find('a', class_="comp-name") else "Not Provided"
                experience = post.find('span', class_="expwdth").text.strip() if post.find('span', class_="expwdth") else "Not Provided"
                salary = post.find('span', class_="ni-job-tuple-icon ni-job-tuple-icon-srp-rupee sal").text.strip() if post.find('span', class_="ni-job-tuple-icon ni-job-tuple-icon-srp-rupee sal") else "Not Disclosed"
                location = post.find('span', class_="locWdth").text.strip() if post.find('span', class_="locWdth") else "Not Provided"

                new_row = pd.DataFrame({'role': [job], 'link': [link], 'name': [name], 'com_name': [com_name], 
                                        'experience': [experience], 'salary': [salary], 'location': [location]})
                df = pd.concat([df, new_row], ignore_index=True)

            except Exception as e:
                print(f"Skipping job due to error: {e}")
                continue  

        # Try clicking the "Next" button, stop if not found
        try:
            next_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#lastCompMark > a:nth-child(4)'))
            )
            next_button.click()
            print(f"Moving to page {page_count + 1} for {job}...")
            time.sleep(3)  
        except:
            print(f"No more pages for {job}. Moving to next role.")
            break  

        page_count += 1  # Increase page count

print("Scraping completed!")

# Step 2: Identify MNCs dynamically
company_counts = df["com_name"].value_counts()
threshold = 5  # Minimum job listings to be considered an MNC
major_companies = {com for com, count in company_counts.items() if count >= threshold}
print("Major Companies Identified:", major_companies)

# Step 3: Keep all jobs where salary is disclosed OR the company is in the major companies list
df_filtered = df[(df['salary'] != "Not disclosed") | (df['com_name'].isin(major_companies))]

print(f"Filtered dataset size: {df_filtered.shape[0]} jobs retained.")

# Save to CSV
df_filtered.to_csv("naukri_data_filtered.csv", index=False)

print("Final dataset saved!")

Moving to page 1 for Data Scientist Intern...
Moving to page 2 for Data Scientist Intern...
Moving to page 3 for Data Scientist Intern...
No more pages for Data Scientist Intern. Moving to next role.
Moving to page 1 for Data Scientist Associate...
Moving to page 2 for Data Scientist Associate...
Moving to page 3 for Data Scientist Associate...
Moving to page 4 for Data Scientist Associate...
Moving to page 5 for Data Scientist Associate...
Moving to page 6 for Data Scientist Associate...
Moving to page 7 for Data Scientist Associate...
Moving to page 8 for Data Scientist Associate...
Moving to page 9 for Data Scientist Associate...
Moving to page 10 for Data Scientist Associate...
Moving to page 11 for Data Scientist Associate...
Moving to page 12 for Data Scientist Associate...
Moving to page 13 for Data Scientist Associate...
Moving to page 14 for Data Scientist Associate...
Moving to page 15 for Data Scientist Associate...
Moving to page 16 for Data Scientist Associate...
Moving to

Moving to page 10 for Senior Data Scientist...
Moving to page 11 for Senior Data Scientist...
Moving to page 12 for Senior Data Scientist...
Moving to page 13 for Senior Data Scientist...
Moving to page 14 for Senior Data Scientist...
Moving to page 15 for Senior Data Scientist...
Moving to page 16 for Senior Data Scientist...
Moving to page 17 for Senior Data Scientist...
Moving to page 18 for Senior Data Scientist...
Moving to page 19 for Senior Data Scientist...
Moving to page 20 for Senior Data Scientist...
Moving to page 21 for Senior Data Scientist...
Moving to page 22 for Senior Data Scientist...
Moving to page 23 for Senior Data Scientist...
Moving to page 24 for Senior Data Scientist...
Moving to page 25 for Senior Data Scientist...
Moving to page 26 for Senior Data Scientist...
Moving to page 27 for Senior Data Scientist...
Moving to page 28 for Senior Data Scientist...
Moving to page 29 for Senior Data Scientist...
Moving to page 30 for Senior Data Scientist...
Moving to pag

Moving to page 5 for Lead Data Analyst...
Moving to page 6 for Lead Data Analyst...
Moving to page 7 for Lead Data Analyst...
Moving to page 8 for Lead Data Analyst...
Moving to page 9 for Lead Data Analyst...
Moving to page 10 for Lead Data Analyst...
Moving to page 11 for Lead Data Analyst...
Moving to page 12 for Lead Data Analyst...
Moving to page 13 for Lead Data Analyst...
Moving to page 14 for Lead Data Analyst...
Moving to page 15 for Lead Data Analyst...
Moving to page 16 for Lead Data Analyst...
Moving to page 17 for Lead Data Analyst...
Moving to page 18 for Lead Data Analyst...
Moving to page 19 for Lead Data Analyst...
Moving to page 20 for Lead Data Analyst...
Moving to page 21 for Lead Data Analyst...
Moving to page 22 for Lead Data Analyst...
Moving to page 23 for Lead Data Analyst...
Moving to page 24 for Lead Data Analyst...
Moving to page 25 for Lead Data Analyst...
Moving to page 26 for Lead Data Analyst...
Moving to page 27 for Lead Data Analyst...
Moving to page 2

In [2]:
df_filtered

Unnamed: 0,role,link,name,com_name,experience,salary,location
0,Data Scientist Intern,https://www.naukri.com/job-listings-deep-learn...,Deep Learning / Machine Learning / Data Scient...,Pivotchain,Not Provided,Unpaid,Pune
1,Data Scientist Intern,https://www.naukri.com/job-listings-data-scien...,Data Scientist,Codemaya,Not Provided,"20,000/month",Lucknow
2,Data Scientist Intern,https://www.naukri.com/job-listings-data-scien...,Data Scientist,Clustor Computing,Not Provided,Unpaid,Nagpur
3,Data Scientist Intern,https://www.naukri.com/job-listings-junior-dat...,Junior Data Scientist-Intern,Point Perfect Transcription Services,Not Provided,Unpaid,Coimbatore
4,Data Scientist Intern,https://www.naukri.com/job-listings-data-scien...,Data Scientist Intern Rigbot,rigbot.com,Not Provided,Unpaid,"Kolkata, Mumbai, New Delhi, Hyderabad, Pune, C..."
...,...,...,...,...,...,...,...
9975,Senior Data Engineer,https://www.naukri.com/job-listings-senior-dat...,Senior Data Engineer,Ixie Gaming,5-7 Yrs,Not disclosed,Bengaluru
9976,Senior Data Engineer,https://www.naukri.com/job-listings-senior-dat...,Senior Data Engineer (Java expertise required),Indorama,2-6 Yrs,Not disclosed,Bengaluru
9977,Senior Data Engineer,https://www.naukri.com/job-listings-senior-dat...,Senior Data Engineer (Java expertise required),Cermati.com,2-5 Yrs,Not disclosed,Bengaluru
9980,Senior Data Engineer,https://www.naukri.com/job-listings-senior-dat...,Senior Data Engineer,Indium Software,5-7 Yrs,Not disclosed,Bengaluru
