#### Indeed Scraper Version 1 (only gets partial summary)
Adapted almost entirely from https://github.com/vittoriotriassi/jobs_scraper, with an added search option for the experience level.

In [None]:
# Values used for the `explvl` query parameter: entry_level, mid_level, senior_level
# Example search URL: https://www.indeed.com/jobs?q=-&l=nyc&explvl=mid_level

In [None]:
import requests
import pandas as pd
from time import sleep
import random
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

class JobsScraper:
    """JobsScraper is a simple job postings scraper for Indeed."""

    # Seconds to wait for Indeed to answer before giving up on a page request;
    # without a timeout `requests` may block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    def __init__(self, country: str, position: str, location: str, level: str, pages: int, max_delay: int = 0, full_urls: bool = False):
        """
        Create a JobsScraper object.
        Parameters
        ------------
        country: str
            Prefix country.
            Available countries:
            AE, AQ, AR, AT, AU, BE, BH, BR, CA, CH, CL, CO,
            CZ, DE, DK, ES, FI, FR, GB, GR, HK, HU, ID, IE,
            IL, IN, IT, KW, LU, MX, MY, NL, NO, NZ, OM, PE,
            PH, PK, PL, PT, QA, RO, RU, SA, SE, SG, TR, TW,
            US, VE, ZA.
        position: str
            Job position.
        location: str
            Job location.
        level: str
            Experience level sent as the ``explvl`` query parameter
            (e.g. "entry_level", "mid_level", "senior_level"). An empty
            string is sent as an empty ``explvl=``.
        pages: int
            Number of pages to be scraped. Each page contains 15 results.
        max_delay: int, default = 0
            Max number of seconds of delay for the scraping of a single posting.
        full_urls: bool, default = False
            If set to True, it shows the job url column not truncated in the DataFrame.
        """
        # Local import so the class does not rely on a file-level import.
        from urllib.parse import urlencode

        # "US" is served from the bare domain; every other country uses its
        # code as a sub-domain.
        if country.upper() == "US":
            self._base_url = 'https://indeed.com'
        else:
            self._base_url = 'https://{}.indeed.com'.format(country)

        # Percent-encode the search terms so characters such as '&', '+' or
        # '#' end up inside the parameter value instead of breaking the query.
        query = urlencode({'q': position, 'l': location, 'explvl': level})
        self._url = '{}/jobs?{}'.format(self._base_url, query)

        self._country = country
        self._headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
        self._pages = pages
        self._max_delay = max_delay
        self._jobs = []
        print(self._url)
        # NOTE: this changes a process-wide pandas display option.
        if full_urls:
            pd.set_option('display.max_colwidth', None)
        else:
            pd.reset_option('display.max_colwidth')


    def _extract_page(self, page):
        """Download the result page starting at offset ``page`` and return it parsed."""

        with requests.Session() as session:
            response = session.get(
                url="{}&start={}".format(self._url, page),
                headers=self._headers,
                timeout=self._REQUEST_TIMEOUT,
            )

        return BeautifulSoup(response.content, 'html.parser')


    @staticmethod
    def _find_text(job, tag, class_name):
        """Return the stripped, newline-free text of the first ``tag`` with
        CSS class ``class_name`` inside ``job``, or None when there is none."""
        node = job.find(tag, class_=class_name)
        if node is None:
            return None
        return node.text.strip().replace('\n', '')


    def _transform_page(self, soup):
        """
        Extract every job card found in ``soup`` and append it to ``self._jobs``.

        Each card becomes a dict with the keys title, location, company,
        summary, salary and url; a field that is not present in the card is
        stored as None. After each card the method prints a progress line and,
        when ``max_delay`` is positive, sleeps a random whole number of seconds
        between 0 and ``max_delay``.
        """

        for card in soup.find_all('div', class_='job_seen_beacon'):

            title = self._find_text(card, 'h2', 'jobTitle')
            company = self._find_text(card, 'span', 'companyName')
            summary = self._find_text(card, 'div', 'job-snippet')
            salary = self._find_text(card, 'span', 'salary-snippet')

            # Two markups are handled: prefer div.companyLocation and fall
            # back to span.location when the former is absent.
            location = self._find_text(card, 'div', 'companyLocation')
            if location is None:
                location = self._find_text(card, 'span', 'location')

            # The posting link is the first <a> under the card's parent.
            parent = card.parent
            anchor = parent.a if parent is not None else None
            href = anchor.get('href') if anchor is not None else None
            # Without an href there is no usable link; store None rather than
            # a URL ending in the literal text "None".
            job_url = '{}{}'.format(self._base_url, href) if href else None

            record = {
                'title': title,
                'location': location,
                'company': company,
                'summary': summary,
                'salary': salary,
                'url': job_url
            }

            self._jobs.append(record)

            print("Scraping {}...".format(title))

            if self._max_delay > 0:
                sleep(random.randint(0, self._max_delay))


    def scrape(self) -> pd.DataFrame:
        """
        Perform the scraping for the parameters provided in the class constructor.
        If duplicates are found, they get dropped.
        Returns
        ------------
        df: pd.DataFrame
            Return a scraped Dataframe.
        """

        # The ``start`` offset advances by 10 for each requested page.
        for offset in tqdm(range(0, self._pages * 10, 10), desc = "Scraping in progress...", total = self._pages):

            page = self._extract_page(offset)
            self._transform_page(page)

        df = pd.DataFrame(self._jobs)
        df.drop_duplicates(inplace=True)

        return df

Download job data for each category.

In [None]:
%%time
# Each entry is [search term, experience level]. The first entry searches for
# the word "internship" with no level filter; the others search "a" per level.
my_list = [["internship", ""], ["a", "entry_level"], ["a", "mid_level"], ["a", "senior_level"]]
my_dfs = []
for e in my_list:
    scraper = JobsScraper(
        country="US",
        position=e[0],
        location="",
        level=e[1],
        pages=200,
        max_delay=3,
        full_urls=True,
    )
    df = scraper.scrape()
    # Tag every row with its category; the keyword search carries no level of
    # its own, so it is labelled "internship".
    if e[0] != "internship":
        df["level"] = e[1]
    else:
        print("YESSSSS")
        df["level"] = "internship"
    my_dfs.append(df)

# Stack the per-category frames and save them; the row count is part of the
# file name.
df_all = pd.concat(my_dfs)
csv_name = f"df_all_len_{len(df_all)}.csv"
df_all.to_csv(csv_name)
print(csv_name)

In [None]:
# Scrape one category on its own: a single page of senior-level results.
#%%time
scraper = JobsScraper(
    country="US",
    position="a",
    location="",
    level="senior_level",
    pages=1,
    max_delay=3,
    full_urls=True,
)
df = scraper.scrape()
len(df)
# Last expression of the cell, so the notebook renders the DataFrame.
df

#### Indeed Scraper Version 2 with VPN changing (gets full summary)

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
import random
import time 
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0"
}

# Each entry is [search term, experience level].
#my_list = [ ["the", "senior_level"], ["the", "entry_level"], ["intern", ""], ["the", "mid_level"] ]
my_list = [ ["intern", ""] ]

# Embedded view of a single posting, fetched per job id for the full description.
api_url = "https://www.indeed.com/viewjob?viewtype=embedded&jk={job_id}"

for e in my_list:

    len_jobs_before = 0  # number of jobs collected after the previous page
    count = 0            # consecutive pages that added no new job
    jobs = []
    job_name = e[0]
    level= e[1]
    page_start = 0
    page_end = 25

    # The `start` offset advances by 10 per result page.
    for i in range((page_start * 10), (page_end * 10), 10):

        try:
            url = "https://www.indeed.com/jobs?q=" + job_name + "&l=US&explvl=" + level + "&start=" + str(i)
            print(url)

            # A timeout keeps a stalled connection from hanging the whole run.
            soup = BeautifulSoup(requests.get(url, headers=headers, timeout=30).content, "html.parser")

            for link in soup.select('a[id^="job_"]'):
                job_id = link["id"].split("_")[-1]
                s = BeautifulSoup(
                    requests.get(api_url.format(job_id=job_id), headers=headers, timeout=30).content,
                    "html.parser",
                )
                description_node = s.select_one("#jobDescriptionText")
                if s.title is None or description_node is None:
                    # The posting page lacks a title or description. Skip this
                    # job only, so the rest of the page is still collected and
                    # the no-progress check below still runs.
                    continue
                title = s.title.get_text(strip=True).split(" - ", 1)[0]
                description = description_node.get_text(strip=True, separator="\n")
                jobs.append({
                    'title': title,
                    'description': description,
                    'level': level
                    })

            print("Page", i // 10, "done.")
            print("Len Dict", len(jobs))
            sleep(random.uniform(0.7, 2.0))

            if len_jobs_before == len(jobs): #rotate vpn if no additional jobs have been downloaded
                count = count + 1
                print("Break Count:", count)
                print("Changing VPN.")
                initialize_VPN(save=1, area_input=['complete rotation'])
                rotate_VPN()
                print("VPN changed.")
                sleep(5)
            else:
                # Progress was made, so the "in a row" counter starts over.
                count = 0

            if count >= 10: #stop trying if no new jobs have been download 10 times in a row
                break

            len_jobs_before = len(jobs)
        except Exception as err:
            # Keep the run going, but report what went wrong. Catching
            # Exception (not everything) lets Ctrl-C stop the run.
            print("Error.", err)


    df = pd.DataFrame(jobs)
    df.drop_duplicates(inplace=True)

    # The file name records the search term, the level and the row count.
    csv_name = "df_full_sum" + "_job_" + job_name + "_lvl_" + level + "_len_" + str(len(df))+ ".csv"
    print(csv_name)
    df.to_csv(csv_name)