#### **Indeed Python job scraper**

In [120]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd     
from datetime import datetime, timedelta

# --- Scraper configuration: everything a reader might want to tune ---

# Search terms sent to the job search.
JOB_SEARCH = "Data scientist"
JOB_LOCATION = "Buenos Aires"

# HTTP headers sent with every request (browser-like user agent).
HEADER = {
    "User-agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/77.0.3865.120 Safari/537.36"
    )
}

# Number of result pages to walk through.
N_PAGES = 10
# Only keep posts published within this many days.
LAST_DAYS_POSTED = 1

In [122]:
def _text_or_none(tag) -> str:
    """Return the text of a parsed HTML tag, or the placeholder string 'None' when the tag is missing."""
    return 'None' if tag is None else tag.text


def get_indeed_jobs(search:str,location:str,n_pages:int,post_days:int)->list:
    """
    Scrape job posts from the Argentinian Indeed site (https://ar.indeed.com).

    For every job card found on the result pages, the company, location, title,
    description link, description text, job type and posting date are collected.
    The description text is downloaded from the description link of each card,
    so one extra HTTP request is issued per job card.

    Parameters
    ----------
    search : str
        Job search terms (surrounding whitespace is stripped).
    location : str
        Job location (surrounding whitespace is stripped).
    n_pages : int
        Maximum number of result pages to request. The search stops earlier
        when a page holds fewer than 15 job cards or no job-card container.
    post_days : int
        Value sent as the ``fromage`` query parameter of the search.

    Returns
    -------
    list
        One dictionary per job card with the keys 'Company', 'Location',
        'Title', 'Description_link', 'Description', 'Job_type' and
        'Posted_date'. Fields that could not be found hold the string 'None';
        'Posted_date' is otherwise a ``datetime.date``.

    Notes
    -----
    Requests are sent with the module-level ``HEADER`` dictionary.
    """

    BASE_URL = 'https://ar.indeed.com'
    MAX_CARDS_PER_PAGE = 15   # a full result page holds this many job cards
    REQUEST_TIMEOUT = 30      # seconds; without a timeout a stalled server blocks forever
    posts = []

    # Use the arguments received by the function, not the notebook globals.
    skill = search.strip()
    place = location.strip()

    # Iterate over all result pages
    for page in range(n_pages):

        # Let requests build the query string so spaces and special characters are URL-encoded.
        query = {
            'q': skill,
            'l': place,
            'sort': 'date',
            'fromage': str(post_days),
            'start': str(page * 10),
        }
        response = requests.get(BASE_URL + '/jobs', params=query,
                                headers=HEADER, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'lxml')

        # The container is absent when the page has no results (or the request was blocked).
        container = soup.find('div', attrs={'id': 'mosaic-zone-jobcards'})
        if container is None:
            print('Job cards container not found')
            break

        jobcards = container.find_all('div', class_='job_seen_beacon')

        for jobcard in jobcards:

            jobcard_header = jobcard.find('td')
            if jobcard_header is None:
                print('Job card  header not founded')
                continue

            # The first link of the header carries both the title and the description URL.
            title_link = jobcard_header.find('a')
            job_title = _text_or_none(title_link)
            job_location = _text_or_none(jobcard_header.find('div', class_='companyLocation'))
            company = _text_or_none(jobcard_header.find('span', {'class': 'companyName'}))

            # The job description is extracted from the description link for future NLP processing.
            href = title_link.get('href') if title_link is not None else None
            if not href:
                desc_link = 'None'
                job_descr = 'None'
            else:
                desc_link = BASE_URL + href
                detail_response = requests.get(desc_link, headers=HEADER,
                                               timeout=REQUEST_TIMEOUT)
                detail_soup = BeautifulSoup(detail_response.text, 'lxml')
                descr_tag = detail_soup.find('div', {'class': 'jobsearch-jobDescriptionText'})
                job_descr = 'None' if descr_tag is None else descr_tag.text.strip()

            # Job type, if available
            job_type = _text_or_none(jobcard.find('div', {'class': 'attribute_snippet'}))

            # Posting date: the first standalone number in the label is read as "days ago";
            # a label without any number is treated as posted today.
            date_tag = jobcard.find('span', attrs={'class': 'date'})
            if date_tag is None:
                post_date = 'None'
            else:
                found_digits = [int(s) for s in date_tag.text.split() if s.isdigit()]
                days_ago = found_digits[0] if found_digits else 0
                post_date = datetime.now().date() - timedelta(days=days_ago)

            # Put job card elements together in a dict and append it to the results
            posts.append({
                'Company': company,
                'Location': job_location,
                'Title': job_title,
                'Description_link': desc_link,
                'Description': job_descr,
                'Job_type': job_type,
                'Posted_date': post_date,
            })

        # A page with fewer cards than the maximum is the last page: end the search.
        if len(jobcards) < MAX_CARDS_PER_PAGE:
            break

    return posts

In [123]:
# Run the scraper with the notebook configuration.
posts = get_indeed_jobs(
    search=JOB_SEARCH,
    location=JOB_LOCATION,
    n_pages=N_PAGES,
    post_days=LAST_DAYS_POSTED,
)

In [124]:
# Build a table with one row per scraped job post.
df = pd.DataFrame(posts)

print('Example of a job link description')
print('\n')

# An empty scrape produces a frame without columns, so guard the lookup
# instead of failing with a KeyError on 'Description_link'.
if df.empty:
    print('No job posts were found')
else:
    # Positional access to the first row.
    print(df['Description_link'].iloc[0])

df

Example of a job link description


https://ar.indeed.com/rc/clk?jk=4ca99662931f7c48&fccid=dd616958bd9ddc12&vjs=3


Unnamed: 0,Company,Location,Title,Description_link,Description,Job_type,Posted_date
0,,"Vicente López, Buenos Aires",Sr Data Scientist,https://ar.indeed.com/rc/clk?jk=4ca99662931f7c...,We are looking for Data Scientists to build ma...,,2022-07-21
1,,"Vicente López, Buenos Aires",Data Engineer,https://ar.indeed.com/rc/clk?jk=1ffc6481c27c46...,"We are looking for an accomplished, motivated ...",,2022-07-21
2,Visa,"Buenos Aires, Buenos Aires",Data Scientist - Manager,https://ar.indeed.com/rc/clk?jk=1817d0354ad28b...,Company Description\n As the world's leader i...,Tiempo completo,2022-07-20
