#### **Indeed Python job scraper**

In [16]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd     
from datetime import datetime, timedelta

JOB_SEARCH = 'Data scientist'
JOB_LOCATION = 'Buenos Aires'
HEADER = {"User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
N_PAGES = 10

In [12]:
def get_indeed_jobs(search:str,location:str,n_pages:int)->list:
    
    """
        Get principal elements of each job result including the job company, title, description link, description, job_type and date.
        
        Returns a list of dictionaries containing the above elements for every job result.
    """
    
    BASE_URL = 'https://ar.indeed.com'
    posts=[]
    
    skill = JOB_SEARCH.strip()
    place = JOB_LOCATION.strip()
    no_of_pages = N_PAGES

    for page in range(no_of_pages):

            # Connecting to Arg Indeed
            url = BASE_URL + '/jobs?q=' + skill + \
                '&l=' + place + '&sort=date' +'&start='+ str(page * 10)

            # Get request to indeed with headers above 
            response = requests.get(url, headers=HEADER)
            html = response.text

            # Scrapping the Web 
            soup = BeautifulSoup(html, 'lxml')

            # Outer Most Entry Point of HTML:
            outer_most_point=soup.find('div',attrs={'id': 'mosaic-provider-jobcards'})

            # "UL" lists where the data is stored:
            for i in outer_most_point.find('ul'):

            # get Job Title:

                job_content=i.find('td',{'class':'resultContent'})
                job_title = None
            
                if job_content != None:
                    job_title=job_content.find('a').text
                    job_location = job_content.find('div',{'class':'companyLocation'}).text
                if job_title==None:
                    job_title='None'
                if job_location==None:
                    job_location='None'

            # get Company Name:

                if i.find('span',{'class':'companyName'}) != None:
                    company=i.find('span',{'class':'companyName'}).text
                else:
                    company='None'

            # get Links: these Href links will take us to full job description

                if i.find('a') != None:
                    desc_link= i.find('a',{'class':'jcs-JobTitle'})['href']
                    if desc_link!=None:
                        desc_link= BASE_URL + desc_link
                        # Get job description
                        response = requests.get(desc_link, headers=HEADER)
                        html_ = response.text
                        soup_ = BeautifulSoup(html_, 'lxml')
                        job_descr = soup_.find('div',{'class':'jobsearch-jobDescriptionText'}).text.strip()
                        if job_descr==None:
                            job_descr='None'
                else:
                    desc_link='None'

            # get job_type if available:

                if i.find('div',{'class':'attribute_snippet'}) != None:
                    job_type=i.find('div',{'class':'attribute_snippet'}).text

      
                else:
                    job_type='None'

            # get Job Post Date:

                if i.find('span', attrs={'class': 'date'}) != None:
                    post_date = i.find('span', attrs={'class': 'date'}).text
                    if post_date!=None:
                        finded_digits = [int(s) for s in post_date.split() if s.isdigit()]
                        if len(finded_digits)==1:
                            post_date =  datetime.datetime.now() - timedelta(days=int(finded_digits)) 
                        else:
                            post_date=0
                else:
                    post_date='None'

            # Put everything together in a list of dicts

                posts.append({'Company':company,'Location':job_location,'Title':job_title,'Description_link':desc_link,'Description':job_descr,'Job_type':job_type, 'Posted_date':post_date})

            return posts

In [13]:
posts = get_indeed_jobs(JOB_SEARCH,JOB_LOCATION,N_PAGES)

In [14]:
print('Example of a job link description')
print('\n')

df = pd.DataFrame(posts)

print(df['Description_link'][0]) 

df

Example of a job link description


https://ar.indeed.com/rc/clk?jk=4ca99662931f7c48&fccid=dd616958bd9ddc12&vjs=3


Unnamed: 0,Company,Location,Title,Description_link,Description,Job_type,Posted_date
0,,"Vicente López, Buenos Aires",Sr Data Scientist,https://ar.indeed.com/rc/clk?jk=4ca99662931f7c...,We are looking for Data Scientists to build ma...,,0
1,,"Vicente López, Buenos Aires",Data Engineer,https://ar.indeed.com/rc/clk?jk=1ffc6481c27c46...,"We are looking for an accomplished, motivated ...",,0
2,Visa,"Buenos Aires, Buenos Aires",Data Scientist - Manager,https://ar.indeed.com/rc/clk?jk=1817d0354ad28b...,Company Description\n As the world's leader i...,Tiempo completo,0
3,Accenture,"Buenos Aires, Buenos Aires",P&A - Data Scientist - Associate Manager,https://ar.indeed.com/rc/clk?jk=5a3eef7bb8edbc...,OBJECTIVE\n\n Be part of a multi-disciplinary ...,Tiempo completo,0
4,Accenture,"Buenos Aires, Buenos Aires",BDS - ML11 Analyst,https://ar.indeed.com/rc/clk?jk=75b7bd5bed004e...,Quien se sume al equipo como Analista de Data ...,Tiempo completo,0
5,,"Buenos Aires, Buenos Aires",,,Quien se sume al equipo como Analista de Data ...,,
6,Accenture,"Buenos Aires, Buenos Aires",Business Associate Manager for Data Analytics ...,https://ar.indeed.com/rc/clk?jk=3d92331d1607ad...,OBJECTIVE\n\n Be part of a multi-disciplinary ...,Tiempo completo,0
7,Jampp,"Buenos Aires, Buenos Aires",Senior Data Scientist,https://ar.indeed.com/rc/clk?jk=023f6554659d08...,Jampp is looking for a Senior Data Scientists ...,,[2]
8,Jampp,"Buenos Aires, Buenos Aires",Senior Product Manager,https://ar.indeed.com/rc/clk?jk=57769dda9ebfc2...,ABOUT JAMPP\n Jampp is a programmatic advertis...,,[2]
9,Selin,"Buenos Aires, Buenos Aires",Data Scientist Sr . Compañía editorial,https://ar.indeed.com/rc/clk?jk=633d633e4d4b10...,"Para una importante compañía editorial, nos en...",,[5]
