# LinkedIn Scrapper

I use ScrapingDob API to scrap LinkedIn data science jobs from 7 different countries, each at which I scrap 100 jobs.

In [21]:
import requests
from pprint import pprint
import re
import pandas as pd
from tqdm import tqdm

In [43]:
API_KEY = 'YOUR API HERE'
geoids = ['100459316', # Saudi Arabia
          '104305776', # United Arab Emirates
          '106155005', # Egypt
          '102299470', # England
          '104738515', # Ireland
          '103644278', # United States
          '101174742', # Canada
          ] 
npages = list(range(1,5))

### Calculating the number of API calls

To get the jobs postings, every API call returns 25 postings per page per geoid. Number of geoids = 7, number of pages = 4, then the total number of API calls = $7 * 4 = 28$ calls.

For each job posting, we need a separate API call to get the job details. This is $28 * 25 = 700$ calls. Then total number of calls = 728.

We have a total of 1000 free API calls which I already used some of.

In [3]:
# Test
payload = {'api_key': API_KEY, 'field': 'data scientist', 'geoid': '100459316', 'page':'1'}
resp = requests.get('https://api.scrapingdog.com/linkedinjobs', params=payload)
pprint(resp.json())

[{'company_name': 'Halian',
  'company_profile': 'https://uk.linkedin.com/company/halian?trk=public_jobs_jserp-result_job-search-card-subtitle',
  'job_link': 'https://sa.linkedin.com/jobs/view/data-scientist-at-halian-3584041432?refId=i8jN%2BWhNllILFyMOLf85lw%3D%3D&trackingId=WRoLrM29YGLrOwJEsjx4Pw%3D%3D&position=1&pageNum=1&trk=public_jobs_jserp-result_search-card',
  'job_location': 'Riyadh, Riyadh, Saudi Arabia',
  'job_position': 'Data Scientist',
  'job_posting_date': '2023-03-30'},
 {'company_name': 'Halian',
  'company_profile': 'https://uk.linkedin.com/company/halian?trk=public_jobs_jserp-result_job-search-card-subtitle',
  'job_link': 'https://sa.linkedin.com/jobs/view/data-science-specialist-at-halian-3608956611?refId=i8jN%2BWhNllILFyMOLf85lw%3D%3D&trackingId=OlXFOxgknw3%2FKCOwQANseA%3D%3D&position=2&pageNum=1&trk=public_jobs_jserp-result_search-card',
  'job_location': 'Riyadh, Riyadh, Saudi Arabia',
  'job_position': 'Data Science Specialist',
  'job_posting_date': '2023-0

In [4]:
jobs = resp.json()
print('Number of jobs:', len(jobs))

Number of jobs: 25


In [5]:
# Converting the returned JSON data to pandas DataFrames
jobs_df = pd.DataFrame(jobs)
jobs_df.head()

Unnamed: 0,job_position,job_link,company_name,company_profile,job_location,job_posting_date
0,Data Scientist,https://sa.linkedin.com/jobs/view/data-scienti...,Halian,https://uk.linkedin.com/company/halian?trk=pub...,"Riyadh, Riyadh, Saudi Arabia",2023-03-30
1,Data Science Specialist,https://sa.linkedin.com/jobs/view/data-science...,Halian,https://uk.linkedin.com/company/halian?trk=pub...,"Riyadh, Riyadh, Saudi Arabia",2023-05-16
2,,https://sa.linkedin.com/jobs/view/data-scienti...,Garima Interprises,,Saudi Arabia,2023-05-11
3,Data Scientist Engineer,https://sa.linkedin.com/jobs/view/data-scienti...,Energy Jobline,https://uk.linkedin.com/company/energy-jobline...,Makkah Region,2023-06-07
4,"JUNIOR DEVELOPER - Dubai, UAE",https://sa.linkedin.com/jobs/view/junior-devel...,Cobblestone Energy,https://ae.linkedin.com/company/cobblestone-en...,"Riyadh, Riyadh, Saudi Arabia",2023-06-22


In [10]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_position      25 non-null     object
 1   job_link          25 non-null     object
 2   company_name      25 non-null     object
 3   company_profile   18 non-null     object
 4   job_location      25 non-null     object
 5   job_posting_date  25 non-null     object
dtypes: object(6)
memory usage: 1.3+ KB


In [9]:
# Extracting the job id from the job posting URL
def get_job_id(job):
    try:
        id = re.findall('-(\d{10})\?', job['job_link'])[0]
    except:
        id = ''
    return id

get_job_id(jobs[0])

'3584041432'

### Get the job postings for the given geoids and for the given number of pages

In [17]:
jobs_df = pd.DataFrame()    # Empty dataframe
for geoid in geoids:
    for page in npages:
        payload = {'api_key': API_KEY, 'field': 'data scientist', 'geoid': geoid, 'page': str(page)}
        resp = requests.get('https://api.scrapingdog.com/linkedinjobs', params=payload)
        jobs = pd.DataFrame(resp.json())
        if jobs_df.empty:
            jobs_df = jobs
        else:
            jobs_df = jobs_df.append(jobs, ignore_index=True)
print('Scrapping done.')
print('Scrapped dataframe shape is', jobs_df.shape)
jobs_df.to_csv('./data/jobs.csv', index=False)

Scrapping done.
Scrapped dataframe shape is (700, 6)


In [18]:
# Adding a new columns storing the job id for each job
jobs_df['job_id'] = jobs_df.apply(get_job_id, axis=1)
jobs_df.sample(10).head(10)

Unnamed: 0,job_position,job_link,company_name,company_profile,job_location,job_posting_date,job_id
328,Data Scientist,https://uk.linkedin.com/jobs/view/data-scienti...,Altasu Recruitment Group,https://uk.linkedin.com/company/altasu-recruit...,"Nottingham, England, United Kingdom",2023-06-18,3639112591
0,Data Scientist,https://sa.linkedin.com/jobs/view/data-scienti...,MatchaTalent,https://id.linkedin.com/company/matchatalent?t...,"Dhahran, Eastern, Saudi Arabia",2023-06-18,3639381396
525,Data Engineer 2,https://www.linkedin.com/jobs/view/data-engine...,MyFitnessPal,https://www.linkedin.com/company/myfitnesspal?...,"Austin, TX",2023-05-30,3641744063
124,,https://ae.linkedin.com/jobs/view/data-analyst...,Garima Interprises,,"Dubai, Dubai, United Arab Emirates",2023-05-04,3595394797
302,Machine Learning Engineer,https://uk.linkedin.com/jobs/view/machine-lear...,Kaedim,https://www.linkedin.com/company/kaedim?trk=pu...,"London, England, United Kingdom",2023-04-28,3591963105
67,Data engineer,https://sa.linkedin.com/jobs/view/data-enginee...,Arabic Computer Systems,https://sa.linkedin.com/company/arabic-compute...,"Al Kharj, Riyadh, Saudi Arabia",2023-04-03,3590313780
238,Senior Data Ops Engineer,https://eg.linkedin.com/jobs/view/senior-data-...,SSC Egypt,https://eg.linkedin.com/company/ssc-egypt?trk=...,"Cairo, Cairo, Egypt",2023-06-22,3643300043
103,Artificial Intelligence Specialist,https://ae.linkedin.com/jobs/view/artificial-i...,Arthur Lawrence,https://www.linkedin.com/company/arthur-lawren...,"Dubai, Dubai, United Arab Emirates",2023-05-26,3619142293
137,Algorithm Developer with Rust (Remote),https://ae.linkedin.com/jobs/view/algorithm-de...,Keyrock,https://be.linkedin.com/company/keyrock?trk=pu...,"Abu Dhabi, Abu Dhabi Emirate, United Arab Emir...",2023-04-26,3588411402
115,JUNIOR DATA ANALYST - The Emirates Group,https://ae.linkedin.com/jobs/view/junior-data-...,Talent Pal,https://www.linkedin.com/company/talent-pal?tr...,"Dubai, Dubai, United Arab Emirates",2023-06-30,3651833371


## Get job description and job details

In [44]:
# Scraping the actual job pages
job_details_df = pd.DataFrame()
# jobid_mask = ~jobs_df['job_id'].isin(job_details_df['job_id'])
# last_jobid_index = jobs_df[last_jobid_mask].index[0]
# jobids = jobs_df['job_id'].loc[jobid_mask]
jobids = jobs_df['job_id']
for job_id in tqdm(jobids):
    payload = {'api_key': API_KEY, 'job_id': job_id}
    resp = requests.get('https://api.scrapingdog.com/linkedinjobs', params=payload)
    try:
        job_details = resp.json()[0]
        job_details['job_id'] = job_id
        if job_details_df.empty:
            job_details_df = pd.DataFrame([job_details])
        else:
            job_details_df = job_details_df.append(pd.DataFrame([job_details]))
    except:
        print('error:', resp.json())

print('Scrapping job details completed.')
print('Shape of scrapped details is', job_details_df.shape)

 34%|███▎      | 45/134 [03:15<30:06, 20.30s/it]

error: {'message': 'Something went wrong, you will be not charged for this request. Please try again or if you keep getting this error please mail us at info@scrapingdog.com.', 'success': False, 'status': 504}


100%|██████████| 134/134 [12:11<00:00,  5.46s/it]

Scrapping job details completed.
Shape of scrapped details is (613, 14)





In [47]:
# Removing duplicate jobs
job_details_df.drop_duplicates(subset='job_id', inplace=True)
job_details_df.shape

(607, 14)

In [48]:
# Saving the detailed job pages
job_details_df.to_csv('./data/job_details.csv', index=False)

In [51]:
print('Job details shape is:', job_details_df.shape)
job_details_df.tail()

Job details shape is: (607, 14)


Unnamed: 0,job_position,job_location,company_name,company_linkedin_id,job_posting_time,job_description,Seniority_level,Employment_type,Job_function,Industries,recruiter_details,similar_jobs,people_also_viewed,job_id
0,Sr Software Engineer(Python),"Toronto, Ontario, Canada",Techedin,https://www.linkedin.com/company/techedinlabs?...,2 months ago,We are hiring for a Senior Software Engineer(P...,Mid-Senior level,Full-time,Engineering and Information Technology,Software Development,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Javascript Engineer (Canada...,"[{'job_position': 'Software Engineer', 'job_co...",3567017339
0,NLP Data Engineer,Canada,Inworld AI,https://www.linkedin.com/company/inworld-ai?tr...,1 month ago,Why Join InworldInworld is a developer platfor...,Not Applicable,Full-time,Information Technology,Software Development,[{'recruiter_photo': 'https://media.licdn.com/...,[{'job_position': 'Javascript Engineer (Canada...,[{'job_position': 'Software Development Engine...,3442436280
0,Software Developer,"Toronto, Ontario, Canada",WizeHire,https://www.linkedin.com/company/wizehire?trk=...,1 month ago,Would you love to dive in and learn more about...,Entry level,Full-time,Engineering and Information Technology,"Technology, Information and Internet","[{'recruiter_name': '', 'recruiter_title': ''}]","[{'job_position': 'Software Developer', 'job_c...",[{'job_position': 'Industry X Software Develop...,3599982035
0,Software Engineer - AI,"Vancouver, British Columbia, Canada",Timbre Games,https://ca.linkedin.com/company/timbre-games?t...,2 weeks ago,Welcome to Timbre!We are Timbre Games (part of...,Mid-Senior level,Full-time,Engineering and Information Technology,Software Development,"[{'recruiter_name': '', 'recruiter_title': ''}]","[{'job_position': 'Software Engineer, Platform...","[{'job_position': 'Staff Software Engineer, Fr...",3590181627
0,"Senior Software Developer, Machine Learning","Ottawa, Ontario, Canada",Kinaxis,https://ca.linkedin.com/company/kinaxis?trk=pu...,2 weeks ago,"At Kinaxis, who we are is grounded in our comm...",Mid-Senior level,Full-time,Engineering and Information Technology,Software Development,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Software Engineer ( All Lev...,[{'job_position': 'Software Engineer/Developer...,3598027379


In [55]:
# Filtering columns of the job_details_df to avoid duplicated column named after merging
right_cols = job_details_df.columns.difference(['job_position', 'job_location', 'company_name']).to_list()
right_cols

['Employment_type',
 'Industries',
 'Job_function',
 'Seniority_level',
 'company_linkedin_id',
 'job_description',
 'job_id',
 'job_posting_time',
 'people_also_viewed',
 'recruiter_details',
 'similar_jobs']

In [56]:
# Merging the jobs and the job pages data
all_jobs = jobs_df.merge(job_details_df[right_cols], on='job_id')
print('Complete jobs shape is', all_jobs.shape)
all_jobs.head()

Complete jobs shape is (699, 17)


Unnamed: 0,job_position,job_link,company_name,company_profile,job_location,job_posting_date,job_id,Employment_type,Industries,Job_function,Seniority_level,company_linkedin_id,job_description,job_posting_time,people_also_viewed,recruiter_details,similar_jobs
0,Data Scientist,https://sa.linkedin.com/jobs/view/data-scienti...,MatchaTalent,https://id.linkedin.com/company/matchatalent?t...,"Dhahran, Eastern, Saudi Arabia",2023-06-18,3639381396,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,https://id.linkedin.com/company/matchatalent?t...,This role required candidate to permanently re...,2 weeks ago,[{'job_position': 'Data Scientist Job Recruitm...,"[{'recruiter_name': '', 'recruiter_title': ''}]","[{'job_position': 'AI Engineer', 'job_company'..."
1,Data Science Specialist,https://sa.linkedin.com/jobs/view/data-science...,Halian,https://uk.linkedin.com/company/halian?trk=pub...,"Riyadh, Riyadh, Saudi Arabia",2023-05-16,3608956611,Full-time,IT Services and IT Consulting,Engineering and Information Technology,Entry level,https://uk.linkedin.com/company/halian?trk=pub...,Our ClientWe are partnered with one of the big...,1 month ago,[{'job_position': 'Cybersecurity Governance Sp...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Senior Specialist M&A and I...
2,,https://sa.linkedin.com/jobs/view/data-scienti...,Garima Interprises,,Saudi Arabia,2023-06-02,3645495537,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,,We are looking to hire a highly creative data ...,1 month ago,[{'job_position': 'Junior Blockchain Developer...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Junior Python Developer (Re...
3,,https://sa.linkedin.com/jobs/view/data-scienti...,Garima Interprises,,Saudi Arabia,2023-05-11,3626306377,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,,We are looking to hire a highly creative data ...,1 month ago,[{'job_position': 'Relocate to the UK - Nurses...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Recruiter - Human Resources...
4,Data Scientist Engineer,https://sa.linkedin.com/jobs/view/data-scienti...,Energy Jobline,https://uk.linkedin.com/company/energy-jobline...,Makkah Region,2023-06-07,3651133334,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,https://uk.linkedin.com/company/energy-jobline...,Aramco energizes the world economy. Aramco occ...,3 weeks ago,[{'job_position': 'Senior Structural Engineer'...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Sr. Engineer - Data center ...


In [58]:
# Removing duplicated jobs after merging
all_jobs.drop_duplicates(subset='job_id', inplace=True)
all_jobs.reset_index(drop=True, inplace=True)
print('Complete jobs shape after removing duplicates is', all_jobs.shape)
all_jobs.head()

Complete jobs shape after removing duplicates is (607, 17)


Unnamed: 0,job_position,job_link,company_name,company_profile,job_location,job_posting_date,job_id,Employment_type,Industries,Job_function,Seniority_level,company_linkedin_id,job_description,job_posting_time,people_also_viewed,recruiter_details,similar_jobs
0,Data Scientist,https://sa.linkedin.com/jobs/view/data-scienti...,MatchaTalent,https://id.linkedin.com/company/matchatalent?t...,"Dhahran, Eastern, Saudi Arabia",2023-06-18,3639381396,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,https://id.linkedin.com/company/matchatalent?t...,This role required candidate to permanently re...,2 weeks ago,[{'job_position': 'Data Scientist Job Recruitm...,"[{'recruiter_name': '', 'recruiter_title': ''}]","[{'job_position': 'AI Engineer', 'job_company'..."
1,Data Science Specialist,https://sa.linkedin.com/jobs/view/data-science...,Halian,https://uk.linkedin.com/company/halian?trk=pub...,"Riyadh, Riyadh, Saudi Arabia",2023-05-16,3608956611,Full-time,IT Services and IT Consulting,Engineering and Information Technology,Entry level,https://uk.linkedin.com/company/halian?trk=pub...,Our ClientWe are partnered with one of the big...,1 month ago,[{'job_position': 'Cybersecurity Governance Sp...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Senior Specialist M&A and I...
2,,https://sa.linkedin.com/jobs/view/data-scienti...,Garima Interprises,,Saudi Arabia,2023-06-02,3645495537,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,,We are looking to hire a highly creative data ...,1 month ago,[{'job_position': 'Junior Blockchain Developer...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Junior Python Developer (Re...
3,,https://sa.linkedin.com/jobs/view/data-scienti...,Garima Interprises,,Saudi Arabia,2023-05-11,3626306377,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,,We are looking to hire a highly creative data ...,1 month ago,[{'job_position': 'Relocate to the UK - Nurses...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Recruiter - Human Resources...
4,Data Scientist Engineer,https://sa.linkedin.com/jobs/view/data-scienti...,Energy Jobline,https://uk.linkedin.com/company/energy-jobline...,Makkah Region,2023-06-07,3651133334,Full-time,Staffing and Recruiting,Engineering and Information Technology,Entry level,https://uk.linkedin.com/company/energy-jobline...,Aramco energizes the world economy. Aramco occ...,3 weeks ago,[{'job_position': 'Senior Structural Engineer'...,"[{'recruiter_name': '', 'recruiter_title': ''}]",[{'job_position': 'Sr. Engineer - Data center ...


In [59]:
all_jobs.to_csv('./data/full_jobs.csv', index=False)