## Data Scraping

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# requests library docs: https://docs.python-requests.org/en/latest/
# bs4 docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [4]:
# First results page for the "data scientist" / "remote" search
base_url = 'https://www.indeed.com/jobs?q=data+scientist&l=remote'

# Pages 2-15 reuse the same query with a `start` offset that grows by 10 per page (10, 20, ..., 140)
link = [base_url] + [f'{base_url}&start={offset}' for offset in range(10, 150, 10)]

In [5]:
# Display the generated list to confirm it holds the base URL followed by the paginated variants
link

['https://www.indeed.com/jobs?q=data+scientist&l=remote',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=10',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=20',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=30',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=40',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=50',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=60',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=70',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=80',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=90',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=100',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=110',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=120',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=130',
 'https://www.indeed.com/jobs?q=data+scientist&l=remote&start=140']

In [6]:
# Download every results page and keep the Response objects for later parsing.
# A Session reuses the underlying connection across the requests, and the
# explicit timeout stops a stalled server from hanging the notebook forever
# (requests applies no timeout unless one is given).
req = []
with requests.Session() as session:
    for url in link:
        r = session.get(url, timeout=30)
        req.append(r)
        # Print each status code so failed pages are visible in the cell output
        print(r.status_code)

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [7]:
# Pull the HTML text out of each Response object, keeping the same order as the requests
htmltext = [response.text for response in req]

In [8]:
# Concatenate the HTML of every fetched page into one large string,
# so that a single parse covers all pages at once
html_full = ''.join(htmltext)

In [9]:
# Parse the combined HTML string using the 'html.parser' backend
soup = BeautifulSoup(html_full, 'html.parser')

In [10]:
# Every <td class="resultContent"> is one job card; it holds several of the data points we want to scrape
jobcard = soup.find_all('td', class_='resultContent')

# Every <span class="date"> holds how many days ago the job was posted
timecard = soup.find_all('span', class_='date')

In [11]:
len(jobcard)
# Sanity check: page 1 lists 15 jobs, so each page should contribute 15 cards.
# 15 pages at 15 listings per page = 225 cards expected in total.

225

In [13]:
# Placeholder stored whenever a field cannot be found on a job card
NOT_PROVIDED = 'None Provided'


def _text_or_default(card, tag_name, attrs):
    """Return the text of the first matching tag inside `card`.

    card: a parsed job-card element (anything exposing `.find(name, attrs=...)`).
    tag_name, attrs: passed straight through to `card.find`.
    Returns NOT_PROVIDED when no tag matches.
    """
    tag = card.find(tag_name, attrs=attrs)
    return NOT_PROVIDED if tag is None else tag.get_text()


# Loop through every job card and scrape out title, company, rating, location and salary.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
rows = []
for card in jobcard:
    # Title: first span carrying a `title` attribute (falls back to the placeholder
    # instead of raising when the span is absent, like every other field)
    job = _text_or_default(card, 'span', {'title': True})

    # Company: rendered either as a link or as a plain span, so try the link first.
    # The span fallback must be assigned too, otherwise the row would silently
    # reuse the company scraped from the previous card.
    company_tag = card.find('a', attrs={'data-tn-element': "companyName"})
    if company_tag is None:
        company_tag = card.find('span', attrs={'class': "companyName"})
    company = NOT_PROVIDED if company_tag is None else company_tag.get_text()

    # Rating: first span marked aria-hidden="true"
    rating = _text_or_default(card, 'span', {'aria-hidden': "true"})

    # Location: first span that has none of the class / title / aria-hidden attributes
    location = _text_or_default(card, 'span', {'class': False, 'title': False, 'aria-hidden': False})

    # Salary: first div with class "attribute_snippet"
    salary_range = _text_or_default(card, 'div', {'class': "attribute_snippet"})

    rows.append({'Title': job, 'Location': location, 'Company': company,
                 'Company_Rating': rating, 'Salary Range': salary_range})

# Build the frame in one step; `columns` fixes the column order and keeps the
# expected columns even when no job cards were found
df = pd.DataFrame(rows, columns=['Title', 'Location', 'Company', 'Company_Rating', 'Salary Range'])

In [14]:
# Spot-check: show only the rows whose company name contains "Spotify"
is_spotify = df['Company'].str.contains('Spotify')
df.loc[is_spotify]

Unnamed: 0,Title,Location,Company,Company_Rating,Salary Range
21,"Data Scientist, Freemium Innovation",None Provided,Spotify,4.3,None Provided


In [15]:
# Display the scraped table for a visual check
df

Unnamed: 0,Title,Location,Company,Company_Rating,Salary Range
0,Data Scientist,Remote,World Services LLC,4.5,$80 - $90 an hour
1,Senior Data Scientist (remote),Remote,Liberty Mutual Insurance,3.6,"$129,500 - $184,100 a year"
2,Data Scientist,Remote,Selby Jennings,4.2,None Provided
3,NLP Data Scientist,None Provided,Engtal,5.0,"$130,000 - $180,000 a year"
4,Junior Data Scientist - 100% Remote,Remote,Piper Companies,4.5,$50 - $60 an hour
...,...,...,...,...,...
220,Applied Data Scientist (remote),Remote,Strategic Employment Partners,None Provided,None Provided
221,Data Scientist (Remote),Remote,XSELL Technologies,3.5,Full-time
222,Sr. Data Analyst,Remote,XSELL Technologies,None Provided,$60 - $65 an hour
223,Data Scientist,Remote,XSELL Technologies,None Provided,Full-time


In [16]:
# Export the scraped data to job_scrape.csv, encoded as UTF-8.
# NOTE(review): index=False is not passed, so the row index is presumably written
# as an extra unnamed leading column — confirm that is intended.
df.to_csv('job_scrape.csv', encoding = 'utf-8')