In [2]:
# Libraries
# Scraping
import pandas as pd
from bs4 import BeautifulSoup
import requests
# Visualizations and analysis
import numpy as np
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as plt
from matplotlib import pyplot as plt
%matplotlib inline
from nltk.stem import WordNetLemmatizer
from matplotlib.pyplot import imread
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
np.random.seed(42)

In [None]:
# Download the raw HTML of one Indeed search-results page
url = 'https://www.indeed.com/jobs?q=information+technology&l=OH'
page_response = requests.get(url=url, timeout=5)

In [None]:
# Preview only the start of the HTML; echoing the full document floods the cell output
page_response.text[:1000]

In [None]:
# BeautifulSoup

In [None]:
# Parse the downloaded HTML with the html5lib parser
soup = BeautifulSoup(page_response.text, 'html5lib')
# Show a bounded, indented preview rather than echoing the entire parsed document
print(soup.prettify()[:1000])

In [None]:
# Scraping job titles

In [None]:
# First <a> tag carrying the 'jobtitle' CSS class
soup.find('a', class_='jobtitle')

In [None]:
# Look the link up once, then show its visible text next to its 'title' attribute
first_title_link = soup.find('a', class_='jobtitle')
(first_title_link.text, first_title_link['title'])

In [None]:
# Company name

In [None]:
# Text of the first <span> carrying the 'company' CSS class
soup.find('span', class_='company').get_text()

In [None]:
# find_all()

In [None]:
# Every <span> on the page carrying the 'company' CSS class
soup.find_all('span', class_='company')

In [None]:
# find_all() returns a collection of tags, not a single tag, so asking it for
# .text fails. Catch and display the error so the demonstration does not halt
# a Restart & Run All.
try:
    soup.find_all('span', {'class':'company'}).text
except AttributeError as err:
    print('AttributeError:', err)

In [None]:
# Walk the matched tags one by one and print each company's text
for company_tag in soup.find_all('span', class_='company'):
    print(company_tag.get_text())

In [None]:
# Scraping function

In [None]:
def _indeed_tag_text(parent, *find_args, **find_kwargs):
    """Return the text of the first tag matched by parent.find(...), or 'None' if there is no match."""
    tag = parent.find(*find_args, **find_kwargs)
    return tag.text if tag is not None else 'None'


def indeed_job_scrape(keyword, search_location, no_page, job_type='None', exp_lvl='None'):
    """Scrape Indeed search results and each posting's description into a DataFrame.

    Parameters
    ----------
    keyword : str
        Search phrase; spaces are replaced by '+' for the query string.
    search_location : str
        Value of the ``l`` query parameter; spaces are replaced by '+'.
    no_page : int
        Number of result pages to request. Page offsets advance in steps of 10.
    job_type : str, optional
        Value of the ``jt`` query parameter. The default ('None', or None)
        leaves the filter out of the URL.
    exp_lvl : {1, 2, 3}, optional
        1 -> 'entry_level', 2 -> 'mid_level', 3 -> 'senior_level'.
        The default ('None', or None) leaves the filter out of the URL.

    Returns
    -------
    pandas.DataFrame
        One row per job card with columns job_title, company_name, salary,
        job_location, direct_link, full_description and company_rating.
        A field that is missing from the page is stored as the string 'None'.

    Raises
    ------
    ValueError
        If exp_lvl is given and is not 1, 2, or 3.
    """
    base_url = 'https://www.indeed.com'
    columns = ['job_title', 'company_name', 'salary', 'job_location',
               'direct_link', 'full_description', 'company_rating']

    ### Translate the experience-level number into its query argument
    exp_lvl_names = {1: 'entry_level', 2: 'mid_level', 3: 'senior_level'}
    if exp_lvl is None or exp_lvl == 'None':
        # No level requested: the default must not raise
        exp_lvl_str = None
    else:
        try:
            exp_lvl_str = exp_lvl_names[exp_lvl]
        except (KeyError, TypeError):
            raise ValueError('exp_lvl only accepts 1, 2, 3, or None')

    ### Build the part of the query string shared by every page
    query = 'q=' + keyword.replace(' ', '+') + '&l=' + search_location.replace(' ', '+')
    if job_type is not None and job_type != 'None':
        query += '&jt=' + job_type
    if exp_lvl_str is not None:
        query += '&explvl=' + exp_lvl_str

    ### Main scraping loop
    records = []
    for page_index in range(0, no_page*10, 10):
        page = base_url + '/jobs?' + query + '&start=' + str(page_index)
        print(page)
        page_response = requests.get(page, timeout=5)
        main_soup = BeautifulSoup(page_response.text, 'html5lib')
        for card in main_soup.find_all('div', {'class':'jobsearch-SerpJobCard'}):
            # The title link carries both the position title and the posting URL,
            # so look it up once per card.
            title_link = card.find('a', {'class':'jobtitle'})
            if title_link is not None:
                job_title = title_link.get('title', 'None')
                href = title_link.get('href')
            else:
                job_title = 'None'
                href = None

            if href is not None:
                # Strip any leading '/' so joining with the base does not yield '//'
                direct_link = base_url + '/' + str(href).lstrip('/')
                # Full job description comes from the posting's own page
                url_response = requests.get(direct_link, timeout=5)
                detail_soup = BeautifulSoup(url_response.text, 'html5lib')
                description = _indeed_tag_text(detail_soup, 'div', {'id':'jobDescriptionText'})
            else:
                direct_link = 'None'
                description = 'None'

            records.append({
                'job_title': job_title,
                'company_name': _indeed_tag_text(card, 'span', {'class':'company'}),
                # Salary and rating are not present on every card
                'salary': _indeed_tag_text(card, 'span', {'class':'salaryText'}),
                'job_location': _indeed_tag_text(card, attrs={'class':'location'}),
                'direct_link': direct_link,
                'full_description': description,
                'company_rating': _indeed_tag_text(card, 'span', {'class':'ratingsContent'}),
            })

    ### Save to pandas dataframe (explicit columns keep the schema even with zero rows)
    df_local = pd.DataFrame(records, columns=columns)
    return df_local

In [None]:
# Demo run: one results page of entry-level postings
df_demo = indeed_job_scrape(
    keyword='information technology',
    search_location='Ohio',
    no_page=1,
    exp_lvl=1,
)

In [None]:
# First five scraped rows
df_demo.head(n=5)

In [None]:
# Full description text of the row labelled 0
df_demo['full_description'][0]

In [3]:
### Demo analysis
# Load the dataset used by the analysis cells below
df = pd.read_csv(filepath_or_buffer='full_dataset_clean.csv')

In [None]:
# Number of postings per experience level, one bar per level
for bar_x, level_code, level_label in ((10, 1, 'Entry Level'),
                                       (15, 2, 'Mid Level'),
                                       (20, 3, 'Senior Level')):
    plt.bar(bar_x, len(df.exp_lvl[df.exp_lvl == level_code]), label=level_label)
plt.legend(loc=4)

In [None]:
### Exploring salary information

In [None]:
# Percentage of postings with salary information
df.groupby('exp_lvl')[['yearly_avg_salary']].agg(lambda x: x.count()/(x.count()+x.isna().sum()))

In [None]:
# Mean of the available yearly salary figures at each level
df.groupby('exp_lvl')[['yearly_avg_salary']].agg('mean')

In [None]:
# Median of the available yearly salary figures at each level
df.groupby('exp_lvl')[['yearly_avg_salary']].agg('median')

In [None]:
### Word cloud
# Extra words to exclude, followed by everything in wordcloud's STOPWORDS
stop_words = ['work', 'will', 'system', 'support', *STOPWORDS]

In [None]:
# Entry-level postings: join every description into one text and render its word cloud
jd_dict_1 = df.loc[df.exp_lvl == 1, 'full_description'].values
j1_cloud = WordCloud(
    stopwords=stop_words, background_color='white', width=2500, height=1800
).generate(' '.join(jd_dict_1))
plt.imshow(j1_cloud)
plt.axis('off')

In [None]:
# Mid-level postings: join every description into one text and render its word cloud
jd_dict_2 = df.loc[df.exp_lvl == 2, 'full_description'].values
j2_cloud = WordCloud(
    stopwords=stop_words, background_color='white', width=2500, height=1800
).generate(' '.join(jd_dict_2))
plt.imshow(j2_cloud)
plt.axis('off')

In [None]:
# Senior-level postings: join every description into one text and render its word cloud
jd_dict_3 = df.loc[df.exp_lvl == 3, 'full_description'].values
j3_cloud = WordCloud(
    stopwords=stop_words, background_color='white', width=2500, height=1800
).generate(' '.join(jd_dict_3))
plt.imshow(j3_cloud)
plt.axis('off')

In [4]:
# CountVectorizer subclass whose analyzer lemmatizes each token with the NLTK WordNet lemmatizer
lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer that lemmatizes every token produced by the parent analyzer."""

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of lemmatized tokens."""
        base_analyzer = super().build_analyzer()

        def lemmatized_tokens(doc):
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatized_tokens

# Fit the lemmatizing vectorizer on every job description and transform them in one pass
tf_vectorizer = LemmaCountVectorizer(
    max_df=0.95,
    min_df=5,
    stop_words='english',
    decode_error='ignore',
)
tf_vector = tf_vectorizer.fit_transform(df.full_description.tolist())

# Top 50 most frequent terms after cleaning
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on older releases that lack it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()
# Sum the columns of the sparse matrix directly; toarray() would first build a
# dense (documents x vocabulary) array just to add it up.
term_counts = np.asarray(tf_vector.sum(axis=0)).ravel()
word_freq = zip(feature_names, term_counts)
word_freq = sorted(word_freq, key=lambda x: x[1], reverse=True)

# Split the leading 50 (word, count) pairs into parallel tuples once
top_words, top_counts = zip(*word_freq[:50]) if word_freq else ((), ())

plot_data_clean = [go.Bar(x=top_words,
                   y=top_counts,
                   marker=dict(colorscale='Jet',
                   color=top_counts
                   ),
                   text='Word Counts')]
layout = go.Layout(title='Top 50 Frequent Words after Processing')
fig = go.Figure(data=plot_data_clean, layout=layout)
py.plot(fig)

'temp-plot.html'