## Introduction
#### Top Skills of Data Scientists (DS) on Glassdoor and Indeed
This program presents the top 10 skills of data scientists (DS) as listed in job descriptions on Glassdoor and Indeed. For the detailed background, deliverables, and process, please see the README in this repository.

In [1]:
### Load required libraries

In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re
from bs4 import BeautifulSoup
from langdetect import detect

# Disable warning of 3 types
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Other utils
from tqdm import tqdm  # Progress bar

# Azure text analytics service api
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient


# aws comprehend
import boto3
import json

#EDA tools.
import dtale

# Geopy for location
from geopy.geocoders import Nominatim

# nlp text cleaning
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

### Pre-settings

In [3]:
# Fix NumPy's global random seed so that random steps are repeatable.
np.random.seed(1337)

# Let pandas display up to 200 characters per cell before truncating.
pd.set_option('display.max_colwidth', 200)

# Silence the three warning categories below for the rest of the session.
for _category in (DeprecationWarning, FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)

### Data cleaning
    - Select the data-related jobs, and keep the data scientist records for analysis.
    - Remove the duplicated records.
    - Convert the job descriptions from HTML to text.
    - Store the cleaned data in the main table.

In [4]:
outputfile= './01_data/output/datajobs.csv'
datafile='./01_data/input/glassdoor/glassdoor.csv'


def _english_list_items(html_page):
    """Return the English ``<li>`` items of an HTML job description as one string.

    For every ``<li>`` element only its first child is used: it is lower-cased,
    cut at the first "\\r\\n" and terminated with a '.'. Items whose language is
    not detected as English (or cannot be detected at all) are skipped.
    Non-string input (e.g. a missing description) yields ''.
    """
    if not isinstance(html_page, str):
        return ''
    soup = BeautifulSoup(html_page, 'html.parser')
    parts = []
    for item in soup.find_all("li"):
        try:
            first_child = str(item.contents[0])
            lang = detect(first_child)
        except Exception:
            # Empty <li> (no contents) or undetectable language: skip the item.
            continue
        # Only handle items written in English since this program
        # is solely focusing on English.
        if lang == 'en':
            parts.append(first_child.lower().split("\r\n")[0] + '.')
    return ''.join(parts)


if os.path.exists(datafile):
    if os.path.exists(outputfile):
        Reload=input("The processed data exist, do you want to reload it?(y/n)")
    else:
        Reload='y'

    # Reload the raw data file and re-produce the csv of data-related jobs
    if Reload.strip().lower()=='y':
        try:
            glassdoor=pd.read_csv(datafile)
            print("Shape of source file:", glassdoor.shape)
            # Produce the list of jobs related to data, newest posting first
            # (sorted by posted date in DESCENDING order).
            # NOTE(review): ' data ' requires a space on both sides, so a title
            # that starts or ends with "data" is not selected - confirm intent.
            is_data_job=glassdoor['header.jobTitle'].str.contains(' data ',case=False,na=False)
            data_jobs=glassdoor[is_data_job].sort_values(
                by='header.posted',ascending=False).loc[:,[
                'gaTrackerData.industry',
                'header.employerName',
                'gaTrackerData.jobTitle',
                'job.jobReqId.long',
                'job.description',
                'header.posted',
                'map.country',
                'map.lat',
                'map.lng',
                'map.location']]
            # Rows identical in all selected columns are duplicates; keep the
            # first one, which is the latest posting because of the sort above.
            data_jobs_unique=data_jobs.drop_duplicates()
            # NOTE(review): this intermediate snapshot used a differently-cased
            # path ('./01_Data/Output/datajobs.csv'); it now targets the same
            # file as `outputfile`, which is overwritten with `jobs` below.
            data_jobs_unique.to_csv(outputfile)
            print("Shape of jobs related to data:", data_jobs_unique.shape)

            # Build the cleaned table in one go instead of appending row by row
            # (DataFrame.append was removed in pandas 2.0 and is quadratic).
            # The column order below is relied upon by later cells.
            jobs=pd.DataFrame({
                'posting_date':data_jobs_unique['header.posted'],
                'description':data_jobs_unique['job.description'].apply(_english_list_items),
                'title':data_jobs_unique['gaTrackerData.jobTitle'],
                'country':data_jobs_unique['map.country'],
                'employer':data_jobs_unique['header.employerName'],
                'industry':data_jobs_unique['gaTrackerData.industry'],
                'id':data_jobs_unique['job.jobReqId.long'],
                'source':'Glassdoor',
                'lat':data_jobs_unique['map.lat'],
                'lng':data_jobs_unique['map.lng'],
                'location':data_jobs_unique['map.location'],
            })
            # Keep only postings that produced some English text.
            jobs=jobs[jobs['description']!=''].reset_index(drop=True)
            # Prevent the issue of 'utf-8' encoding.
            jobs['description'] = jobs['description'].apply(
                lambda x: x.encode('ascii', 'ignore').decode('ascii'))
            jobs.to_csv(outputfile)
        except Exception as e:
            print("Failed to read the data file due to error:%s, please check the file or path!" %e)
    else:
        # The file was written with its index as first column; read it back as
        # the index so the column positions match a freshly built `jobs`.
        jobs=pd.read_csv(outputfile,index_col=0)
elif os.path.exists(outputfile):
    # Raw export is missing but a processed file exists: fall back to it.
    print("Source file %s not found, loading processed data from %s" % (datafile, outputfile))
    jobs=pd.read_csv(outputfile,index_col=0)
else:
    print("Neither %s nor %s exists, please check the file or path!" % (datafile, outputfile))

The processed data exist, do you want to reload it?(y/n)y
Shape of source file: (165290, 163)
Shape of jobs related to data: (7347, 10)


In [116]:
# Select data scientist jobs: the title must contain both "data" and "scientist"
# (any order, case-insensitive); postings without a title are excluded.
# .copy() gives df_main its own data, so the column assignments in later cells
# do not operate on a slice of `jobs` (SettingWithCopyWarning).
df_main=jobs[jobs['title'].str.contains(r'^(?=.*data)(?=.*scientist)',case=False,na=False)].copy()

### Fill / Standardize the country names

- For job postings without a country, the country is found by:
    - Copying it from another job posting that has the same location and a non-empty country.
    - Looking up the country name from the location.

- Short country names (codes) are converted to full names using Glassdoor's mapping table.

In [117]:
# Number of data-scientist postings whose country is missing
int(df_main['country'].isnull().sum())

372

In [118]:
# Build the (country, location) lookup pairs from the postings that do have a country
_known_country_rows=df_main.loc[df_main['country'].notnull(), ['country','location']]
a_city=_known_country_rows.apply(lambda row: (row.iloc[0],row.iloc[1]),axis=1).unique()

In [119]:
def map_replace(a_source=(), s_target=''):
    """Look up ``s_target`` among the second elements of ``a_source``.

    Args:
        a_source: iterable of 2-element items ``(value, key)``.
        s_target: key to search for. It is compared as text, ignoring case and
            surrounding whitespace, so non-string keys (e.g. numbers) work too.

    Returns:
        The first element of the first item whose second element matches
        ``s_target``; ``None`` if ``s_target`` is missing (NaN/None) or no item
        matches. Items whose second element is missing are skipped.
    """
    if pd.isna(s_target):
        return None
    # Normalise the target once instead of on every iteration.
    wanted = str(s_target).strip().lower()
    for item in a_source:
        if pd.isna(item[1]):
            continue
        if str(item[1]).strip().lower() == wanted:
            return item[0]
    return None

In [120]:
# Fill the missing country names from postings that share the same location.
# Columns are addressed by name rather than by position so the fill does not
# depend on the column order of df_main.
df_main['country']=df_main.apply(
    lambda x: map_replace(a_city,x['location']) if pd.isna(x['country']) else x['country'],
    axis=1)

In [121]:
# Number of postings that still have no country after the location-based fill
int(df_main['country'].isnull().sum())

146

In [122]:
# Import country mapping table for short names' conversion
# NOTE(review): with pandas' default NA handling a literal code "NA" (Namibia)
# would be read as missing - confirm against the file contents.
countryfile='./01_data/input/glassdoor/country_names_2_digit_codes.csv'
df_country=pd.read_csv(countryfile)

# Create function to get and standardize the country name
def get_country(country='',lat='0',lng='0',city=''):
    """Return a full country name for a posting, or None if it cannot be found.

    Resolution order:
      1. ``country`` has 2-3 characters: treat it as a code and look it up in
         ``df_country`` (columns 'Code' / 'Name'); an unknown code is returned
         unchanged.
      2. ``country`` is any other non-empty text: return it unchanged.
      3. ``country`` is empty/missing and both ``lat`` and ``lng`` differ from
         '0': reverse-geocode the coordinates with Nominatim.
      4. Otherwise, if ``city`` is given: geocode it with Nominatim and use the
         part of the display name after the last comma.

    Args:
        country: country name or short code; NaN/None is treated as ''.
        lat, lng: latitude / longitude; '0' means "not available".
        city: free-text location used when no coordinates are available.

    Returns:
        The country name as a string, or None when nothing could be resolved
        or the lookup failed (the error is printed).
    """
    if pd.isna(country):
        country = ''
    country = str(country)
    try:
        # country name's shortname to full name
        if 1 < len(country) <= 3:
            matches = df_country[df_country['Code'].str.lower()==country.lower()]['Name']
            # Return the name itself, not the one-element Series.
            return matches.iloc[0] if not matches.empty else country

        if country != '':
            return country

        # if country name does not exist, look for country name by geo location (latitude, longitude)
        has_coordinates = (not pd.isna(lat) and not pd.isna(lng)
                           and str(lat) != '0' and str(lng) != '0')
        if has_coordinates:
            geolocator = Nominatim(user_agent="geoapiExercises")
            location = geolocator.reverse(str(lat)+","+str(lng),language='en')
            if location is None:
                return None
            country_name = location.raw.get('address', {}).get('country', '')
            return country_name if country_name != '' else None

        # if no geo location, search for country name by city name
        if pd.isna(city) or city == '':
            return None
        geolocator = Nominatim(timeout=10,user_agent="geoapiExercises")
        location = geolocator.geocode(city,language='en')
        if location is None:
            # No match for this city.
            return None
        # The country is the last comma-separated part of the display name;
        # strip the blank that follows the comma.
        return location.raw['display_name'].rsplit(',' , 1)[-1].strip()

    except Exception as e:
        # Best-effort lookup: report the failure and leave the country unresolved.
        print("error:%s" %e)
        print(lat,lng,city)
        return None

In [123]:
# Fill the remaining missing country names by geocoding the location text.
# Columns are addressed by name rather than by position so the fill does not
# depend on the column order of df_main.
df_main['country']=df_main.apply(
    lambda x: get_country(city=x['location']) if pd.isna(x['country']) else x['country'],
    axis=1)

In [124]:
# Number of postings that still have no country after geocoding
int(df_main['country'].isna().sum())

0

In [125]:
# Add the 3 codes (UK, USA, CHE) that the mapping table loaded above lacks
df_temp=pd.DataFrame({'Name':['United Kingdom','United States','Switzerland'],
                      'Code':['UK','USA','CHE']},columns=['Name','Code']
                              )
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df_country=pd.concat([df_country,df_temp],ignore_index=True)

# Create mapping array of (full name, code) pairs for short names to full names
a_name=df_country[['Name','Code']].apply(lambda x: (x.iloc[0],x.iloc[1]),axis=1).unique()

In [126]:
# Number of records whose country is a short name (2 or 3 characters long)
int(df_main['country'].str.len().between(2, 3).sum())

383

In [127]:
# Convert short name of country to full name
def _full_country_name(value):
    """Return the full name mapped to ``value`` in a_name, or ``value`` itself if unmapped."""
    # Look the code up once per row instead of twice.
    full_name = map_replace(a_name, value)
    return value if pd.isna(full_name) else full_name

df_main['country']=df_main['country'].apply(_full_country_name)

In [128]:
# Number of records whose country is still at most 3 characters long
int((df_main['country'].str.len() <= 3).sum())

0

### Fill / Standardize the industry name

In [131]:
# Number of data-scientist postings whose industry is missing
int(df_main['industry'].isnull().sum())

143

In [132]:
# Count employers in df_main that appear both with and without an industry;
# only for those could the industry be copied from a sibling posting.
_employers_with_industry=set(df_main.loc[df_main['industry'].notnull(), 'employer'])
_employers_without_industry=set(df_main.loc[df_main['industry'].isnull(), 'employer'])
len(_employers_with_industry & _employers_without_industry)

0

In [133]:
# Verify whether the industry names missing in df_main could be found in the
# full `jobs` table, i.e. from any posting of the same employer that does have
# an industry. (The two filters were swapped before, which compared df_main
# employers WITH an industry against jobs employers WITHOUT one.)
len(set(df_main[df_main['industry'].isnull()]['employer']) &\
                set(jobs[jobs['industry'].isnull()==False]['employer']))

0

In [134]:
# Verify whether the industry names could be found from those job postings of indeed
indeedfile='./01_data/input/employer_industry.csv'
indeed=pd.read_csv(indeedfile)
# Indeed rows that carry an industry; computed once and reused below
_indeed_known=indeed[indeed['industry'].notnull()]
employer_num=len(set(df_main[df_main['industry'].isnull()]['employer']) &\
                set(_indeed_known['employer']))

#if the matched employers with industry are found, it will fill the industry of glassdoor jobs
if employer_num>0:
    print(" %d employers are found in indeed file" %employer_num)
    # Create the mapping table of (industry, employer) pairs from indeed
    a_industry=_indeed_known[['industry','employer']].apply(
        lambda x: (x.iloc[0],x.iloc[1]),axis=1).unique()
    # Fill the industry names; columns are addressed by name rather than by
    # position so the fill does not depend on the column order of df_main.
    df_main['industry']=df_main.apply(
        lambda x: map_replace(a_industry,x['employer']) if pd.isna(x['industry'])
        else x['industry'],axis=1)
else:
    print('No matched employers are found!')

No matched employers are found!


In [138]:
# Label every posting that still has no industry as "unclassified"
df_main['industry']=df_main['industry'].fillna("unclassified")

In [139]:
# Number of postings that still have no industry (expected to be 0 now)
int(df_main['industry'].isna().sum())

0

In [151]:
# Export the main table (data scientist jobs only) to CSV.
# The DataFrame index is written as the first, unnamed column.
ds_file= './01_data/output/datascientists.csv'
df_main.to_csv(ds_file)

In [154]:
# Perform EDA on the main table: start a D-Tale instance for df_main
# and open it in the browser.
d1 = dtale.show(df_main)
d1.open_browser()

## Extract skills from job description

In [142]:
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice
# Built once: set membership is O(1) per token, and the punctuation pattern
# (all of string.punctuation except '-') never changes between calls.
_default_stopword_set = frozenset(default_stopwords)
_punctuation_pattern = re.compile(
    '[{}]'.format(re.escape(string.punctuation.replace('-', ''))))


def _tokenize_text(text):
    """Split text into sentences, then into word tokens, as one flat list."""
    return [w for s in sent_tokenize(text) for w in word_tokenize(s)]


def clean_text(text, stem=False):
    """Normalise a job description for key-phrase extraction.

    Steps: strip surrounding spaces, lower-case, optionally stem, remove
    punctuation (hyphens are kept) and remove English stopwords.

    Args:
        text: raw description; anything that is not a string (e.g. NaN)
            yields ''.
        stem: if True, stem every token with ``default_stemmer`` before the
            punctuation step. Off by default.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = text.strip(' ').lower()

    if stem:
        text = ' '.join(default_stemmer.stem(t) for t in _tokenize_text(text))

    # Remove punctuation inside each token and drop tokens that become empty.
    stripped = (_punctuation_pattern.sub('', t) for t in _tokenize_text(text))
    text = ' '.join(t for t in stripped if t)

    # Tokenize the punctuation-free text again before filtering stopwords,
    # so the result matches the step-by-step pipeline exactly.
    return ' '.join(w for w in _tokenize_text(text)
                    if w not in _default_stopword_set)

In [143]:
# Store the cleaned version of every description in column "description_cln"
df_main['description_cln']=df_main['description'].apply(clean_text)

### Extract skills from AWS

In [27]:
sample_frac=1 # % of total records for sample processing.
cf_score=0.4  # confidence score threshold for key phrases
aws_chunk_size=5000  # characters sent to Comprehend per request

# Call AWS comprehend to extract key phrases

comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')

print('Calling DetectKeyPhrases')
df_list=[]
for i in range(round(sample_frac*len(df_main))):
    text=df_main["description_cln"].iloc[i]
    job_id=df_main["id"].iloc[i]
    # Walk the text in fixed-size chunks. Stepping by start offset never
    # produces an empty trailing chunk (lengths that are an exact multiple of
    # the chunk size used to) and sends nothing for an empty text.
    for start in range(0,len(text),aws_chunk_size):
        response=comprehend.detect_key_phrases(Text=text[start:start+aws_chunk_size],
                                               LanguageCode='en')
        df_phrases=pd.json_normalize(response['KeyPhrases'])
        # Tag every phrase with the posting it came from.
        df_phrases['id']=job_id
        df_list.append(df_phrases)

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the columns read by the following cells.
df=pd.concat(df_list) if df_list else pd.DataFrame(columns=['Score','Text','id'])
print('End of DetectKeyPhrases\n')

2022-05-18 18:02:29,963 - INFO     - Found credentials in shared credentials file: ~/.aws/credentials


Calling DetectKeyPhrases
End of DetectKeyPhrases



In [28]:
# Keep the key phrases that reach the confidence threshold and shape them into
# the skills table: columns id, skill and an (initially empty) type.
_is_confident=df['Score']>=cf_score
df_skills=(df.loc[_is_confident, ['id','Text']]
             .rename(columns={'Text':'skill'})
             .assign(type=''))

In [150]:
# Export the skills table built from the AWS key phrases (df_skills) to CSV
aws_skills= './01_data/output/aws_skills.csv'
df_skills.to_csv(aws_skills)

In [30]:
# EDA on aws results: start a D-Tale instance for df_skills
# and open it in the browser.
d1 = dtale.show(df_skills)
d1.open_browser()

In [None]:
# Call Azure text analytics to identify name entities
from getpass import getpass

# getpass keeps the key out of the notebook output; the prompt is unchanged.
cred=getpass("Please input azure's credential")
credential = AzureKeyCredential(cred)
endpoint="https://topskills.cognitiveservices.azure.com/"

text_analytics_client = TextAnalyticsClient(endpoint, credential)

az_chunk_size=5000  # characters sent to Azure per document

# Collect plain dicts and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
entity_rows=[]
for i in range(round(sample_frac*len(df_main))):
    text=df_main["description"].iloc[i]
    job_id=df_main['id'].iloc[i]
    # Stepping by start offset never produces an empty trailing chunk and
    # sends nothing for an empty description.
    for start in range(0,len(text),az_chunk_size):
        documents=[text[start:start+az_chunk_size]]
        response = text_analytics_client.recognize_entities(documents, language="en")
        for doc in response:
            if doc.is_error:
                # Skip documents the service could not process.
                continue
            for entity in doc.entities:
                entity_rows.append({'id':job_id,
                                    'skill':entity.text,
                                    'category':entity.category,
                                    'confidence score':entity.confidence_score})

df_list=pd.DataFrame(entity_rows,columns=['id','skill','category','confidence score'])
df_skills_az=df_list

In [149]:
# Export the skills table built from the Azure entities (df_skills_az) to CSV
az_skills= './01_data/output/az_skills.csv'
df_skills_az.to_csv(az_skills)

In [32]:
# EDA on azure results: start a D-Tale instance for df_skills_az
# and open it in the browser.
d = dtale.show(df_skills_az)
d.open_browser()

#### Verify the skill extraction results of both AWS and Azure
    1. Randomly choose 10 jobs from the Glassdoor data scientist data set.
    2. Manually identify the skills in those job descriptions.
    3. Output the skill identification results from AWS and Azure respectively.
    4. Compare the manual results with the results of AWS and Azure to choose the better one.

In [144]:
state_num=13
samples=10
# The sample ids are pinned explicitly; the commented-out line shows the
# random draw that state_num and samples parameterise.
#li_sample=df_main['id'].sample(n=samples, random_state=state_num).to_list()
li_sample=[
    4197200540,
    4035778057,
    4163505477,
    4170026582,
    4101129848,
    4147274432,
    4141919159,
    4182008825,
    4121953071,
    4209136866,
]
# Postings of the main table that belong to the sample
df_samples=df_main.loc[df_main['id'].isin(li_sample)]

In [145]:
# Export the sampled postings (df_samples) to a CSV file
# for manual identification of skills
sample_file= './01_data/output/samples.csv'
df_samples.to_csv(sample_file)

In [146]:
# Restrict the AWS and Azure skill tables to the sampled postings
df_samples_aw=df_skills.loc[df_skills['id'].isin(li_sample)]
df_samples_az=df_skills_az.loc[df_skills_az['id'].isin(li_sample)]

In [147]:
# Export the AWS skills of the sampled postings (df_samples_aw) to CSV
aws_file= './01_data/output/sample_aws.csv'
df_samples_aw.to_csv(aws_file)

In [148]:
# Export the Azure skills of the sampled postings (df_samples_az) to CSV
az_file= './01_data/output/sample_az.csv'
df_samples_az.to_csv(az_file)