## Introduction
####  Top Skills of DS ( Data Scientists) on GlassDoor and Indeed
This program aims at presenting the top 10 skills of DS listed in job descriptions of glassdoor and indeed. For the detail background, deliverables and processes, please see the readme in this git.

In [1]:
### Load required libraries

In [5]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re
from bs4 import BeautifulSoup
from langdetect import detect

# Disable warning of 3 types
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Other utils
from tqdm import tqdm  # Progress bar

# Azure text analytics service api
#from azure.core.credentials import AzureKeyCredential
#from azure.ai.textanalytics import TextAnalyticsClient

# aws comprehend
import boto3
import json

### Pre-settings

In [6]:
# Set the width to show the column as much as possible.
pd.set_option('display.max_colwidth', 200)

# Disable 3 types of warning
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=(FutureWarning))
warnings.filterwarnings("ignore",category=(RuntimeWarning))

np.random.seed(1337)

### Pipeline of data cleansing
    - Select the jobs of data scientists.
    - Remove the duplicated records.
    - Convert job description in HTML to text.
    - Store the cleaned data into main table.

In [47]:
outputfile= './01_data/output/datajobs.csv'
datafile='./01_data/input/glassdoor/glassdoor.csv'
countryfile='./01_data/input/glassdoor/country_names_2_digit_codes.csv'


if os.path.exists(datafile):
    if os.path.exists(outputfile):
        Reload=input("The processed data exist, do you want to reload it?(y/n)")
    else:
        Reload='y'
        
    # reload the data file, and re-produce the csv of data scientist
    if Reload.lower()=='y':
        try:
            glassdoor=pd.read_csv(datafile)
            print("Shape of source file:", glassdoor.shape)
             # Produce the list of jobs related to data
            # Only keep the non-duplicated records by employer names and jobs.
            # Only keep the first records if there are duplicated. Here I keep the latest one
            # Sort the jobs by posted date ascendingly
            data_jobs=glassdoor[glassdoor['header.jobTitle'].str.contains(' data ',case=False)].sort_values(
                by='header.posted',ascending=False).loc[:,[
                'gaTrackerData.industry',
                'header.employerName',
                'gaTrackerData.jobTitle',
                'job.description',
                'header.posted',
                'map.country']]
            # Keep the first record if the duplicated exist.
            data_jobs['duplicated']=data_jobs.duplicated()
            data_jobs_unique=data_jobs[data_jobs['duplicated']==False].loc[:,[
                'gaTrackerData.industry',
                'header.employerName',
                'gaTrackerData.jobTitle',
                'job.description',
                'header.posted',
                'map.country'
            ]]
            data_jobs_unique.to_csv('./01_Data/Output/datajobs.csv')
            print("Shape of jobs related to data:", data_jobs_unique.shape)
            # Assign id into each posted position for the coming identification
            data_jobs_unique['position_id']=data_jobs_unique.index
            # Remove all html tag, and convert each requirements into one item for every posted position.
            jobs = pd.DataFrame(
                columns=[
                    'posting_date',
                    'description',
                    'title',
                    'employer',
                    'industry',
                    'id',
                    'source']
            )
            #for i in tqdm(range(len(data_jobs_unique))):
            for i in range(len(data_jobs_unique)):
                html_page=data_jobs_unique.iloc[i,3]
                soup = BeautifulSoup(html_page, 'html.parser')
                jobs_list = soup.find_all("li")
                job_text=''
                for job in jobs_list:
                    try:
                        lang = detect(str(job.contents[0]))
                    except:
                        lang = "error"
                # Only handle the position described in English 
                # since this program is solely focusing on English 
                    if lang=='en':
                        job_text=job_text + str(job.contents[0]).lower().split("\r\n")[0]+'.'
 
                # Create df to store the converted job description in text format.
                if job_text!='':
                    jobs=jobs.append(
                        {
                            "posting_date":data_jobs_unique.iloc[i,4],
                            "description":job_text,
                            "title":data_jobs_unique.iloc[i,2],
                            "country":data_jobs_unique.iloc[i,5],
                            "employer":data_jobs_unique.iloc[i,1],
                            "industry":data_jobs_unique.iloc[i,0],
                            "id":data_jobs_unique.iloc[i,6],
                            "source":"Glassdoor"
                        },
                                      ignore_index=True) 
                
            jobs.to_csv(outputfile)
        except:
            print("Failed to read the data file, please check the file or path!")
    else:
        jobs=pd.read_csv(outputfile)

Shape of source file: (165290, 163)
Shape of Data Scientists Jobs: (7289, 6)
Failed to read the data file, please check the file or path!


In [57]:
jobs[jobs['title'].str.contains(r'^(?=.*data)(?=.*scientist)',case=False)]['title'].unique()

array(['Young Professional Consultant (w/m/d) Analytics / Data Scientist',
       'AI/ML Data Scientist', 'Senior Data Scientist',
       'Junior Data Scientist', 'Delivery Data Scientist',
       'Lead Data Scientist', 'Statistician/ Data Scientist',
       'Senior Data Scientist ASME IL Data Science Group',
       'Lead Research Data Scientist, Oncology R&D, Knowledge Graph team',
       'Stage - 6 mois - Data Scientist Translation H/F - Paris',
       'Senior Data Scientist (Marketplace/Machine Learning)',
       'Research Fellow - Data Scientist',
       'Senior Professional Data Scientist',
       'Senior Data Scientist / Advanced Data Analyst',
       'Manager- Data Scientist',
       'UniCredit Services SCpA / UniCredit Services SCpA - Area Data & Analytics – Data Scientist',
       'Stage - 6 mois - Data Scientist NLP & Exploitation de connaissances H/F - Paris',
       'Senior Data Scientist - [39 Hours]',
       'Intelligence Data Scientist/Developer',
       'Data Scientist 

In [85]:
# To do list:
# Fill the country.
#country_df=pd.read_csv(countryfile)

# Fill the industry

#Pickup the jobs of data scientists
df_main=jobs[jobs['title'].str.contains(r'^(?=.*data)(?=.*scientist)',case=False)]
df_main

Unnamed: 0,posting_date,description,title,employer,industry,id,source,country
2,"Sep 5, 2019",interesse an innovativen technologie-themen.,Young Professional Consultant (w/m/d) Analytics / Data Scientist,Camelot ITLab GmbH,Consulting,152120,Glassdoor,
5,"Sep 27, 2019","*artificial intelligence/ machine learning data scientist**.*main duties and responsibilities of the job**.investigating the stateoftheart methods in nlp, machine learning and ai.applying nlp meth...",AI/ML Data Scientist,Tec Partners,,39596,Glassdoor,United Kingdom
12,"Sep 23, 2019",demonstrable handson experience in deploying and maintaining machine learning models in production environments.familiar with offline (batch) and online (live/stream) data pipelines.advanced pytho...,Senior Data Scientist,Wade Macdonald,Staffing & Outsourcing,110208,Glassdoor,United Kingdom
17,"Sep 19, 2019","performing fundamental research work on applying deep learning to different types of data (categorical, temporal, etc.)..constructing data pipelines and applying data processing, cleansing and int...",Junior Data Scientist,Giatec,Electrical & Electronic Manufacturing,30157,Glassdoor,Canada
18,"Sep 18, 2019","experience with at least one of the ml related technologies (azure ml, mllib, h20, scikit-learn, various r packages).hands on experience with either r or python for both data prep and modeling.dem...",Delivery Data Scientist,Microsoft,Computer Hardware & Software,127407,Glassdoor,India
...,...,...,...,...,...,...,...,...
3275,"Aug 9, 2019","ms/bs in cs/ee, mathematical or machine learning related disciplines, with 10 or more years of experience.solid understanding ofprobability, statistics, machine learning, data science.a/b testing ...",Principal Applied Data Scientist,Microsoft,Computer Hardware & Software,118265,Glassdoor,India
3276,"Aug 9, 2019","ms/bs in cs/ee, mathematical or machine learning related disciplines, with 10 or more years of experience.experience leading a team of applied data scientists.solid understanding ofprobability, st...",Principal Applied Data Scientist Manager,Microsoft,Computer Hardware & Software,15064,Glassdoor,India
3290,"Aug 23, 2019",agiler workflow.,Mitarbeiter Data Scientist (w/m/d),eClever Entwicklungs OHG,,97849,Glassdoor,DE
3291,"Aug 21, 2019","algorithmes de data mining.algorithmes de machine learning supervisés ou non tels que decision tree, random forest, svm, neural network ....",D/ASE/DATASCIENTIST - Ingénieur Data Scientist Expérimenté H/F,Groupe SII,IT Services,126961,Glassdoor,France


### Extract skills from AWS

In [86]:
sample_frac=1 # % of total records for sample processing.
cf_score=0.8  # confidence score threshold for key phrases

# Call AWS comprehend to extract key phrases

comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')
                
#text = "It is raining today in Seattle"

print('Calling DetectKeyPhrases')
#df=pd.DataFrame()
df_list=[]

for i in range(round(sample_frac*len(main_df))):
    if len(main_df["description"].iloc[i])<=5000: #AWS' limitation on one request
        dump_json=json.dumps(comprehend.detect_key_phrases(Text=main_df["description"].iloc[i]
                                                       , LanguageCode='en'), sort_keys=True, indent=4)
        df_phrases=pd.json_normalize(json.loads(dump_json)['KeyPhrases'])
        df_phrases['id']=main_df["id"].iloc[i]
        df_list.append(df_phrases)

    else:
        None # to be handled
        
df=pd.concat(df_list)
print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
End of DetectKeyPhrases



In [87]:
# Generate the dataframe of skills
df_skills=pd.DataFrame(df[df['Score']>=cf_score][['id','Text']])
df_skills.columns=['id','skill']
df_skills['type']=''


In [97]:
df_skills.groupby(['skill']).count()[df_skills.groupby(['skill']).count()['id']>20].sort_values('id',ascending=False)

Unnamed: 0_level_0,id,type
skill,Unnamed: 1_level_1,Unnamed: 2_level_1
experience,402,402
data,390,390
computer science,240,240
machine learning,227,227
machine,218,218
...,...,...
tableau,21,21
predictive modeling,21,21
hands,21,21
<strong,21,21
