## Introduction
####  Top Skills of DS ( Data Scientists) on GlassDoor and Indeed
This program aims at presenting the top 10 skills of DS listed in job descriptions of glassdoor and indeed. For the detail background, deliverables and processes, please see the readme in this git.

In [1]:
### Load required libraries

In [35]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re
from bs4 import BeautifulSoup
from langdetect import detect

# Disable warning of 3 types
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Other utils
from tqdm import tqdm  # Progress bar

# Azure text analytics service api
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient


# aws comprehend
import boto3
import json

#EDA tools.
import dtale

# Geopy for location
from geopy.geocoders import Nominatim

# nlp text cleaning
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

### Pre-settings

In [3]:
# Set the width to show the column as much as possible.
pd.set_option('display.max_colwidth', 200)

# Disable 3 types of warning
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=(FutureWarning))
warnings.filterwarnings("ignore",category=(RuntimeWarning))

np.random.seed(1337)

### data cleaning
    - Select the jobs of data related, and keep the data scientists' record for analysis.
    - Remove the duplicated records.
    - Convert job description in HTML to text.
    - Store the cleaned data into main table.

In [4]:
outputfile= './01_data/output/datajobs.csv'
datafile='./01_data/input/glassdoor/glassdoor.csv'

if os.path.exists(datafile):
    if os.path.exists(outputfile):
        Reload=input("The processed data exist, do you want to reload it?(y/n)")
    else:
        Reload='y'
        
    # reload the data file, and re-produce the csv of data scientist
    if Reload.lower()=='y':
        try:
            glassdoor=pd.read_csv(datafile)
            print("Shape of source file:", glassdoor.shape)
             # Produce the list of jobs related to data
            # Only keep the non-duplicated records by employer names and jobs.
            # Only keep the first records if there are duplicated. Here I keep the latest one
            # Sort the jobs by posted date ascendingly
            data_jobs=glassdoor[glassdoor['header.jobTitle'].str.contains(' data ',case=False)].sort_values(
                by='header.posted',ascending=False).loc[:,[
                'gaTrackerData.industry',
                'header.employerName',
                'gaTrackerData.jobTitle',
                'job.jobReqId.long',
                'job.description',
                'header.posted',
                'map.country',
                'map.lat',
                'map.lng',
                'map.location']]
            # Keep the first record if the duplicated exist.
            data_jobs['duplicated']=data_jobs.duplicated()
            data_jobs_unique=data_jobs[data_jobs['duplicated']==False].loc[:,[
                'gaTrackerData.industry',
                'header.employerName',
                'gaTrackerData.jobTitle',
                'job.jobReqId.long',
                'job.description',
                'header.posted',
                'map.country',
                'map.lat',
                'map.lng',
                'map.location']]
            data_jobs_unique.to_csv('./01_Data/Output/datajobs.csv')
            print("Shape of jobs related to data:", data_jobs_unique.shape)
            # Assign id into each posted position for the coming identification
            # Remove all html tag, and convert each requirements into one item for every posted position.
            jobs = pd.DataFrame(
                columns=[
                    'posting_date',
                    'description',
                    'title',
                    'country',
                    'employer',
                    'industry',
                    'id',
                    'source',
                    'lat',
                    'lng',
                    'location']
            )
            #for i in tqdm(range(len(data_jobs_unique))):
            
            for i in range(len(data_jobs_unique)):
                
                html_page=data_jobs_unique.iloc[i,4]
                soup = BeautifulSoup(html_page, 'html.parser')
                jobs_list = soup.find_all("li")
                job_text=''
                for job in jobs_list:
                    try:
                        lang = detect(str(job.contents[0]))
                    except:
                        lang = "error"
                # Only handle the position described in English 
                # since this program is solely focusing on English 
                    if lang=='en':
                        job_text=job_text + str(job.contents[0]).lower().split("\r\n")[0]+'.'
 
                # Create df to store the converted job description in text format.
                if job_text!='':
                    
                    jobs=jobs.append(
                        {
                            "posting_date":data_jobs_unique.iloc[i,5],
                            "description":job_text,
                            "title":data_jobs_unique.iloc[i,2],
                            "country":data_jobs_unique.iloc[i,6],
                            "employer":data_jobs_unique.iloc[i,1],
                            "industry":data_jobs_unique.iloc[i,0],
                            "id":data_jobs_unique.iloc[i,3],
                            "source":"Glassdoor",
                            "lat":data_jobs_unique.iloc[i,7],
                            "lng":data_jobs_unique.iloc[i,8],
                            "location":data_jobs_unique.iloc[i,9]
                        },
                                      ignore_index=True) 
            # Prevent the issue of 'utf-8' encoding.    
            jobs['description'] = jobs['description'].apply(lambda x: 
                                                            x.encode('ascii', 'ignore').decode('ascii'))
            jobs.to_csv(outputfile)
        except Exception as e:
            print("Failed to read the data file due to error:%s, please check the file or path!" %e)
    else:
        jobs=pd.read_csv(outputfile)

The processed data exist, do you want to reload it?(y/n)y
Shape of source file: (165290, 163)
Shape of jobs related to data: (7347, 10)


In [5]:
# Select data scientist jobs
df_main=jobs[jobs['title'].str.contains(r'^(?=.*data)(?=.*scientist)',case=False)]

Unnamed: 0,posting_date,description,title,country,employer,industry,id,source,lat,lng,location
2,"Sep 5, 2019",interesse an innovativen technologie-themen.,Young Professional Consultant (w/m/d) Analytics / Data Scientist,,Camelot ITLab GmbH,Consulting,4.148184e+09,Glassdoor,50.9381,6.9571,Cologne
3,"Sep 5, 2019",interesse an innovativen technologie-themen.,Young Professional Consultant (w/m/d) Analytics / Data Scientist,,Camelot ITLab GmbH,Consulting,4.148184e+09,Glassdoor,49.4878,8.4663,Mannheim
6,"Sep 27, 2019","*artificial intelligence/ machine learning data scientist**.*main duties and responsibilities of the job**.investigating the stateoftheart methods in nlp, machine learning and ai.applying nlp meth...",AI/ML Data Scientist,United Kingdom,Tec Partners,,4.203393e+09,Glassdoor,51.4833,-0.1167,"Charing Cross, England"
13,"Sep 23, 2019",demonstrable handson experience in deploying and maintaining machine learning models in production environments.familiar with offline (batch) and online (live/stream) data pipelines.advanced pytho...,Senior Data Scientist,United Kingdom,Wade Macdonald,Staffing & Outsourcing,4.202983e+09,Glassdoor,51.4333,-1.0000,"Reading, England"
18,"Sep 19, 2019","performing fundamental research work on applying deep learning to different types of data (categorical, temporal, etc.)..constructing data pipelines and applying data processing, cleansing and int...",Junior Data Scientist,Canada,Giatec,Electrical & Electronic Manufacturing,4.136718e+09,Glassdoor,45.4167,-75.7000,Ottawa
...,...,...,...,...,...,...,...,...,...,...,...
3308,"Jul 11, 2019",challenging and exciting projects for renowned clients.possibility to develop internationally in cooperation with our offices in cologne and paris.office location in the centre of paris.strong tea...,INTERN - Data Scientist (m/w/d),FR,respondi sarl,,4.121410e+09,Glassdoor,48.8667,2.3333,Paris
3313,"Aug 9, 2019","ms/bs in cs/ee, mathematical or machine learning related disciplines, with 10 or more years of experience.solid understanding ofprobability, statistics, machine learning, data science.a/b testing ...",Principal Applied Data Scientist,India,Microsoft,Computer Hardware & Software,4.080983e+09,Glassdoor,12.9670,77.5873,"Bangalore, Karnataka"
3314,"Aug 9, 2019","ms/bs in cs/ee, mathematical or machine learning related disciplines, with 10 or more years of experience.experience leading a team of applied data scientists.solid understanding ofprobability, st...",Principal Applied Data Scientist Manager,India,Microsoft,Computer Hardware & Software,4.080983e+09,Glassdoor,12.9670,77.5873,"Bangalore, Karnataka"
3328,"Aug 23, 2019",agiler workflow.,Mitarbeiter Data Scientist (w/m/d),DE,eClever Entwicklungs OHG,,4.146647e+09,Glassdoor,51.0517,13.7369,Dresden


### Fill / Standardize the country names

- Those job postings without countries can find out countries by:
    - From the job posting who has the same locations, but the country is NOT empty.
    - Based on Location to look for the country names.

- The short names of countries will be converted to full names based on the mapping of glassdoor's table.

In [6]:
# Identify how many jobs'country are empty
len(df_main[df_main['country'].isnull()])

370

In [7]:
# Create the mapping table for those which has country names and locations
a_city=df_main[df_main['country'].isnull()==False]\
    [['country','location']].apply(lambda x: (x.iloc[0],x.iloc[1]),axis=1).unique()

In [8]:
# Create replacing functions to 
# 1) return 1st element if 2nd element in array is equal to target string
# 2) Return empty if target string could be not found
def map_replace(a_source=[],s_target=''):
    for item in a_source:
        if str(item[1]).strip().lower()==s_target.strip().lower():
            return item[0]
    return None

In [9]:
# Fill the country names
df_main['country']=df_main.apply(lambda 
                                 x: map_replace(a_city,x.iloc[10]) if pd.isna(x.iloc[3]) else x.iloc[3],axis=1)

In [10]:
# Check how many postings without country names still are left
len(df_main[df_main['country'].isnull()==True])

148

In [11]:
# Import country mapping table for short names' conversion
countryfile='./01_data/input/glassdoor/country_names_2_digit_codes.csv'
df_country=pd.read_csv(countryfile)

# Create function to get and standardize the country name
def get_country(country='',lat='0',lng='0',city=''):
    try:
        country_name=''
        
        # country name's shortname to full name
        if len(country)<=3 and len(country)>1:
            country_name=df_country[df_country['Code'].str.lower()==country.lower()]['Name']
            
            if not country_name.empty:
                
                return country_name
            
        else:
        # if country name does not exist, look for country name by geo location (latitude, longitude)
            if country=='':
                if (lat!='0' and lng!='0'):
                    # initialize Nominatim API 

                    geolocator = Nominatim(user_agent="geoapiExercises")

                    # Latitude & Longitude input

                    location = geolocator.reverse(lat+","+lng,language='en')
                    country_name = location.raw['address'].get('country', '')
                    if country_name !='':
                        return country_name
        # if no geo location, search for country name by city name
                else:
                    if city !='':
                        geolocator = Nominatim(timeout=10,user_agent="geoapiExercises")
                        #print(city)
                        location = geolocator.geocode(city,language='en')
                        loc_dict = location.raw
                        #print(loc_dict)
                        if loc_dict is not None:
                            if ',' in loc_dict['display_name']:
                                country_name=loc_dict['display_name'].rsplit(',' , 1)[1]
                            else:
                                country_name=loc_dict['display_name']
                            return country_name
            else:
                return country
                    
    except Exception as e:
            print("error:%s" %e)
            print(lat,lng,city,loc_dict)

In [12]:
# Fill the country names based on locations.
df_main['country']=df_main.apply(lambda x: get_country(city=x.iloc[10])
                                 if pd.isna(x.iloc[3]) else x.iloc[3],axis=1)

In [13]:
# Check how many postings without country name are left.
len(df_main[df_main['country'].isna()])

0

In [14]:
# Add 2 Codes for USA and UK given previous mapping table is lack of them
df_temp=pd.DataFrame({'Name':['United Kingdom','United States','Switzerland'],
                      'Code':['UK','USA','CHE']},columns=['Name','Code']
                              )
df_country=df_country.append(df_temp,ignore_index=True)

# Create mapping array for short names to full names
a_name=df_country[['Name','Code']].apply(lambda x: (x.iloc[0],x.iloc[1]),axis=1).unique()

In [15]:
# Display numbers of records which has the short names of country
len(df_main[(df_main['country'].str.len()<=3) & (df_main['country'].str.len()>=2)])

382

In [16]:
# Convert short name of country to full name
df_main['country']=df_main.apply(lambda x: map_replace(a_name,x.iloc[3]) 
                                 if (pd.isna(map_replace(a_name,x.iloc[3]))==False) else x.iloc[3],axis=1)

In [21]:
# Verify what the short names are if these exist.
len(df_main[df_main['country'].str.len()<=3][['country','location']])

0

In [20]:
# Export main table only including the jobs of data scientists
ds_file= './01_data/output/datascientists.csv'
df_main.to_csv(ds_file)

Executing shutdown due to inactivity...


2022-05-12 14:56:00,915 - INFO     - Executing shutdown due to inactivity...


Executing shutdown...


2022-05-12 14:56:01,071 - INFO     - Executing shutdown...


Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/dtale/app.py", line 410, in shutdown
    shutdown_server()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/dtale/app.py", line 396, in shutdown_server
    raise Runt

2022-05-12 14:56:01,073 - ERROR    - Exception on /shutdown [GET]
Traceback (most recent call last):
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 2077, in wsgi_app
    response = self.full_dispatch_request()
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1525, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1523, in full_dispatch_request
    rv = self.dispatch_request()
  File "/Users/gangli/Library/Python/3.7/lib/python/site-packages/flask/app.py", line 1509, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**req.view_args)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/dtale/app.py", line 410, in shutdown
    shutdown_server()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/dtale/app.py", line 3

In [None]:
# Perform EDA to check main table
d1 = dtale.show(df_main)
d1.open_browser()

### Fill / Standardize the industry name

In [22]:
# Identify how many jobs'industry names are empty
len(df_main[df_main['industry'].isnull()])

143

In [27]:
# Verify whether the industry names could be found from those job postings with industry name and same employer
len(set(df_main[df_main['industry'].isnull()==False]['employer']) &\
                set(df_main[df_main['industry'].isnull()]['employer']))

0

#### To be found way to fill the missing industry names

## Extract skills from job desription

In [45]:
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice
def clean_text(text, ):

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        tokens = tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(characters)))
        return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer=default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords):
        tokens = [w for w in tokenize_text(text) if w not in stop_words]
        return ' '.join(tokens)

    text = text.strip(' ') # strip whitespaces
    text = text.lower() # lowercase
    #text = stem_text(text) # stemming
    text = remove_special_characters(text) # remove punctuation and symbols
    text = remove_stopwords(text) # remove stopwords
    #text.strip(' ') # strip whitespaces again?

    return text

In [46]:
# Create column "description_cln" to store the result of text cleaning
df_main['description_cln']=df_main['description'].apply(lambda x: clean_text(x, ))

In [47]:
df_main['description_cln'].iloc[2]

'artificial intelligence machine learning data scientist main duties responsibilities job investigating stateoftheart methods nlp machine learning aiapplying nlp methods extract insights text email messagesapplying supervised semisupervised machine learning approaches classification large scale datasets cybersecurity domain developing bespoke interactive visualisation tool enable endusers interact machine learning component skills qualifications experience needed 3 years experience visual analytics machine learning data visualisation nlp data scienceexpertise natural language processing visualisationsupported active learningminimum 3 years research machine learning information visualisationproven industry experience machine learning information visualisationproficient pythonexperience research development large scale projects related cybersecurityexperience using machine learning libraries nlp big data spacy gensim nltk sparkml experience developing bespoke interactive data visualisati

In [38]:
comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')
                
#text = "It is raining today in Seattle"

print('Calling DetectKeyPhrases')
#df=pd.DataFrame()
df_list=[]
i=2
if i==2:
    if len(df_main["description"].iloc[i])<=5000: #AWS' limitation on one request
        dump_json=json.dumps(comprehend.detect_key_phrases(Text=df_main["description"].iloc[i]
                                                       , LanguageCode='en'), sort_keys=True, indent=4)
        df_phrases=pd.json_normalize(json.loads(dump_json)['KeyPhrases'])
        df_phrases['id']=df_main["id"].iloc[i]
        df_list.append(df_phrases)

    else:
        None # to be handled
        
df=pd.concat(df_list)
print('End of DetectKeyPhrases\n')

2022-05-12 17:39:09,154 - INFO     - Found credentials in shared credentials file: ~/.aws/credentials


Calling DetectKeyPhrases
End of DetectKeyPhrases



In [39]:
df

Unnamed: 0,BeginOffset,EndOffset,Score,Text,id
0,1,33,0.803214,artificial intelligence/ machine,4203393000.0
1,43,57,0.802882,data scientist,4203393000.0
2,61,93,0.994325,main duties and responsibilities,4203393000.0
3,97,104,0.999277,the job,4203393000.0
4,121,146,0.98772,the stateoftheart methods,4203393000.0
5,150,175,0.753794,"nlp, machine learning and",4203393000.0
6,188,199,0.879215,nlp methods,4203393000.0
7,211,219,0.996371,insights,4203393000.0
8,225,229,0.999825,text,4203393000.0
9,238,261,0.916269,email messages.applying,4203393000.0


In [48]:
comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')
                
#text = "It is raining today in Seattle"

print('Calling DetectKeyPhrases')
#df=pd.DataFrame()
df_list=[]
i=2
if i==2:
    if len(df_main["description_cln"].iloc[i])<=5000: #AWS' limitation on one request
        dump_json=json.dumps(comprehend.detect_key_phrases(Text=df_main["description_cln"].iloc[i]
                                                       , LanguageCode='en'), sort_keys=True, indent=4)
        df_phrases=pd.json_normalize(json.loads(dump_json)['KeyPhrases'])
        df_phrases['id']=df_main["id"].iloc[i]
        df_list.append(df_phrases)

    else:
        None # to be handled
        
df_cln=pd.concat(df_list)
print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
End of DetectKeyPhrases



In [49]:
df_cln

Unnamed: 0,BeginOffset,EndOffset,Score,Text,id
0,0,31,0.871532,artificial intelligence machine,4203393000.0
1,41,55,0.847129,data scientist,4203393000.0
2,56,88,0.814921,main duties responsibilities job,4203393000.0
3,103,124,0.996263,stateoftheart methods,4203393000.0
4,125,136,0.555494,nlp machine,4203393000.0
5,146,168,0.88527,aiapplying nlp methods,4203393000.0
6,177,185,0.955149,insights,4203393000.0
7,186,213,0.814103,text email messagesapplying,4203393000.0
8,214,247,0.648581,supervised semisupervised machine,4203393000.0
9,257,267,0.48463,approaches,4203393000.0


### Extract skills from AWS

In [None]:
sample_frac=1 # % of total records for sample processing.
cf_score=0.4  # confidence score threshold for key phrases

# Call AWS comprehend to extract key phrases

comprehend = boto3.client(service_name='comprehend', region_name='us-east-2')
                
#text = "It is raining today in Seattle"

print('Calling DetectKeyPhrases')
#df=pd.DataFrame()
df_list=[]

for i in range(round(sample_frac*len(df_main))):
    if len(main_df["description"].iloc[i])<=5000: #AWS' limitation on one request
        dump_json=json.dumps(comprehend.detect_key_phrases(Text=df_main["description"].iloc[i]
                                                       , LanguageCode='en'), sort_keys=True, indent=4)
        df_phrases=pd.json_normalize(json.loads(dump_json)['KeyPhrases'])
        df_phrases['id']=main_df["id"].iloc[i]
        df_list.append(df_phrases)

    else:
        None # to be handled
        
df=pd.concat(df_list)
print('End of DetectKeyPhrases\n')

In [None]:
# Generate the dataframe of skills
df_skills=pd.DataFrame(df[df['Score']>=cf_score][['id','Text']])
df_skills.columns=['id','skill']
df_skills['type']=''


In [None]:
df_skills=df_skills.groupby(['skill']).count()[df_skills.groupby(['skill']).count()['id']>20].sort_values('id',ascending=False)

In [None]:
d1 = dtale.show(df_skills)
d1.open_browser()

In [53]:
# Call Azure text analytics to identify name entities

credential = AzureKeyCredential("7ad266f033314fc2a29eb6f1da7e0a74")
endpoint="https://topskills.cognitiveservices.azure.com/"

text_analytics_client = TextAnalyticsClient(endpoint, credential)

#df_list=[]
df_list=pd.DataFrame(columns=['id','skill','category','confidence score'])
i=2
if i==2:
    documents = [item for item in df_main['description_cln'][i:i+1]]
    
    response = text_analytics_client.recognize_entities(documents, language="en")
    result = [doc for doc in response if not doc.is_error]
    #print(result)
    for doc in result:
        #print(doc)
        for entity in doc.entities:
            df_list=df_list.append({'id':df_main['id'].iloc[i],
                                     'skill':entity.text,
                                    'category':entity.category,
                                    'confidence score':entity.confidence_score},ignore_index=True)
            #df_list.append(df_phrases)
        
df_list

2022-05-12 17:54:25,340 - INFO     - Request URL: 'https://topskills.cognitiveservices.azure.com/text/analytics/v3.2-preview.2/entities/recognition/general?stringIndexType=UnicodeCodePoint'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '1223'
    'Accept': 'application/json, text/json'
    'x-ms-client-request-id': '818f2ed6-d1d9-11ec-8b90-acde48001122'
    'User-Agent': 'azsdk-python-ai-textanalytics/5.2.0b3 Python/3.7.9 (Darwin-21.4.0-x86_64-i386-64bit)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request
2022-05-12 17:54:27,221 - INFO     - Response status: 200
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/json; charset=utf-8'
    'csp-billing-usage': 'REDACTED'
    'x-envoy-upstream-service-time': '91'
    'apim-request-id': 'a22130c6-f509-47ed-87ce-e01611a416e1'
    'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload'
    'x-content-type-opt

Unnamed: 0,id,skill,category,confidence score
0,4203393000.0,artificial intelligence,Skill,0.88
1,4203393000.0,machine learning,Skill,0.64
2,4203393000.0,data scientist,PersonType,0.81
3,4203393000.0,machine learning,Skill,0.65
4,4203393000.0,machine,Skill,0.64
5,4203393000.0,cybersecurity,Skill,0.66
6,4203393000.0,interactive,Skill,0.58
7,4203393000.0,machine learning,Skill,0.64
8,4203393000.0,3 years,DateTime,0.8
9,4203393000.0,visual analytics,Skill,0.89


In [None]:
main_df['description'][0:20]

In [None]:
sample_frac=0.01 # % of total records for sample processing.
cf_score=0.8  # confidence score threshold for key phrases
# Call Azure text analytics to identify name entities

credential = AzureKeyCredential("7ad266f033314fc2a29eb6f1da7e0a74")
endpoint="https://topskills.cognitiveservices.azure.com/"

text_analytics_client = TextAnalyticsClient(endpoint, credential)

#df_list=[]
df_list=pd.DataFrame(columns=['id','skill','category','confidence score'])
for i in range(round(sample_frac*len(main_df))):
    documents = [item for item in main_df['description'][i:i+1]]
    
    response = text_analytics_client.recognize_entities(documents, language="en")
    result = [doc for doc in response if not doc.is_error]
    #print(result)
    for doc in result:
        #print(doc)
        for entity in doc.entities:
            df_list=df_list.append({'id':main_df['id'].iloc[i],
                                     'skill':entity.text,
                                    'category':entity.category,
                                    'confidence score':entity.confidence_score},ignore_index=True)
            #df_list.append(df_phrases)
        
df_list

In [None]:
d = dtale.show(df_list)
d.open_browser()

In [None]:
d0 = dtale.show(df_main)
d0.open_browser()