# This notebook contains the cleaning code for the Indeed Dataset

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import re
import json
import time

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import dtale

# import aws comprehend
import boto3

In [5]:
def clean_html(value):
    soup = BeautifulSoup(value, 'html.parser')
    desc = soup.find(id='jobDescriptionText').text.replace("\n", " ").replace("  ", " ")
    for word in stopwords.words('english'):#STOPWORDS:
        desc = desc.lower().replace(" " + word + " ", " ")
    
    return desc


def preprocessing(filename, file_format):
    if file_format == 'csv':
        df = pd.read_csv(filename)
        df['country'] = df['country']
        df['posting_date'] = pd.to_datetime(df['crawl_timestamp']).dt.date
        df['title'] = df['job_title']
        df['employer'] = df['company_name']
        df['id'] = df['uniq_id']
        df['industry'] = df['industry']
        df['description'] = df['job_description'].apply(clean_html)
    else:
        df = pd.read_json(filename, lines=True)
        df['country'] = 'India'
        df['posting_date'] = pd.to_datetime(df['post_date']).dt.date
        df['employer'] = df['company_name']
        df['title'] = df['job_title']
        df['id'] = df['uniq_id']
        df['industry'] = df['category']
        df['description'] = df['job_description'].str.lower().apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('english'))]))
    
    df['source'] = 'Indeed'
        
    return df[['posting_date', 'description', 'title', 'employer', 'industry', 'id', 'source', 'country']]

In [8]:
filename = "./01_data/marketing_sample_for_trulia_com_real_estate_20190901_20191031_30k_data.csv"
filename2 = "./01_data/marketing_sample_for_indeed_co_in-indeed_co_in_job__20201101_20201231__5k_data.ldjson"

In [9]:
pd.read_csv(filename)

Unnamed: 0,job_title,job_description,job_type,categories,location,city,state,country,zip_code,address,...,employer_phone,employer_logo,companydescription,employer_location,employer_city,employer_state,employer_country,employer_zip_code,uniq_id,crawl_timestamp
0,Shift Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_squar...,Del Taco is an American quick service restaura...,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345.0,511f9a53920f4641d701d51d3589349f,2019-08-24 09:13:18 +0000
1,Operations Support Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Atlanta, GA 30342",Atlanta,GA,United States,30342,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Based in Atlanta, FOCUS Brands Inc. is an inno...",,,,United States,,4955daf0a3facbe2acb6c429ba394e6d,2019-09-19 08:16:55 +0000
2,Senior Product Manager - Data,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Chicago, IL",Chicago,IL,United States,,,...,,,Vibes Corp. reputation was built and establish...,,,,United States,,a0e0d12df1571962b785f17f43ceae12,2019-09-18 02:13:10 +0000
3,Part-Time Office Concierge,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Festus, MO",Festus,MO,United States,,,...,,,,,,,United States,,56e411fd731f76ac916bf4fb169250e9,2019-10-24 16:39:13 +0000
4,Print & Marketing Associate,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Staples is The Worklife Fulfillment Company, h...","Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404.0,3fff5c0ad6981bf4bff6260bd5feab63,2019-08-24 22:29:10 +0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29997,Bilingual Teller,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Lakewood, CO 80226",Lakewood,CO,United States,80226,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,Moneytree is a family-run business that has be...,,,,United States,,db18d0e2de28df6e1e605a3800ffd574,2019-10-25 22:42:37 +0000
29998,"Rental Consultant - Harrison, OH","<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Harrison, OH 45030",Harrison,OH,United States,45030,,...,,,,,,,United States,,47f8d7f320223b6f043ef5c345676902,2019-10-25 16:56:18 +0000
29999,Product Support Specialist 5,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Melbourne, FL 32940",Melbourne,FL,United States,32940,,...,,,,,,,United States,,d2a18fb9f52495aa7876d4470e5acc61,2019-10-25 01:39:48 +0000
30000,Marketing Strategist (Contract),"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"San Diego, CA",San Diego,CA,United States,,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,Ajinomoto Bio-Pharma Services is a fully integ...,,,,United States,,ce699d2e94a52dfebb13bcf70ed5ffd7,2019-08-23 15:36:47 +0000


In [10]:
pd.read_json(filename2, lines=True)

Unnamed: 0,uniq_id,crawl_timestamp,url,job_title,category,company_name,city,state,country,post_date,...,dataset,is_remote,postdate_in_indexname_format,fitness_score,apply_url,company_description,test_contact_email,contact_email,inferred_salary_from,inferred_salary_to
0,4c25220299c8d5b46e14ea657ab4b062,2020-12-16 04:31:07 +0000,https://www.indeed.co.in/viewjob?jk=8e946b148b...,"Product Manager, RatingsXpress®",Engineering,S&P Global,Oragadam,Tamil Nadu,IN,2020-12-16,...,[job_board],false,2020.12.11,10,,,,,,
1,687c07a006634d1541ed2ae90d90cce5,2020-11-19 05:10:38 +0000,https://www.indeed.co.in/viewjob?jk=e8de0f5475...,GS Finance Capacity Analysis Lead,Human Resources,The Boston Consulting Group,Gurgaon,Haryana,IN,2020-11-19,...,[job_board],false,2020.11.11,10,https://sjobs.brassring.com/TGnewUI/Search/hom...,YOU BRING (EXPERIENCE & QUALIFICATIONS),,,,
2,376829fb851d36630d75646c1e638a91,2020-11-19 19:46:25 +0000,https://www.indeed.co.in/viewjob?jk=68b3006d27...,Site Reliability Engineer,Engineering,"JPMorgan Chase Bank, N.A.",Hyderabad,Telangana,IN,2020-11-18,...,[job_board],false,2020.11.11,10,https://jpmc.fa.oraclecloud.com/hcmUI/Candidat...,We recognize that our people are our strength ...,,,,
3,4196e3559b41ae3462eebe83d464157f,2020-12-07 12:03:44 +0000,https://www.indeed.co.in/viewjob?jk=a6ebde7e00...,Content Writing Internship,Social Media,Amatra Hotels & Resorts,Delhi,Delhi,IN,2020-12-07,...,[job_board],false,2020.12.01,10,,True to our name with roots in ancient Indian ...,,,,
4,f725bd324d3ace57f3feeb6118962f12,2020-12-16 13:07:22 +0000,https://www.indeed.co.in/viewjob?jk=b9c3915d13...,Recruitment Professional | Part-time | Incenti...,Bpo,Spade Solutions,Patna,Bihar,IN,2020-12-16,...,[job_board],true,2020.12.11,10,,Your job involves –,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,c3a45601b6e0c64d15efe720249d1aec,2020-12-30 10:20:17 +0000,https://www.indeed.co.in/viewjob?jk=cbfc37e2c7...,Sales Executive,Sales,Goodwill wealth management Pvt Ltd,Bhandup,Maharashtra,IN,2020-12-30,...,[job_board],false,2020.12.22,10,,Research and identify new market opportunities.,,,,
4996,a3dd26cc5844a829f27f8188392034ee,2020-12-19 09:50:56 +0000,https://www.indeed.co.in/viewjob?jk=9ae8ecde05...,Tele Caller Executive,Sales,Tejarat Marketing,Gujarat,,IN,2020-12-19,...,[job_board],false,2020.12.11,9,,Job Type: Full-time,,,,
4997,8b4844ab2f8ea987fb546a37a9c341c3,2020-12-24 08:10:39 +0000,https://www.indeed.co.in/viewjob?jk=3d08cbe253...,IT Quality Analyst 2 - C10,Manufacturing,Citi,Noida,Uttar Pradesh,IN,2020-12-24,...,[job_board],false,2020.12.22,10,,Qualifications:,,,,
4998,936e9bc045dc74ded3c809c261640c7d,2020-11-26 01:42:56 +0000,https://www.indeed.co.in/viewjob?jk=40343cdada...,Electrical Engineer,Electrical,AJ Career,Pune,Maharashtra,IN,2020-11-25,...,[job_board],false,2020.11.22,10,https://private-jobs.fresherslive.com/job/aj-c...,,,,,


In [None]:
usa = preprocessing(filename, file_format='csv')
india = preprocessing(filename2, file_format='json')

In [None]:
usa.sample(5)

In [None]:
india.sample(5)

In [None]:
indeed_master = pd.concat([usa, india])

In [None]:
indeed_master.to_csv('./indeed_master.csv', index=False)

In [None]:
# select data related jobs
indeed_main = indeed_master[indeed_master['title'].str.contains('(?i)(data engineer|machine learning|data scientist|data analyst)', regex=True)]

In [None]:
# peek into job titles
indeed_main['title'].unique()

In [None]:
# export data related jobs
indeed_main.to_csv('./indeedjobs.csv', index=False)

In [None]:
# export data employers-industry mapping to assist with Glassdoor missing industries
indeed_main[['employer', 'industry']].drop_duplicates().to_csv('./employer_industry.csv', index=False)

#### Output Employer-Industry for ALL Employers

In [None]:
indeed_master[['employer', 'industry']].dropna().drop_duplicates().to_csv('./all_employer_industry.csv', index=False)

### AWS Key Phrase Extraction

In [147]:
indeed_aws = indeed_main[indeed_main['description'].apply(lambda x: len(x.encode('utf-8'))) <= 5000]

In [None]:
sample_frac = 1 # % of total records for sample processing.
cf_score = 0.8  # confidence score threshold for key phrases

# Call AWS comprehend to extract key phrases
s3 = boto3.Session(
    aws_access_key_id="keyhere",
    aws_secret_access_key="keyhere"
)

comprehend = s3.client(service_name='comprehend', region_name='us-east-2')

print('Calling DetectKeyPhrases')

df_list = []

for i in range(round(sample_frac * len(indeed_main))):
    if len(indeed_main["description"].iloc[i].encode('utf-8')) <= 5000:  # AWS' limitation on one request
        dump_json = json.dumps(comprehend.detect_key_phrases(Text=indeed_main["description"].iloc[i], 
                                                           LanguageCode='en'), sort_keys=True, indent=4)
        df_phrases = pd.json_normalize(json.loads(dump_json)['KeyPhrases'])
        df_phrases['id'] = indeed_main["id"].iloc[i]
        df_list.append(df_phrases)
    else:
        continue # to be handled
        
df = pd.concat(df_list)
print('End of DetectKeyPhrases\n')

In [97]:
df

Unnamed: 0,BeginOffset,EndOffset,Score,Text,id
0,1,16,0.994855,job description,3705b6bdceec9a65cae63741ecff9989
1,19,31,0.988749,bank america,3705b6bdceec9a65cae63741ecff9989
2,40,107,0.899997,senior data scientist analytics team digital t...,3705b6bdceec9a65cae63741ecff9989
3,115,142,0.999753,wholesale credit technology,3705b6bdceec9a65cae63741ecff9989
4,145,159,0.987955,scope projects,3705b6bdceec9a65cae63741ecff9989
...,...,...,...,...,...
92,2634,2646,0.855729,master’s/phd,076196ef66cd3e231bd0d1e97eed18db
93,2648,2658,0.973829,statistics,076196ef66cd3e231bd0d1e97eed18db
94,2660,2749,0.812369,economics quantitative discipline database mar...,076196ef66cd3e231bd0d1e97eed18db
95,2750,2774,0.827273,newer emerging languages,076196ef66cd3e231bd0d1e97eed18db


In [98]:
# Generate the dataframe of skills
df_skills = pd.DataFrame(df[df['Score'] >= cf_score][['id','Text']])
df_skills.columns = ['id','skill']
df_skills['type'] = ''

In [99]:
df_skills.groupby(['skill']).count()[df_skills.groupby(['skill']).count()['id']>10].sort_values('id',ascending=False)

Unnamed: 0_level_0,id,type
skill,Unnamed: 1_level_1,Unnamed: 2_level_1
ability,108,108
data,94,94
world,68,68
role,56,56
company,47,47
...,...,...
globe,11,11
findings,11,11
economics,11,11
data scientists,11,11


### Azure Named Entities Recognition (NER)

In [258]:
key = "keyhere"
endpoint = "https://mads-milestone1-ds-skills.cognitiveservices.azure.com/"

from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# Authenticate the client using your key and endpoint 
def authenticate_client():
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, 
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

# Example function for recognizing entities from text
def entity_recognition(client, df):
    """
    Args:
        client: object. Azure API
        df: DataFrame. Contains data
        
    Returns:
        df_list: DataFrame. Contains skills extracted from description
    """
    df_list=pd.DataFrame(columns=['id','skill','category','confidence score'])
    num = 0
    for i in list(df.index):
        if num < 100:
            j = int(len(df['description'].iloc[i]) / 5000) + 1
            txt = []
            
            for x in range(j):
                if j <= 1:
                    txt.append(df['description'].iloc[i])
                else:
                    txt.append(df['description'].iloc[i][x*5000:(x+1)*5000])
                try:
                    documents = [item for item in txt[x:x+1]]
                    result = client.recognize_entities(documents = documents)[0]
                    for entity in result.entities:
                        df_list=df_list.append({'id':df['id'].iloc[i],
                                                 'skill':entity.text,
                                                'category':entity.category,
                                                'confidence score':entity.confidence_score},ignore_index=True)
                except Exception as err:
                    print("Encountered exception. {}".format(err))
        else:
            time.sleep(60)
            num = 0
    
    return df_list

#### Evaluate Azure NER

In [None]:
for desc in indeed_main.description[:10]:
    input("")
    print("\n\n", desc)

 




  job description:  bank america seeking senior data scientist analytics team digital transformation program within wholesale credit technology.  scope projects include descriptive, predictive prescriptive analytics. cultivate team infrastructure support modeling analyzing loan data wholesale credit business lines. develop value-added analytics solutions support revenue generation, business insight, risk management, operational efficiency, regulatory compliance, loan pricing, pre post-sale loan facility research. solutions must overcome issues complex data (e.g., vldb, multi-structured, "big data", etc.) well deployment advanced techniques (e.g., machine learning, text mining, statistical analysis, etc.) deliver insights. determine best practices areas data science machine learning. facilitate use technologies within loan business loan system applications including precision lender wholesale credit business processes. liaise global banking markets enterprise level teams determine bes

 




 senior data engineer overview: weedmaps looking senior data engineer help architect, build, scale, maintain world-class data platform. seeking qualified individual eager ensure ability store, process, analyze data keeps business growth ambitions. would early hire fast growing startup significant impact design implementation engineering vision company team grows. impact make: enterprise-wide data modeling database design, processes procedures data governance, change management, taxonomy management master data management. working engineering teams data sme, provide architecture design, development oversight assistance ensuring holistic data strategy scalable data flow architecture. produce documentation standards guidelines technical functional audiences, data flow architecture documentation related platform services. establishing managing ongoing maturity company's business glossary, driving consistent end-to-end data driven integration strategy. develops metadata management reposito

 




  blue skies, data analyst primarily responsible successful completion project activities directed project manager. project activities include understanding customers’ business order complete requirements analysis definition, design solution, prototypes testing, deployment solution post go-live support. data analyst works direction engagement project manager may also required lead small medium engagements. data analyst’s responsibilities fall three major categories: delivery, practice development, sales & marketing. delivery sustain average billable utilization 100% maintain high level customer satisfaction exceeding client expectations self-manage completion assigned activities practice development participate proactively corporate initiatives provide operational support blue skies creation refinement deliverables processes (focused project delivery) sales / sales collaboration & marketing provide sales & business development support assigned maintain expected levels activity within

 




  overview  position reports director compliance operations and, responsible acquire, manage, analyze data report results trends associated compliance risk enable company quickly precisely align resources. responsibilities include items ensuring data quality, identifying resolving issues data logical mathematical nature, identification trends large datasets; develop automation, reporting self-service capabilities duties assigned. ability source data point sale systems associated databases/data marts needed pre-examination, examination, customer remediation reporting package needs.  responsibilities  analyze data associated customer complaints, issues, regulatory examinations, compliance testing findings along associated products/services, financial data, servicing channel geographical data identify shifts trends level within enterprise hierarchy vertical horizontal views. build produce reporting key risk indicators trends associated compliance risk. take active part development conti

 




   req. id: 144857  micron technology’s vision transform world uses information enrich life commitment people, innovation, tenacity, collaboration, customer focus allows us fulfill mission global leader memory storage solutions. means conducting business integrity, accountability, professionalism supporting global community.  smart manufacturing & ai data scientist micron technology, inc., work world-class team data engineers, business intelligence experts, software engineers discover data insights unobservable traditional business solutions. draw broad background data-mining techniques mathematics, statistics, information technology, machine learning, data engineering, design experiments, visualization, text mining discover insightful patterns data. position, take exciting projects across enterprise business processes bring data science methods opportunities sales, supply chain, finance, corporate strategy, marketing, business areas.  responsibilities include, limited to:  strong un

 




   description customer data analyst bmw responsible providing expertise, insight innovation area data management, acquisition enhancement goal providing new insights recommending best practices health digital aspects bmw’s customer experience. incumbent responsible analyzing understanding online marketing, website traffic offline data, partnering various teams grow improve customer experience throughout journey. ensuring data quality, segmentation security data digitally place. ideal candidate work develop strategic data enrichment programs optimize available data targeted marketing activities customer centric platform. duties include analysis quality data capture techniques, educating internal customers relevance data business, championing strategies, identifying data gaps developing best practices identify data opportunities easy report effective digital measurement. incumbent also partner internal teams improve data integrity help develop new innovative ways impact positive custo

 




   managerial direction, wide latitude independent judgment initiative, business data analyst manager (bdam) responsible identifying troubleshooting various issues technical business resolution escalation, appropriate. bdam may responsibility supervising analytical staff. specific duties responsibilities include limited to: collects analyzes statistical quantitative data, employing standard data collection statistical techniques, order produce summary descriptive projective results agency decision making. monitors, reviews, controls coordinates subordinates’ activities, assignments, projects cases personal observation, review statistical narrative data, reports, correspondence. compare work completed milestone expectations, meet program unit objectives. prepare, distribute, schedule work activities assignments based knowledge nature duration activities assignments performed. use basic scheduling techniques, desires abilities staff possible, order provide timetable efficient completio

 




  analyst update internal tracking, create reporting respond escalations within defined responsibilities. main duties / responsibilities build & qa billing/pricing element tables (interchange, auth, data capture, etc.). provide assistance feedback senior data analyst(s) respond operations, product sales escalations. manage card brand 3rd party invoices; query monthly invoices generate p&l reports coordination it/dba commissions/finance teams. build maintain cross-processor interchange code billing descriptor mapping grids (us & canada regions). analyze impact card brand updates/changes production pricing billing elements. develop portfolio specific reporting/metrics using business intelligence tools e.g. ms power bi.  requirements bachelor’s degree preferred. 3+ years financial and/or payments acquiring position. acquiring and/or payment processing industry experience preferred. good understanding clearing settlement concepts (visa, mastercard & pin debit networks). strong proficienc

 




  overview  working part dedicated research team heart fine arts division, individual responsible data input consolidation well management object database. / responsibility partner researcher team build robust database, specialists rely integrity data utilize resource potential business getting.  contract position: december 20, 2019 responsibilities  acting department point person newly identified property, information leads come central point input, upkeep development object database fine arts division. building comprehensive complete lists artists’ oeuvres researching full market history current ownerships works art consolidating intelligence internal sources gathering intelligence external sources centralizing existing internal data  jointly manage object database updating pricing ownership daily entering new intelligence expand database  qualifications  prior data analyst experience strongly preferred exceptional excel proficiency proficiency ms word, outlook general familiarity 

 




 expedia carrentals small rapidly growing company within expedia group portfolio premier car rental booking company web. carrentals powers three different brands across multiple points sale single common platform, bringing suppliers customers together find right car best price. carrentals offers phenomenal opportunity work startup-like environment backed resources, benefits, network large, multi-national online e-commerce company.  senior data analyst, carrentals finance tech team, build solutions automate rhythm business functions, apply statistical forecast models support financial operational forecasting, use data science solve problems problem isn’t obvious. cr finance small team big ambitions transform role finance business use data. passionate data analysis? master vb python? find travel fascinating always wondered looks like behind scenes?  position carrentals finance building team technologists objective transforming finance business. looking someone passion using data hypoth

#### Skills Table Generation Using NER

In [260]:
skills = entity_recognition(client, indeed_main)

In [346]:
skills.to_csv('./indeed_skills.csv', index=False)

## Export Data Scientist Jobs

In [None]:
indeed_ds = indeed_main[indeed_main['title'].str.contains('(?i)(data scientist)', regex=True)]
indeed_ds.to_csv('./indeedDSjobs.csv', index=False)

  """Entry point for launching an IPython kernel.


In [None]:
indeed_ds.reset_index(drop=True, inplace=True)

In [None]:
ds_skills = entity_recognition(client, indeed_ds)

In [None]:
ds_skills.to_csv('./indeed_ds_skills.csv', index=False)

In [361]:
ds_df = pd.merge(ds_skills, indeed_ds)
ds_df.sample(5)

Unnamed: 0,id,skill,category,confidence score,posting_date,description,title,employer,industry,source,country
3425,da78601e21da6ecb53eceba6ee6e4301,25 pounds,Quantity,0.8,2019-10-25,job description visa world’s largest consumer...,Senior Data Scientist,Visa,,Indeed,United States
1308,00112c0c01d44c55fb544ca5d8c657e9,datasets,Skill,0.49,2019-10-04,"criteo, building advertising platform choice ...","Internship Data Scientist, Product Analytics, ...",Criteo,,Indeed,United States
2566,968161a5b323139f074d3b31a34702f1,coding programming,Skill,0.8,2019-10-25,position title data scientist location ralei...,Data Scientist,Genworth,,Indeed,United States
3691,35ad5d2a885fbdd752fa8bc8a2327160,natural language processing,Skill,0.7,2020-11-11,overview: passionate applying data science art...,Data Scientist,"ESRI, Inc.",Healthcare,Indeed,India
2020,88504476dce22903b43ac0ada877fd70,r,Skill,0.59,2019-08-25,live objects live objects delivers continuous...,Data Scientist,LIVE OBJECTS,,Indeed,United States
