In [11]:
import pandas as pd
from pandas import DataFrame

import nltk
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [68]:
# Load the job postings; the path is relative to the notebook's working
# directory. Later cells read (and overwrite) the 'description' column.
job_desc_df = pd.read_csv('filtered_job_desc.csv')

### Additional filtering of the description to make it easier to process
- At a glance, the items in the requirements and job-responsibilities sections tend to be long strings.
- Eliminating short strings is therefore a cheap/lazy yet effective way to trim down the excess.

In [69]:
# Trim each job description down to its long lines.
# Requirement / responsibility items tend to be long, so any line of
# MIN_LINE_LENGTH characters or fewer is treated as noise and dropped.
MIN_LINE_LENGTH = 45


def _keep_long_lines(desc, min_length=MIN_LINE_LENGTH):
    """Return `desc` with its short lines removed.

    The text is split on newlines, only lines strictly longer than
    `min_length` characters are kept, and the survivors are joined with
    single spaces.
    """
    return ' '.join(line for line in desc.split('\n') if len(line) > min_length)


# Assign the whole column in one step instead of writing cell by cell through
# chained indexing (`df['description'].at[i] = ...`): chained assignment is not
# guaranteed to reach the parent frame and is a no-op under Copy-on-Write.
# Mapping over the column also avoids mixing positional reads with label writes.
job_desc_df['description'] = job_desc_df['description'].map(_keep_long_lines)

In [70]:
# One stemmer instance shared by every cell that needs to stem a word.
ps = PorterStemmer()
# Part-of-speech tags worth keeping (Penn Treebank: singular/mass noun,
# adjective, non-3rd-person present verb).
# NOTE(review): plural and proper nouns (NNS / NNP) are not in this list, so
# tokens tagged that way are dropped -- confirm this is intended.
wanted_tag = ['NN', 'JJ', 'VBP']


def preppingJobDesc(desc):
    """Turn a job description into a set of lower-cased word stems.

    The text is tokenized and POS-tagged; only tokens whose tag appears in
    `wanted_tag` are kept. Each kept token is stemmed with `ps` and
    lower-cased, and the distinct results are returned as a set.
    """
    stems = set()
    for word, pos in pos_tag(word_tokenize(desc)):
        if pos in wanted_tag:
            stems.add(ps.stem(word).lower())
    return stems

In [71]:
# Common tools and skills that appeared in many postings at a quick glance.
# Resource: https://towardsdatascience.com/how-to-use-nlp-in-python-a-practical-step-by-step-example-bd82ca2d2e1e
# NOTE(review): the multi-word entries ('power bi', 'google sheet') are later
# compared against sets of single-token stems, so they presumably never
# match -- confirm.
tool_skill = [
    'cloud', 'sql', 'mysql', 'python', 'r', 'tableau', 'power bi', 'looker',
    'powerpoint', 'google sheet', 'gsuite', 'jupyter', 'javascript', 'java',
    'excel', 'statistic', 'model', 'word', 'bi',
]

# Per-posting set of stemmed words, stored as a new column.
job_desc_df['desc_word_set'] = job_desc_df['description'].map(preppingJobDesc)

# stem -> original keyword, used later to turn matched stems back into
# readable names; the key set doubles as the lookup set for matching.
tool_keyword_dict = dict(zip(map(ps.stem, tool_skill), tool_skill))
tool_keyword_set = set(tool_keyword_dict)

{'cloud': 'cloud', 'sql': 'sql', 'mysql': 'mysql', 'python': 'python', 'r': 'r', 'tableau': 'tableau', 'power bi': 'power bi', 'looker': 'looker', 'powerpoint': 'powerpoint', 'google sheet': 'google sheet', 'gsuit': 'gsuite', 'jupyt': 'jupyter', 'javascript': 'javascript', 'java': 'java', 'excel': 'excel', 'statist': 'statistic', 'model': 'model', 'word': 'word', 'bi': 'bi'}


In [72]:
# Flatten the keyword matches of every posting into one list; a posting with
# no matching keyword contributes a single 'N/A' entry instead.
num_of_jobs = len(job_desc_df)
tool_list = []

for row_pos in range(num_of_jobs):
    matched_stems = tool_keyword_set.intersection(
        job_desc_df['desc_word_set'].iloc[row_pos]
    )
    if matched_stems:
        tool_list.extend(matched_stems)
    else:
        tool_list.append('N/A')

In [73]:
# One row per matched keyword (column 'count' holds the stem for now), with
# each stem mapped back to its original keyword via tool_keyword_dict.
df_tool = pd.DataFrame({'count': tool_list}).replace(tool_keyword_dict)

In [74]:
# Count how often each keyword was matched and keep the six most frequent.
# The column is counted as a Series and both output columns are named
# explicitly, rather than relying on the counts column being labelled 0:
# pandas >= 2.0 names the value_counts() result 'count', which collides with
# the existing 'count' column when DataFrame.value_counts() is reset_index()-ed.
df_tool = (
    df_tool['count']
    .value_counts()
    .rename_axis('tool')
    .reset_index(name='count')
    .iloc[:6]
)

In [75]:
# Show the six most frequent entries; the 'N/A' bucket counts postings in
# which none of the keywords matched.
df_tool

Unnamed: 0,tool,count
0,,131
1,excel,80
2,statistic,74
3,model,71
4,cloud,20
5,python,19
