### Simple word frequencies in data_analytics job ads

    Small-scale text-analytics study that assesses word frequencies across data_analytics job ads on LinkedIn to gain insight into the structure of current demand.

#### Reading HTML, stripping content, creating the corpus (a dictionary of title (key) / content (value) pairs)

In [1]:
import urllib
from bs4 import BeautifulSoup
import subprocess
import re
import itertools

In [2]:
# Load the ad URLs, one per line. Blank lines (e.g. a trailing empty line) are
# skipped so they are not handed to the downloader as empty URLs.
with open('jobadds012023.txt', 'r', encoding='utf-8') as f:
    urls = [url for url in map(str.strip, f) if url]

In [3]:
# Number of ads (one URL per ad) in the current corpus.
len(urls)

17

In [4]:
# Build the corpus: one entry per ad, keyed by a whitespace-free form of the
# page title, with the extracted ad text as the value.
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

# The ad text is whatever sits between these two fixed strings in the page's
# flattened body text. DOTALL lets the capture span line breaks that remain
# inside text nodes.
description_pattern = re.compile(r'companyReportReportBackSubmit(.*)Show moreShow', re.DOTALL)

mycorpus={}
for page in urls:
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(page) as response:
        rawhtml=response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup=BeautifulSoup(rawhtml, 'html.parser')
    key=soup.title.string if soup.title is not None else None
    value=soup.body.get_text(strip=True) if soup.body is not None else ''
    match=description_pattern.search(value)
    if key is None or match is None:
        # The page does not have the expected layout: report it and move on
        # instead of failing with AttributeError on None.
        print('skipped (unexpected page layout):', page)
        continue
    # Keep the part of the title before the first ", " and drop all whitespace.
    mykey="".join(key.split(", ")[0].split())
    myvalue = match.group(1)
    # Two ads can share a title; add a numeric suffix rather than silently
    # overwriting the earlier ad.
    unique_key=mykey
    duplicate_no=1
    while unique_key in mycorpus:
        duplicate_no+=1
        unique_key=f"{mykey}_{duplicate_no}"
    mycorpus[unique_key]=myvalue

#### Preprocessing methods (creating relevant tokens from corpus)
    Common steps in preprocessing pipelines:
    1. stop-word removal
    2. tokenisation
    3. POS tagging
    4. lemmatization

In [5]:
import nltk
import os
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer



In [6]:
# Lookup table from POS-tagger tag to the WordNet part of speech handed to the
# lemmatizer. A value of None means "leave the token as it is".
# NOTE(review): 'IN' -> ADV, 'PDT' -> ADJ and 'RP' -> ADJ look unusual; the
# adjective-only helpers will count PDT/RP tokens as adjectives -- confirm intent.
# NOTE(review): the two quote keys are typographic characters; presumably the
# tagger reports quotes with different tag strings -- verify. Unknown tags fall
# back to None through dict.get either way.
tag_map = {
    'CC': None,       # coordinating conjunction: and, but, or
    'CD': wn.NOUN,    # cardinal number: one, two
    'DT': None,       # determiner: a, the
    'EX': wn.ADV,     # existential "there"
    'FW': None,       # foreign word
    'IN': wn.ADV,     # preposition / subordinating conjunction: of, in, by
    'JJ': wn.ADJ,     # adjective: yellow
    'JJR': wn.ADJ,    # comparative adjective: bigger
    'JJS': wn.ADJ,    # superlative adjective: wildest
    'LS': None,       # list item marker: 1, 2, One
    'MD': None,       # modal: can, should
    'NN': wn.NOUN,    # singular or mass noun: llama
    'NNS': wn.NOUN,   # plural noun: llamas
    'NNP': wn.NOUN,   # singular proper noun: IBM
    'NNPS': wn.NOUN,  # plural proper noun: Carolinas
    'PDT': wn.ADJ,    # predeterminer: all, both
    'POS': None,      # possessive ending: 's
    'PRP': None,      # personal pronoun: I, you, he
    'PRP$': None,     # possessive pronoun: your, one's
    'RB': wn.ADV,     # adverb: quickly, never
    'RBR': wn.ADV,    # comparative adverb: faster
    'RBS': wn.ADV,    # superlative adverb: fastest
    'RP': wn.ADJ,     # particle: up, off
    'SYM': None,      # symbol: +, %, &
    'TO': None,       # the word "to"
    'UH': None,       # interjection: ah, oops
    'VB': wn.VERB,    # verb, base form: eat
    'VBD': wn.VERB,   # verb, past tense: ate
    'VBG': wn.VERB,   # verb, gerund: eating
    'VBN': wn.VERB,   # verb, past participle: eaten
    'VBP': wn.VERB,   # verb, non-3rd-person singular present: eat
    'VBZ': wn.VERB,   # verb, 3rd-person singular present: eats
    'WDT': None,      # wh-determiner: which, that
    'WP': None,       # wh-pronoun: what, who
    'WP$': None,      # possessive wh-pronoun: whose
    'WRB': None,      # wh-adverb: how, where
    '$': None,        # dollar sign
    '#': None,        # pound sign
    '“': None,        # opening quotation mark
    '”': None,        # closing quotation mark
    '(': None,        # opening bracket of any kind
    ')': None,        # closing bracket of any kind
    ',': None,        # comma
    '.': None,        # sentence-final punctuation: . ! ?
    ':': None,        # mid-sentence punctuation: : ; ... - --
}

In [7]:
def preprocessing_all(text, tag_map):
    """Return the cleaned, lemmatised tokens of *text*.

    The text is lower-cased and tokenised; English stop words and tokens that
    are not purely alphanumeric are dropped; the remaining tokens are
    POS-tagged. A token whose tag maps to a WordNet part of speech in
    *tag_map* is lemmatised with it; a token whose tag maps to None (or is
    missing from *tag_map*) is kept unchanged.

    Parameters:
        text: the raw text of one document.
        tag_map: dict from POS tag to a WordNet part of speech or None.

    Returns:
        A list of token strings in their original order.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in english_stops
    ]
    tagged_tokens = nltk.pos_tag(kept_tokens)
    lemmatizer = nltk.WordNetLemmatizer()

    result = []
    for word, pos_tag in tagged_tokens:
        wordnet_pos = tag_map.get(pos_tag)
        # No WordNet category for this tag: keep the surface form.
        result.append(word if wordnet_pos is None else lemmatizer.lemmatize(word, wordnet_pos))
    return result

In [8]:
def preprocessing_adj(text, tag_map):
    """Return the lemmatised adjectives found in *text*.

    The text is lower-cased and tokenised; English stop words and tokens that
    are not purely alphanumeric are dropped; the remaining tokens are
    POS-tagged. Only tokens whose tag maps to ``wn.ADJ`` in *tag_map* are
    kept, each lemmatised as an adjective.

    Parameters:
        text: the raw text of one document.
        tag_map: dict from POS tag to a WordNet part of speech or None.

    Returns:
        A list of adjective lemmas in their original order.
    """
    english_stops = set(stopwords.words('english'))
    candidates = []
    for token in nltk.word_tokenize(text.lower()):
        if token.isalnum() and token not in english_stops:
            candidates.append(token)
    tagged_tokens = nltk.pos_tag(candidates)
    lemmatizer = nltk.WordNetLemmatizer()

    adjectives = []
    for word, pos_tag in tagged_tokens:
        wordnet_pos = tag_map.get(pos_tag)
        if wordnet_pos != wn.ADJ:
            continue  # not an adjective according to tag_map
        adjectives.append(lemmatizer.lemmatize(word, wordnet_pos))
    return adjectives

In [9]:
def preprocessing_adj_postadj(text, tag_map):
    """Return adjectives and the tokens that directly follow an adjective.

    The text is lower-cased and tokenised; English stop words and tokens that
    are not purely alphanumeric are dropped; the remaining tokens are
    POS-tagged. A token is kept when its own tag maps to ``wn.ADJ`` in
    *tag_map*, or when the token before it (in the filtered sequence) did.
    Kept tokens are lemmatised with their mapped part of speech, or left
    unchanged when the tag maps to None.

    Parameters:
        text: the raw text of one document.
        tag_map: dict from POS tag to a WordNet part of speech or None.

    Returns:
        A list of token strings in their original order.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in english_stops
    ]
    tagged_tokens = nltk.pos_tag(kept_tokens)
    lemmatizer = nltk.WordNetLemmatizer()

    selected = []
    previous_was_adjective = False
    for word, pos_tag in tagged_tokens:
        wordnet_pos = tag_map.get(pos_tag)
        is_adjective = wordnet_pos == wn.ADJ
        if is_adjective or previous_was_adjective:
            # An adjective always has a part of speech; a follower may not.
            selected.append(word if wordnet_pos is None else lemmatizer.lemmatize(word, wordnet_pos))
        previous_was_adjective = is_adjective
    return selected

#### Extracting insight in the form of simple word and bi-gram frequencies. 

In [10]:
# Ten most frequent tokens across the whole corpus.
# Per-ad token lists, stored under the same keys as mycorpus.
my_tokens = {title: preprocessing_all(body, tag_map) for title, body in mycorpus.items()}
# Pool the per-ad lists into one flat list for corpus-wide counting.
all_tokens = [token for ad_tokens in my_tokens.values() for token in ad_tokens]
cummulative_freqdist_all = nltk.FreqDist(all_tokens)
print(cummulative_freqdist_all.most_common(10))

[('data', 197), ('work', 90), ('team', 56), ('experience', 51), ('business', 50), ('learn', 44), ('company', 39), ('customer', 39), ('opportunity', 36), ('new', 33)]


In [11]:
# Ten most frequent adjectives across the whole corpus.
# Per-ad adjective lists, stored under the same keys as mycorpus.
my_adjtokens = {title: preprocessing_adj(body, tag_map) for title, body in mycorpus.items()}
# Pool the per-ad lists into one flat list for corpus-wide counting.
all_adj_tokens = [token for ad_tokens in my_adjtokens.values() for token in ad_tokens]
cummulative_freqdist_adj = nltk.FreqDist(all_adj_tokens)
print(cummulative_freqdist_adj.most_common(10))

[('new', 33), ('technical', 18), ('big', 16), ('global', 12), ('analytical', 11), ('large', 10), ('strong', 10), ('sexual', 10), ('equal', 9), ('national', 9)]


In [12]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk import ngrams
from collections import Counter

In [13]:
# Frequencies of all bigrams, ignoring bigrams that occur fewer than 2 times in the corpus.
bigram_measures = BigramAssocMeasures()  # not used in this cell; kept so the name stays defined
# Count bigrams ad by ad: from_documents pads between documents, so the last
# token of one ad is never paired with the first token of the next ad (which
# from_words on the flattened all_tokens list would do).
bigrams_all = BigramCollocationFinder.from_documents(my_tokens.values())
bigrams_all.apply_freq_filter(2)
# Most frequent first; ties are broken alphabetically by the bigram itself.
sorted(bigrams_all.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10]

[(('machine', 'learn'), 27),
 (('data', 'science'), 18),
 (('big', 'data'), 11),
 (('data', 'engineer'), 10),
 (('computer', 'science'), 9),
 (('data', 'analytics'), 9),
 (('data', 'scientist'), 9),
 (('sexual', 'orientation'), 9),
 (('national', 'origin'), 8),
 (('office', '365'), 8)]

In [14]:
# Frequencies of adjective-paired bigrams, ignoring bigrams that occur fewer than 2 times in the corpus.
my_adj_postadj_tokens={}
for key, value in mycorpus.items():
    text_tokens=preprocessing_adj_postadj(value,tag_map)
    my_adj_postadj_tokens[key]=text_tokens
# Flat list kept so the name stays defined; the bigram counts below no longer use it.
all_adj_postadj_tokens= list(itertools.chain.from_iterable(my_adj_postadj_tokens.values()))
# Count bigrams ad by ad: from_documents pads between documents, so the last
# token of one ad is never paired with the first token of the next ad.
# NOTE(review): preprocessing_adj_postadj drops every token that is neither an
# adjective nor directly after one, so two neighbours in these lists are not
# always neighbours in the ad text; some counted pairs may be artefacts.
bigrams_adj_postadj = BigramCollocationFinder.from_documents(my_adj_postadj_tokens.values())
bigrams_adj_postadj.apply_freq_filter(2)
# Most frequent first; ties are broken alphabetically by the bigram itself.
sorted(bigrams_adj_postadj.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10]

[(('big', 'data'), 11),
 (('sexual', 'orientation'), 9),
 (('national', 'origin'), 8),
 (('equal', 'opportunity'), 7),
 (('marital', 'status'), 5),
 (('mental', 'disability'), 4),
 (('physical', 'mental'), 4),
 (('verbal', 'write'), 4),
 (('ethnic', 'national'), 3),
 (('military', 'veteran'), 3)]